LLVM 23.0.0git
ARMISelLowering.cpp
Go to the documentation of this file.
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
122STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
123STATISTIC(NumConstpoolPromoted,
124 "Number of constants with their storage promoted into constant pools");
125
126static cl::opt<bool>
127ARMInterworking("arm-interworking", cl::Hidden,
128 cl::desc("Enable / disable ARM interworking (for debugging only)"),
129 cl::init(true));
130
132 "arm-promote-constant", cl::Hidden,
133 cl::desc("Enable / disable promotion of unnamed_addr constants into "
134 "constant pools"),
135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137 "arm-promote-constant-max-size", cl::Hidden,
138 cl::desc("Maximum size of constant to promote into a constant pool"),
139 cl::init(64));
141 "arm-promote-constant-max-total", cl::Hidden,
142 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143 cl::init(128));
144
146MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148 cl::init(2));
149
151 "arm-max-base-updates-to-check", cl::Hidden,
152 cl::desc("Maximum number of base-updates to check generating postindex."),
153 cl::init(64));
154
155/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
156constexpr MVT FlagsVT = MVT::i32;
157
158// The APCS parameter registers.
159static const MCPhysReg GPRArgRegs[] = {
160 ARM::R0, ARM::R1, ARM::R2, ARM::R3
161};
162
164 SelectionDAG &DAG, const SDLoc &DL) {
166 assert(Arg.ArgVT.bitsLT(MVT::i32));
167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
168 SDValue Ext =
170 MVT::i32, Trunc);
171 return Ext;
172}
173
// Install the standard NEON operation actions for vector type VT.
// Loads/stores of VT are promoted to PromotedLdStVT when the two types
// differ, so only one load/store selection pattern per register size is
// needed.
// NOTE(review): several interior lines of this function are elided in this
// dump; comments below describe only the code that is visible here.
174void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
175 if (VT != PromotedLdStVT) {
// Reuse PromotedLdStVT's load selection patterns for VT.
177 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
178
// Likewise for stores.
180 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
181 }
182
183 MVT ElemTy = VT.getVectorElementType();
184 if (ElemTy != MVT::f64)
188 if (ElemTy == MVT::i32) {
193 } else {
198 }
207 if (VT.isInteger()) {
211 }
212
213 // Neon does not support vector divide/remainder operations.
222
// Integer vectors without 64-bit elements get abs, absolute-difference and
// min/max marked Legal.
223 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
224 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
226 setOperationAction(Opcode, VT, Legal);
// All integer vectors get saturating add/subtract marked Legal.
227 if (!VT.isFloatingPoint())
228 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
229 setOperationAction(Opcode, VT, Legal);
230}
231
232void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
233 addRegisterClass(VT, &ARM::DPRRegClass);
234 addTypeForNEON(VT, MVT::f64);
235}
236
237void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
238 addRegisterClass(VT, &ARM::DPairRegClass);
239 addTypeForNEON(VT, MVT::v2f64);
240}
241
// Mark every standard ISD opcode as Expand for VT, so that only the
// operations explicitly re-legalized afterwards remain available on it.
// NOTE(review): the loop body and the trailing re-legalizations are elided
// in this dump.
242void ARMTargetLowering::setAllExpand(MVT VT) {
243 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
245
246 // We support these really simple operations even on types where all
247 // the actual arithmetic has to be broken down into simpler
248 // operations or turned into library calls.
253}
254
255void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
256 LegalizeAction Action) {
257 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
258 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
260}
261
262void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
263 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
264
265 for (auto VT : IntTypes) {
266 addRegisterClass(VT, &ARM::MQPRRegClass);
297
298 // No native support for these.
308
309 // Vector reductions
319
320 if (!HasMVEFP) {
325 } else {
328 }
329
330 // Pre and Post inc are supported on loads and stores
331 for (unsigned im = (unsigned)ISD::PRE_INC;
332 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
337 }
338 }
339
340 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
341 for (auto VT : FloatTypes) {
342 addRegisterClass(VT, &ARM::MQPRRegClass);
343 if (!HasMVEFP)
344 setAllExpand(VT);
345
346 // These are legal or custom whether we have MVE.fp or not
359
360 // Pre and Post inc are supported on loads and stores
361 for (unsigned im = (unsigned)ISD::PRE_INC;
362 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
367 }
368
369 if (HasMVEFP) {
377 }
382
383 // No native support for these.
398 }
399 }
400
401 // Custom Expand smaller than legal vector reductions to prevent false zero
402 // items being added.
411
412 // We 'support' these types up to bitcast/load/store level, regardless of
413 // MVE integer-only / float support. Only doing FP data processing on the FP
414 // vector types is inhibited at integer-only level.
415 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
416 for (auto VT : LongTypes) {
417 addRegisterClass(VT, &ARM::MQPRRegClass);
418 setAllExpand(VT);
424 }
426
427 // We can do bitwise operations on v2i64 vectors
428 setOperationAction(ISD::AND, MVT::v2i64, Legal);
429 setOperationAction(ISD::OR, MVT::v2i64, Legal);
430 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
431
432 // It is legal to extload from v4i8 to v4i16 or v4i32.
433 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
436
437 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
443
444 // Some truncating stores are legal too.
445 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
446 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
447 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
448
449 // Pre and Post inc on these are legal, given the correct extends
450 for (unsigned im = (unsigned)ISD::PRE_INC;
451 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
452 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
457 }
458 }
459
460 // Predicate types
461 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
462 for (auto VT : pTypes) {
463 addRegisterClass(VT, &ARM::VCCRRegClass);
478
479 if (!HasMVEFP) {
484 }
485 }
489 setOperationAction(ISD::OR, MVT::v2i1, Expand);
495
504}
505
507 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
508}
509
511 const ARMSubtarget &STI)
512 : TargetLowering(TM_, STI), Subtarget(&STI),
513 RegInfo(Subtarget->getRegisterInfo()),
514 Itins(Subtarget->getInstrItineraryData()) {
515 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
516
519
520 const Triple &TT = TM.getTargetTriple();
521
522 if (Subtarget->isThumb1Only())
523 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
524 else
525 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
526
527 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
528 Subtarget->hasFPRegs()) {
529 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
530 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
531
536
537 if (!Subtarget->hasVFP2Base()) {
538 setAllExpand(MVT::f32);
539 } else {
542 setOperationAction(Op, MVT::f32, Legal);
543 }
544 if (!Subtarget->hasFP64()) {
545 setAllExpand(MVT::f64);
546 } else {
549 setOperationAction(Op, MVT::f64, Legal);
550
552 }
553 }
554
555 if (Subtarget->hasFullFP16()) {
558 setOperationAction(Op, MVT::f16, Legal);
559
560 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
563
568 }
569
570 if (Subtarget->hasBF16()) {
571 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
572 setAllExpand(MVT::bf16);
573 if (!Subtarget->hasFullFP16())
575 } else {
580 }
581
583 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
584 setTruncStoreAction(VT, InnerVT, Expand);
585 addAllExtLoads(VT, InnerVT, Expand);
586 }
587
590
592 }
593
594 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
596
597 if (!Subtarget->hasV8_1MMainlineOps())
599
600 if (!Subtarget->isThumb1Only())
602
605
608
609 if (Subtarget->hasMVEIntegerOps())
610 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
611
612 // Combine low-overhead loop intrinsics so that we can lower i1 types.
613 if (Subtarget->hasLOB()) {
615 }
616
617 if (Subtarget->hasNEON()) {
618 addDRTypeForNEON(MVT::v2f32);
619 addDRTypeForNEON(MVT::v8i8);
620 addDRTypeForNEON(MVT::v4i16);
621 addDRTypeForNEON(MVT::v2i32);
622 addDRTypeForNEON(MVT::v1i64);
623
624 addQRTypeForNEON(MVT::v4f32);
625 addQRTypeForNEON(MVT::v2f64);
626 addQRTypeForNEON(MVT::v16i8);
627 addQRTypeForNEON(MVT::v8i16);
628 addQRTypeForNEON(MVT::v4i32);
629 addQRTypeForNEON(MVT::v2i64);
630
631 if (Subtarget->hasFullFP16()) {
632 addQRTypeForNEON(MVT::v8f16);
633 addDRTypeForNEON(MVT::v4f16);
634 }
635
636 if (Subtarget->hasBF16()) {
637 addQRTypeForNEON(MVT::v8bf16);
638 addDRTypeForNEON(MVT::v4bf16);
639 }
640 }
641
642 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
643 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
644 // none of Neon, MVE or VFP supports any arithmetic operations on it.
645 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
646 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
647 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
648 // FIXME: Code duplication: FDIV and FREM are expanded always, see
649 // ARMTargetLowering::addTypeForNEON method for details.
650 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
651 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
652 // FIXME: Create unittest.
653 // In another words, find a way when "copysign" appears in DAG with vector
654 // operands.
656 // FIXME: Code duplication: SETCC has custom operation action, see
657 // ARMTargetLowering::addTypeForNEON method for details.
659 // FIXME: Create unittest for FNEG and for FABS.
660 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
661 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
663 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
664 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
665 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
666 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
667 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
670 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
679 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
680 }
681
682 if (Subtarget->hasNEON()) {
683 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
684 // supported for v4f32.
686 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
687 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
688 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
689 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
690 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
693 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
702
703 // Mark v2f32 intrinsics.
705 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
706 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
707 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
708 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
709 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
712 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
721
724 setOperationAction(Op, MVT::v4f16, Expand);
725 setOperationAction(Op, MVT::v8f16, Expand);
726 }
727
728 // Neon does not support some operations on v1i64 and v2i64 types.
729 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
730 // Custom handling for some quad-vector types to detect VMULL.
731 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
732 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
733 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
734 // Custom handling for some vector types to avoid expensive expansions
735 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
737 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
739 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
740 // a destination type that is wider than the source, and nor does
741 // it have a FP_TO_[SU]INT instruction with a narrower destination than
742 // source.
751
754
755 // NEON does not have single instruction CTPOP for vectors with element
756 // types wider than 8-bits. However, custom lowering can leverage the
757 // v8i8/v16i8 vcnt instruction.
764
765 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
766 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
767
768 // NEON does not have single instruction CTTZ for vectors.
770 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
771 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
772 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
773
774 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
775 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
776 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
777 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
778
783
788
792 }
793
794 // NEON only has FMA instructions as of VFP4.
795 if (!Subtarget->hasVFP4Base()) {
796 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
797 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
798 }
799
802
803 // It is legal to extload from v4i8 to v4i16 or v4i32.
804 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
805 MVT::v2i32}) {
810 }
811 }
812
813 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
814 MVT::v4i32}) {
819 }
820 }
821
822 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
829 }
830 if (Subtarget->hasMVEIntegerOps()) {
833 ISD::SETCC});
834 }
835 if (Subtarget->hasMVEFloatOps()) {
837 }
838
839 if (!Subtarget->hasFP64()) {
840 // When targeting a floating-point unit with only single-precision
841 // operations, f64 is legal for the few double-precision instructions which
842 // are present However, no double-precision operations other than moves,
843 // loads and stores are provided by the hardware.
880 }
881
884
885 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
888 if (Subtarget->hasFullFP16()) {
891 }
892 } else {
894 }
895
896 if (!Subtarget->hasFP16()) {
899 } else {
902 }
903
904 computeRegisterProperties(Subtarget->getRegisterInfo());
905
906 // ARM does not have floating-point extending loads.
907 for (MVT VT : MVT::fp_valuetypes()) {
908 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
909 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
910 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
911 }
912
913 // ... or truncating stores
914 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
915 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
916 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
917 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
918 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
919
920 // ARM does not have i1 sign extending load.
921 for (MVT VT : MVT::integer_valuetypes())
923
924 // ARM supports all 4 flavors of integer indexed load / store.
925 if (!Subtarget->isThumb1Only()) {
926 for (unsigned im = (unsigned)ISD::PRE_INC;
928 setIndexedLoadAction(im, MVT::i1, Legal);
929 setIndexedLoadAction(im, MVT::i8, Legal);
930 setIndexedLoadAction(im, MVT::i16, Legal);
931 setIndexedLoadAction(im, MVT::i32, Legal);
932 setIndexedStoreAction(im, MVT::i1, Legal);
933 setIndexedStoreAction(im, MVT::i8, Legal);
934 setIndexedStoreAction(im, MVT::i16, Legal);
935 setIndexedStoreAction(im, MVT::i32, Legal);
936 }
937 } else {
938 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
941 }
942
943 // Custom loads/stores to possible use __aeabi_uread/write*
944 if (TT.isTargetAEABI() && !Subtarget->allowsUnalignedMem()) {
949 }
950
955
956 if (!Subtarget->isThumb1Only()) {
959 }
960
963 if (Subtarget->hasDSP()) {
972 }
973 if (Subtarget->hasBaseDSP()) {
976 }
977
978 // i64 operation support.
981 if (Subtarget->isThumb1Only()) {
984 }
985 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
986 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
988
998
999 // MVE lowers 64 bit shifts to lsll and lsrl
1000 // assuming that ISD::SRL and SRA of i64 are already marked custom
1001 if (Subtarget->hasMVEIntegerOps())
1003
1004 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1005 if (Subtarget->isThumb1Only()) {
1009 }
1010
1011 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1013
1014 // ARM does not have ROTL.
1019 }
1021 // TODO: These two should be set to LibCall, but this currently breaks
1022 // the Linux kernel build. See #101786.
1025 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1028 }
1029
1030 // @llvm.readcyclecounter requires the Performance Monitors extension.
1031 // Default to the 0 expansion on unsupported platforms.
1032 // FIXME: Technically there are older ARM CPUs that have
1033 // implementation-specific ways of obtaining this information.
1034 if (Subtarget->hasPerfMon())
1036
1037 // Only ARMv6 has BSWAP.
1038 if (!Subtarget->hasV6Ops())
1040
1041 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1042 : Subtarget->hasDivideInARMMode();
1043 if (!hasDivide) {
1044 // These are expanded into libcalls if the cpu doesn't have HW divider.
1047 }
1048
1049 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1052
1055 }
1056
1059
1060 // Register based DivRem for AEABI (RTABI 4.2)
1061 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1062 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1065 HasStandaloneRem = false;
1066
1071 } else {
1074 }
1075
1080
1081 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1083
1084 // Use the default implementation.
1086 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1088 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1091
1092 if (TT.isOSWindows())
1094 else
1096
1097 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1098 // the default expansion.
1099 InsertFencesForAtomic = false;
1100 if (Subtarget->hasAnyDataBarrier() &&
1101 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1102 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1103 // to ldrex/strex loops already.
1105 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1107
1108 // On v8, we have particularly efficient implementations of atomic fences
1109 // if they can be combined with nearby atomic loads and stores.
1110 if (!Subtarget->hasAcquireRelease() ||
1111 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1112 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1113 InsertFencesForAtomic = true;
1114 }
1115 } else {
1116 // If there's anything we can use as a barrier, go through custom lowering
1117 // for ATOMIC_FENCE.
1118 // If target has DMB in thumb, Fences can be inserted.
1119 if (Subtarget->hasDataBarrier())
1120 InsertFencesForAtomic = true;
1121
1123 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1124
1125 // Set them all for libcall, which will force libcalls.
1138 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1139 // Unordered/Monotonic case.
1140 if (!InsertFencesForAtomic) {
1143 }
1144 }
1145
1146 // Compute supported atomic widths.
1147 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1148 // For targets where __sync_* routines are reliably available, we use them
1149 // if necessary.
1150 //
1151 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1152 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1153 //
1154 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1155 // such targets should provide __sync_* routines, which use the ARM mode
1156 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1157 // encoding; see ARMISD::MEMBARRIER_MCR.)
1159 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1160 Subtarget->hasForced32BitAtomics()) {
1161 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1163 } else {
1164 // We can't assume anything about other targets; just use libatomic
1165 // routines.
1167 }
1168
1170
1172
1173 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1174 if (!Subtarget->hasV6Ops()) {
1177 }
1179
1180 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1181 !Subtarget->isThumb1Only()) {
1182 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1183 // iff target supports vfp2.
1193 }
1194
1195 // We want to custom lower some of our intrinsics.
1200
1210 if (Subtarget->hasFullFP16()) {
1214 }
1215
1217
1220 if (Subtarget->hasFullFP16())
1224 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1225
1226 // We don't support sin/cos/fmod/copysign/pow
1235 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1236 !Subtarget->isThumb1Only()) {
1239 }
1242
1243 if (!Subtarget->hasVFP4Base()) {
1246 }
1247
1248 // Various VFP goodness
1249 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1250 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1251 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1256 }
1257
1258 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1259 if (!Subtarget->hasFP16()) {
1264 }
1265
1266 // Strict floating-point comparisons need custom lowering.
1273 }
1274
1277
1278 // FP-ARMv8 implements a lot of rounding-like FP operations.
1279 if (Subtarget->hasFPARMv8Base()) {
1280 for (auto Op :
1287 setOperationAction(Op, MVT::f32, Legal);
1288
1289 if (Subtarget->hasFP64())
1290 setOperationAction(Op, MVT::f64, Legal);
1291 }
1292
1293 if (Subtarget->hasNEON()) {
1298 }
1299 }
1300
1301 // FP16 often need to be promoted to call lib functions
1302 // clang-format off
1303 if (Subtarget->hasFullFP16()) {
1307
1308 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1322 setOperationAction(Op, MVT::f16, Promote);
1323 }
1324
1325 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
1326 // because the result type is integer.
1328 setOperationAction(Op, MVT::f16, Custom);
1329
1335 setOperationAction(Op, MVT::f16, Legal);
1336 }
1337 // clang-format on
1338 }
1339
1340 if (Subtarget->hasNEON()) {
1341 // vmin and vmax aren't available in a scalar form, so we can use
1342 // a NEON instruction with an undef lane instead.
1351
1352 if (Subtarget->hasV8Ops()) {
1357 setOperationAction(Op, MVT::v2f32, Legal);
1358 setOperationAction(Op, MVT::v4f32, Legal);
1359 }
1360 }
1361
1362 if (Subtarget->hasFullFP16()) {
1367
1372
1377 setOperationAction(Op, MVT::v4f16, Legal);
1378 setOperationAction(Op, MVT::v8f16, Legal);
1379 }
1380 }
1381 }
1382
1383 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1384 // it, but it's just a wrapper around ldexp.
1385 if (TT.isOSWindows()) {
1387 if (isOperationExpand(Op, MVT::f32))
1388 setOperationAction(Op, MVT::f32, Promote);
1389 }
1390
1391 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1392 // isn't legal.
1394 if (isOperationExpand(Op, MVT::f16))
1395 setOperationAction(Op, MVT::f16, Promote);
1396
1397 // We have target-specific dag combine patterns for the following nodes:
1398 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1401
1402 if (Subtarget->hasMVEIntegerOps())
1404
1405 if (Subtarget->hasV6Ops())
1407 if (Subtarget->isThumb1Only())
1409 // Attempt to lower smin/smax to ssat/usat
1410 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1411 Subtarget->isThumb2()) {
1413 }
1414
1416
1417 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1418 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1420 else
1422
1423 //// temporary - rewrite interface to use type
1426 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1428 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1430
1431 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1432 // are at least 4 bytes aligned.
1434
1435 // Prefer likely predicted branches to selects on out-of-order cores.
1436 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1437
1438 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1440 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1441
1442 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1443
1444 IsStrictFPEnabled = true;
1445}
1446
1448 return Subtarget->useSoftFloat();
1449}
1450
1452 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1453}
1454
1455// FIXME: It might make sense to define the representative register class as the
1456// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1457// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
1458// SPR's representative would be DPR_VFP2. This should work well if register
1459// pressure tracking were modified such that a register use would increment the
1460// pressure of the register class's representative and all of it's super
1461// classes' representatives transitively. We have not implemented this because
1462// of the difficulty prior to coalescing of modeling operand register classes
1463// due to the common occurrence of cross class copies and subregister insertions
1464// and extractions.
// Return the representative register class for VT plus a register-pressure
// cost: how many representative (D) registers one value of VT occupies.
// NOTE(review): the line carrying the function name and the body of the
// default case are elided in this dump.
1465std::pair<const TargetRegisterClass *, uint8_t>
1467 MVT VT) const {
1468 const TargetRegisterClass *RRC = nullptr;
1469 uint8_t Cost = 1;
1470 switch (VT.SimpleTy) {
1471 default:
1473 // Use DPR as representative register class for all floating point
1474 // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1475 // the cost is 1 for both f32 and f64.
1476 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1477 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1478 RRC = &ARM::DPRRegClass;
1479 // When NEON is used for SP, only half of the register file is available
1480 // because operations that define both SP and DP results will be constrained
1481 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1482 // coalescing by double-counting the SP regs. See the FIXME above.
1483 if (Subtarget->useNEONForSinglePrecisionFP())
1484 Cost = 2;
1485 break;
// 128-bit vector types occupy a pair of D registers.
1486 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1487 case MVT::v4f32: case MVT::v2f64:
1488 RRC = &ARM::DPRRegClass;
1489 Cost = 2;
1490 break;
// The oversized v4i64/v8i64 types (used for register sequences; see
// getRegClassFor) cost 4 and 8 D registers respectively.
1491 case MVT::v4i64:
1492 RRC = &ARM::DPRRegClass;
1493 Cost = 4;
1494 break;
1495 case MVT::v8i64:
1496 RRC = &ARM::DPRRegClass;
1497 Cost = 8;
1498 break;
1499 }
1500 return std::make_pair(RRC, Cost);
1501}
1502
1504 EVT VT) const {
1505 if (!VT.isVector())
1506 return getPointerTy(DL);
1507
1508 // MVE has a predicate register.
1509 if ((Subtarget->hasMVEIntegerOps() &&
1510 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1511 VT == MVT::v16i8)) ||
1512 (Subtarget->hasMVEFloatOps() &&
1513 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1514 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1516}
1517
1518/// getRegClassFor - Return the register class that should be used for the
1519/// specified value type.
1520const TargetRegisterClass *
1521ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
// Divergence does not affect register-class choice on ARM; the parameter
// exists for targets with divergent control flow.
1522 (void)isDivergent;
1523 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1524 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1525 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1526 // MVE Q registers.
1527 if (Subtarget->hasNEON()) {
1528 if (VT == MVT::v4i64)
1529 return &ARM::QQPRRegClass;
1530 if (VT == MVT::v8i64)
1531 return &ARM::QQQQPRRegClass;
1532 }
1533 if (Subtarget->hasMVEIntegerOps()) {
1534 if (VT == MVT::v4i64)
1535 return &ARM::MQQPRRegClass;
1536 if (VT == MVT::v8i64)
1537 return &ARM::MQQQQPRRegClass;
1538 }
// NOTE(review): the fallback return for all other types (presumably the
// TargetLowering base implementation) is elided in this dump.
1540}
1541
1542// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1543// source/dest is aligned and the copy size is large enough. We therefore want
1544// to align such objects passed to memory intrinsics.
// NOTE(review): the start of the signature is elided in this dump; CI and
// MinSize come from the elided parameter list — presumably the CallInst and
// an out-parameter for the minimum profitable size. TODO confirm.
1546 Align &PrefAlign) const {
// Only memory intrinsics (memcpy and friends) benefit from realignment.
1547 if (!isa<MemIntrinsic>(CI))
1548 return false;
1549 MinSize = 8;
1550 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1551 // cycle faster than 4-byte aligned LDM.
1552 PrefAlign =
1553 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1554 return true;
1555}
1556
1557// Create a fast isel object.
// NOTE(review): the line declaring the return type and first parameter is
// elided in this dump; this function simply forwards to the ARM FastISel
// factory.
1559 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
1560 const LibcallLoweringInfo *libcallLowering) const {
1561 return ARM::createFastISel(funcInfo, libInfo, libcallLowering);
1562}
1563
// NOTE(review): the signature line is elided in this dump; N is the SDNode
// whose scheduling preference is being queried.
1565 unsigned NumVals = N->getNumValues();
1566 if (!NumVals)
1567 return Sched::RegPressure;
1568
// Prefer ILP scheduling whenever the node produces a floating-point or
// vector result.
1569 for (unsigned i = 0; i != NumVals; ++i) {
1570 EVT VT = N->getValueType(i);
1571 if (VT == MVT::Glue || VT == MVT::Other)
1572 continue;
1573 if (VT.isFloatingPoint() || VT.isVector())
1574 return Sched::ILP;
1575 }
1576
1577 if (!N->isMachineOpcode())
1578 return Sched::RegPressure;
1579
1580 // Loads are scheduled for latency even if the instruction itinerary
1581 // is not available.
1582 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1583 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1584
1585 if (MCID.getNumDefs() == 0)
1586 return Sched::RegPressure;
// High-latency results (first def ready after more than 2 cycles) also
// favour ILP scheduling.
1587 if (!Itins->isEmpty() &&
1588 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1589 return Sched::ILP;
1590
1591 return Sched::RegPressure;
1592}
1593
1594//===----------------------------------------------------------------------===//
1595// Lowering Code
1596//===----------------------------------------------------------------------===//
1597
1598static bool isSRL16(const SDValue &Op) {
1599 if (Op.getOpcode() != ISD::SRL)
1600 return false;
1601 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1602 return Const->getZExtValue() == 16;
1603 return false;
1604}
1605
1606static bool isSRA16(const SDValue &Op) {
1607 if (Op.getOpcode() != ISD::SRA)
1608 return false;
1609 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1610 return Const->getZExtValue() == 16;
1611 return false;
1612}
1613
1614static bool isSHL16(const SDValue &Op) {
1615 if (Op.getOpcode() != ISD::SHL)
1616 return false;
1617 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1618 return Const->getZExtValue() == 16;
1619 return false;
1620}
1621
1622// Check for a signed 16-bit value. We special case SRA because it makes it
1623// more simple when also looking for SRAs that aren't sign extending a
1624// smaller value. Without the check, we'd need to take extra care with
1625// checking order for some operations.
1626static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1627 if (isSRA16(Op))
1628 return isSHL16(Op.getOperand(0));
1629 return DAG.ComputeNumSignBits(Op) == 17;
1630}
1631
1632/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1634 switch (CC) {
1635 default: llvm_unreachable("Unknown condition code!");
1636 case ISD::SETNE: return ARMCC::NE;
1637 case ISD::SETEQ: return ARMCC::EQ;
1638 case ISD::SETGT: return ARMCC::GT;
1639 case ISD::SETGE: return ARMCC::GE;
1640 case ISD::SETLT: return ARMCC::LT;
1641 case ISD::SETLE: return ARMCC::LE;
1642 case ISD::SETUGT: return ARMCC::HI;
1643 case ISD::SETUGE: return ARMCC::HS;
1644 case ISD::SETULT: return ARMCC::LO;
1645 case ISD::SETULE: return ARMCC::LS;
1646 }
1647}
1648
1649/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1651 ARMCC::CondCodes &CondCode2) {
1652 CondCode2 = ARMCC::AL;
1653 switch (CC) {
1654 default: llvm_unreachable("Unknown FP condition!");
1655 case ISD::SETEQ:
1656 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1657 case ISD::SETGT:
1658 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1659 case ISD::SETGE:
1660 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1661 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1662 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1663 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1664 case ISD::SETO: CondCode = ARMCC::VC; break;
1665 case ISD::SETUO: CondCode = ARMCC::VS; break;
1666 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1667 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1668 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1669 case ISD::SETLT:
1670 case ISD::SETULT: CondCode = ARMCC::LT; break;
1671 case ISD::SETLE:
1672 case ISD::SETULE: CondCode = ARMCC::LE; break;
1673 case ISD::SETNE:
1674 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1675 }
1676}
1677
1678//===----------------------------------------------------------------------===//
1679// Calling Convention Implementation
1680//===----------------------------------------------------------------------===//
1681
1682/// getEffectiveCallingConv - Get the effective calling convention, taking into
1683/// account presence of floating point hardware and calling convention
1684/// limitations, such as support for variadic functions.
1686ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1687 bool isVarArg) const {
1688 switch (CC) {
1689 default:
1690 report_fatal_error("Unsupported calling convention");
1693 case CallingConv::GHC:
1695 return CC;
1701 case CallingConv::Swift:
1704 case CallingConv::C:
1705 case CallingConv::Tail:
1706 if (!getTM().isAAPCS_ABI())
1707 return CallingConv::ARM_APCS;
1708 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1709 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1710 !isVarArg)
1712 else
1714 case CallingConv::Fast:
1716 if (!getTM().isAAPCS_ABI()) {
1717 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1718 return CallingConv::Fast;
1719 return CallingConv::ARM_APCS;
1720 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1721 !isVarArg)
1723 else
1725 }
1726}
1727
1729 bool isVarArg) const {
1730 return CCAssignFnForNode(CC, false, isVarArg);
1731}
1732
1734 bool isVarArg) const {
1735 return CCAssignFnForNode(CC, true, isVarArg);
1736}
1737
1738/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1739/// CallingConvention.
1740CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1741 bool Return,
1742 bool isVarArg) const {
1743 switch (getEffectiveCallingConv(CC, isVarArg)) {
1744 default:
1745 report_fatal_error("Unsupported calling convention");
1747 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1749 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1751 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1752 case CallingConv::Fast:
1753 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1754 case CallingConv::GHC:
1755 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1757 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1759 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1761 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
1762 }
1763}
1764
1765SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
1766 MVT LocVT, MVT ValVT, SDValue Val) const {
1767 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
1768 Val);
1769 if (Subtarget->hasFullFP16()) {
1770 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
1771 } else {
1772 Val = DAG.getNode(ISD::TRUNCATE, dl,
1773 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1774 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
1775 }
1776 return Val;
1777}
1778
1779SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
1780 MVT LocVT, MVT ValVT,
1781 SDValue Val) const {
1782 if (Subtarget->hasFullFP16()) {
1783 Val = DAG.getNode(ARMISD::VMOVrh, dl,
1784 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1785 } else {
1786 Val = DAG.getNode(ISD::BITCAST, dl,
1787 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1788 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
1789 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1790 }
1791 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
1792}
1793
1794/// LowerCallResult - Lower the result values of a call into the
1795/// appropriate copies out of appropriate physical registers.
1796SDValue ARMTargetLowering::LowerCallResult(
1797 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1798 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1799 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1800 SDValue ThisVal, bool isCmseNSCall) const {
1801 // Assign locations to each value returned by this call.
1803 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1804 *DAG.getContext());
1805 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1806
1807 // Copy all of the result registers out of their specified physreg.
1808 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1809 CCValAssign VA = RVLocs[i];
1810
1811 // Pass 'this' value directly from the argument to return value, to avoid
1812 // reg unit interference
1813 if (i == 0 && isThisReturn) {
1814 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1815 "unexpected return calling convention register assignment");
1816 InVals.push_back(ThisVal);
1817 continue;
1818 }
1819
1820 SDValue Val;
1821 if (VA.needsCustom() &&
1822 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
1823 // Handle f64 or half of a v2f64.
1824 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1825 InGlue);
1826 Chain = Lo.getValue(1);
1827 InGlue = Lo.getValue(2);
1828 VA = RVLocs[++i]; // skip ahead to next loc
1829 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1830 InGlue);
1831 Chain = Hi.getValue(1);
1832 InGlue = Hi.getValue(2);
1833 if (!Subtarget->isLittle())
1834 std::swap (Lo, Hi);
1835 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1836
1837 if (VA.getLocVT() == MVT::v2f64) {
1838 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1839 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1840 DAG.getConstant(0, dl, MVT::i32));
1841
1842 VA = RVLocs[++i]; // skip ahead to next loc
1843 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1844 Chain = Lo.getValue(1);
1845 InGlue = Lo.getValue(2);
1846 VA = RVLocs[++i]; // skip ahead to next loc
1847 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1848 Chain = Hi.getValue(1);
1849 InGlue = Hi.getValue(2);
1850 if (!Subtarget->isLittle())
1851 std::swap (Lo, Hi);
1852 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1853 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1854 DAG.getConstant(1, dl, MVT::i32));
1855 }
1856 } else {
1857 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1858 InGlue);
1859 Chain = Val.getValue(1);
1860 InGlue = Val.getValue(2);
1861 }
1862
1863 switch (VA.getLocInfo()) {
1864 default: llvm_unreachable("Unknown loc info!");
1865 case CCValAssign::Full: break;
1866 case CCValAssign::BCvt:
1867 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1868 break;
1869 }
1870
1871 // f16 arguments have their size extended to 4 bytes and passed as if they
1872 // had been copied to the LSBs of a 32-bit register.
1873 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
1874 if (VA.needsCustom() &&
1875 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
1876 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
1877
1878 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
1879 // is less than 32 bits must be sign- or zero-extended after the call for
1880 // security reasons. Although the ABI mandates an extension done by the
1881 // callee, the latter cannot be trusted to follow the rules of the ABI.
1882 const ISD::InputArg &Arg = Ins[VA.getValNo()];
1883 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
1884 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
1885 Val = handleCMSEValue(Val, Arg, DAG, dl);
1886
1887 InVals.push_back(Val);
1888 }
1889
1890 return Chain;
1891}
1892
1893std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
1894 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
1895 bool IsTailCall, int SPDiff) const {
1896 SDValue DstAddr;
1897 MachinePointerInfo DstInfo;
1898 int32_t Offset = VA.getLocMemOffset();
1899 MachineFunction &MF = DAG.getMachineFunction();
1900
1901 if (IsTailCall) {
1902 Offset += SPDiff;
1903 auto PtrVT = getPointerTy(DAG.getDataLayout());
1904 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
1905 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
1906 DstAddr = DAG.getFrameIndex(FI, PtrVT);
1907 DstInfo =
1909 } else {
1910 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
1911 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1912 StackPtr, PtrOff);
1913 DstInfo =
1915 }
1916
1917 return std::make_pair(DstAddr, DstInfo);
1918}
1919
1920// Returns the type of copying which is required to set up a byval argument to
1921// a tail-called function. This isn't needed for non-tail calls, because they
1922// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
1923// avoid clobbering another argument (CopyViaTemp), and sometimes can be
1924// optimised to zero copies when forwarding an argument from the caller's
1925// caller (NoCopy).
1926ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
1927 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
1928 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1929 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
1930
1931 // Globals are always safe to copy from.
1933 return CopyOnce;
1934
1935 // Can only analyse frame index nodes, conservatively assume we need a
1936 // temporary.
1937 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
1938 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
1939 if (!SrcFrameIdxNode || !DstFrameIdxNode)
1940 return CopyViaTemp;
1941
1942 int SrcFI = SrcFrameIdxNode->getIndex();
1943 int DstFI = DstFrameIdxNode->getIndex();
1944 assert(MFI.isFixedObjectIndex(DstFI) &&
1945 "byval passed in non-fixed stack slot");
1946
1947 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
1948 int64_t DstOffset = MFI.getObjectOffset(DstFI);
1949
1950 // If the source is in the local frame, then the copy to the argument memory
1951 // is always valid.
1952 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
1953 if (!FixedSrc ||
1954 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
1955 return CopyOnce;
1956
1957 // In the case of byval arguments split between registers and the stack,
1958 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
1959 // stack portion, but the Src SDValue will refer to the full value, including
1960 // the local stack memory that the register portion gets stored into. We only
1961 // need to compare them for equality, so normalise on the full value version.
1962 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
1963 DstOffset -= RegSize;
1964
1965 // If the value is already in the correct location, then no copying is
1966 // needed. If not, then we need to copy via a temporary.
1967 if (SrcOffset == DstOffset)
1968 return NoCopy;
1969 else
1970 return CopyViaTemp;
1971}
1972
1973void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1974 SDValue Chain, SDValue &Arg,
1975 RegsToPassVector &RegsToPass,
1976 CCValAssign &VA, CCValAssign &NextVA,
1977 SDValue &StackPtr,
1978 SmallVectorImpl<SDValue> &MemOpChains,
1979 bool IsTailCall,
1980 int SPDiff) const {
1981 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1982 DAG.getVTList(MVT::i32, MVT::i32), Arg);
1983 unsigned id = Subtarget->isLittle() ? 0 : 1;
1984 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1985
1986 if (NextVA.isRegLoc())
1987 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1988 else {
1989 assert(NextVA.isMemLoc());
1990 if (!StackPtr.getNode())
1991 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1993
1994 SDValue DstAddr;
1995 MachinePointerInfo DstInfo;
1996 std::tie(DstAddr, DstInfo) =
1997 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
1998 MemOpChains.push_back(
1999 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2000 }
2001}
2002
2003static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2004 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2006}
2007
2008/// LowerCall - Lowering a call into a callseq_start <-
2009/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2010/// nodes.
2011SDValue
2012ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2013 SmallVectorImpl<SDValue> &InVals) const {
2014 SelectionDAG &DAG = CLI.DAG;
2015 SDLoc &dl = CLI.DL;
2016 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2017 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2018 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2019 SDValue Chain = CLI.Chain;
2020 SDValue Callee = CLI.Callee;
2021 bool &isTailCall = CLI.IsTailCall;
2022 CallingConv::ID CallConv = CLI.CallConv;
2023 bool doesNotRet = CLI.DoesNotReturn;
2024 bool isVarArg = CLI.IsVarArg;
2025 const CallBase *CB = CLI.CB;
2026
2027 MachineFunction &MF = DAG.getMachineFunction();
2028 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2029 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2030 MachineFunction::CallSiteInfo CSInfo;
2031 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2032 bool isThisReturn = false;
2033 bool isCmseNSCall = false;
2034 bool isSibCall = false;
2035 bool PreferIndirect = false;
2036 bool GuardWithBTI = false;
2037
2038 // Analyze operands of the call, assigning locations to each operand.
2040 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2041 *DAG.getContext());
2042 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2043
2044 // Lower 'returns_twice' calls to a pseudo-instruction.
2045 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2046 !Subtarget->noBTIAtReturnTwice())
2047 GuardWithBTI = AFI->branchTargetEnforcement();
2048
2049 // Set type id for call site info.
2050 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
2051
2052 // Determine whether this is a non-secure function call.
2053 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2054 isCmseNSCall = true;
2055
2056 // Disable tail calls if they're not supported.
2057 if (!Subtarget->supportsTailCall())
2058 isTailCall = false;
2059
2060 // For both the non-secure calls and the returns from a CMSE entry function,
2061 // the function needs to do some extra work after the call, or before the
2062 // return, respectively, thus it cannot end with a tail call
2063 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2064 isTailCall = false;
2065
2066 if (isa<GlobalAddressSDNode>(Callee)) {
2067 // If we're optimizing for minimum size and the function is called three or
2068 // more times in this block, we can improve codesize by calling indirectly
2069 // as BLXr has a 16-bit encoding.
2070 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2071 if (CLI.CB) {
2072 auto *BB = CLI.CB->getParent();
2073 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2074 count_if(GV->users(), [&BB](const User *U) {
2075 return isa<Instruction>(U) &&
2076 cast<Instruction>(U)->getParent() == BB;
2077 }) > 2;
2078 }
2079 }
2080 if (isTailCall) {
2081 // Check if it's really possible to do a tail call.
2082 isTailCall =
2083 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2084
2085 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2086 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2087 isSibCall = true;
2088
2089 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2090 // detected sibcalls.
2091 if (isTailCall)
2092 ++NumTailCalls;
2093 }
2094
2095 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2096 report_fatal_error("failed to perform tail call elimination on a call "
2097 "site marked musttail");
2098
2099 // Get a count of how many bytes are to be pushed on the stack.
2100 unsigned NumBytes = CCInfo.getStackSize();
2101
2102 // SPDiff is the byte offset of the call's argument area from the callee's.
2103 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2104 // by this amount for a tail call. In a sibling call it must be 0 because the
2105 // caller will deallocate the entire stack and the callee still expects its
2106 // arguments to begin at SP+0. Completely unused for non-tail calls.
2107 int SPDiff = 0;
2108
2109 if (isTailCall && !isSibCall) {
2110 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2111 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2112
2113 // Since callee will pop argument stack as a tail call, we must keep the
2114 // popped size 16-byte aligned.
2115 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2116 assert(StackAlign && "data layout string is missing stack alignment");
2117 NumBytes = alignTo(NumBytes, *StackAlign);
2118
2119 // SPDiff will be negative if this tail call requires more space than we
2120 // would automatically have in our incoming argument space. Positive if we
2121 // can actually shrink the stack.
2122 SPDiff = NumReusableBytes - NumBytes;
2123
2124 // If this call requires more stack than we have available from
2125 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2126 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2127 AFI->setArgRegsSaveSize(-SPDiff);
2128 }
2129
2130 if (isSibCall) {
2131 // For sibling tail calls, memory operands are available in our caller's stack.
2132 NumBytes = 0;
2133 } else {
2134 // Adjust the stack pointer for the new arguments...
2135 // These operations are automatically eliminated by the prolog/epilog pass
2136 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2137 }
2138
2140 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2141
2142 RegsToPassVector RegsToPass;
2143 SmallVector<SDValue, 8> MemOpChains;
2144
2145 // If we are doing a tail-call, any byval arguments will be written to stack
2146 // space which was used for incoming arguments. If any the values being used
2147 // are incoming byval arguments to this function, then they might be
2148 // overwritten by the stores of the outgoing arguments. To avoid this, we
2149 // need to make a temporary copy of them in local stack space, then copy back
2150 // to the argument area.
2151 DenseMap<unsigned, SDValue> ByValTemporaries;
2152 SDValue ByValTempChain;
2153 if (isTailCall) {
2154 SmallVector<SDValue, 8> ByValCopyChains;
2155 for (const CCValAssign &VA : ArgLocs) {
2156 unsigned ArgIdx = VA.getValNo();
2157 SDValue Src = OutVals[ArgIdx];
2158 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2159
2160 if (!Flags.isByVal())
2161 continue;
2162
2163 SDValue Dst;
2164 MachinePointerInfo DstInfo;
2165 std::tie(Dst, DstInfo) =
2166 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2167 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2168
2169 if (Copy == NoCopy) {
2170 // If the argument is already at the correct offset on the stack
2171 // (because we are forwarding a byval argument from our caller), we
2172 // don't need any copying.
2173 continue;
2174 } else if (Copy == CopyOnce) {
2175 // If the argument is in our local stack frame, no other argument
2176 // preparation can clobber it, so we can copy it to the final location
2177 // later.
2178 ByValTemporaries[ArgIdx] = Src;
2179 } else {
2180 assert(Copy == CopyViaTemp && "unexpected enum value");
2181 // If we might be copying this argument from the outgoing argument
2182 // stack area, we need to copy via a temporary in the local stack
2183 // frame.
2184 int TempFrameIdx = MFI.CreateStackObject(
2185 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2186 SDValue Temp =
2187 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2188
2189 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2190 SDValue AlignNode =
2191 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2192
2193 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2194 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2195 ByValCopyChains.push_back(
2196 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2197 ByValTemporaries[ArgIdx] = Temp;
2198 }
2199 }
2200 if (!ByValCopyChains.empty())
2201 ByValTempChain =
2202 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2203 }
2204
2205 // During a tail call, stores to the argument area must happen after all of
2206 // the function's incoming arguments have been loaded because they may alias.
2207 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2208 // there's no point in doing so repeatedly so this tracks whether that's
2209 // happened yet.
2210 bool AfterFormalArgLoads = false;
2211
2212 // Walk the register/memloc assignments, inserting copies/loads. In the case
2213 // of tail call optimization, arguments are handled later.
2214 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2215 i != e;
2216 ++i, ++realArgIdx) {
2217 CCValAssign &VA = ArgLocs[i];
2218 SDValue Arg = OutVals[realArgIdx];
2219 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2220 bool isByVal = Flags.isByVal();
2221
2222 // Promote the value if needed.
2223 switch (VA.getLocInfo()) {
2224 default: llvm_unreachable("Unknown loc info!");
2225 case CCValAssign::Full: break;
2226 case CCValAssign::SExt:
2227 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2228 break;
2229 case CCValAssign::ZExt:
2230 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2231 break;
2232 case CCValAssign::AExt:
2233 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2234 break;
2235 case CCValAssign::BCvt:
2236 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2237 break;
2238 }
2239
2240 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2241 Chain = DAG.getStackArgumentTokenFactor(Chain);
2242 if (ByValTempChain) {
2243 // In case of large byval copies, re-using the stackframe for tail-calls
2244 // can lead to overwriting incoming arguments on the stack. Force
2245 // loading these stack arguments before the copy to avoid that.
2246 SmallVector<SDValue, 8> IncomingLoad;
2247 for (unsigned I = 0; I < OutVals.size(); ++I) {
2248 if (Outs[I].Flags.isByVal())
2249 continue;
2250
2251 SDValue OutVal = OutVals[I];
2252 LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
2253 if (!OutLN)
2254 continue;
2255
2256 FrameIndexSDNode *FIN =
2258 if (!FIN)
2259 continue;
2260
2261 if (!MFI.isFixedObjectIndex(FIN->getIndex()))
2262 continue;
2263
2264 for (const CCValAssign &VA : ArgLocs) {
2265 if (VA.isMemLoc())
2266 IncomingLoad.push_back(OutVal.getValue(1));
2267 }
2268 }
2269
2270 // Update the chain to force loads for potentially clobbered argument
2271 // loads to happen before the byval copy.
2272 if (!IncomingLoad.empty()) {
2273 IncomingLoad.push_back(Chain);
2274 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
2275 }
2276
2277 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2278 ByValTempChain);
2279 }
2280 AfterFormalArgLoads = true;
2281 }
2282
2283 // f16 arguments have their size extended to 4 bytes and passed as if they
2284 // had been copied to the LSBs of a 32-bit register.
2285 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2286 if (VA.needsCustom() &&
2287 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2288 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2289 } else {
2290 // f16 arguments could have been extended prior to argument lowering.
2291 // Mask them arguments if this is a CMSE nonsecure call.
2292 auto ArgVT = Outs[realArgIdx].ArgVT;
2293 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2294 auto LocBits = VA.getLocVT().getSizeInBits();
2295 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2296 SDValue Mask =
2297 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2298 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2299 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2300 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2301 }
2302 }
2303
2304 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2305 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2306 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2307 DAG.getConstant(0, dl, MVT::i32));
2308 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2309 DAG.getConstant(1, dl, MVT::i32));
2310
2311 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2312 StackPtr, MemOpChains, isTailCall, SPDiff);
2313
2314 VA = ArgLocs[++i]; // skip ahead to next loc
2315 if (VA.isRegLoc()) {
2316 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2317 StackPtr, MemOpChains, isTailCall, SPDiff);
2318 } else {
2319 assert(VA.isMemLoc());
2320 SDValue DstAddr;
2321 MachinePointerInfo DstInfo;
2322 std::tie(DstAddr, DstInfo) =
2323 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2324 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2325 }
2326 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2327 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2328 StackPtr, MemOpChains, isTailCall, SPDiff);
2329 } else if (VA.isRegLoc()) {
2330 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2331 Outs[0].VT == MVT::i32) {
2332 assert(VA.getLocVT() == MVT::i32 &&
2333 "unexpected calling convention register assignment");
2334 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2335 "unexpected use of 'returned'");
2336 isThisReturn = true;
2337 }
2338 const TargetOptions &Options = DAG.getTarget().Options;
2339 if (Options.EmitCallSiteInfo)
2340 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2341 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2342 } else if (isByVal) {
2343 assert(VA.isMemLoc());
2344 unsigned offset = 0;
2345
2346 // True if this byval aggregate will be split between registers
2347 // and memory.
2348 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2349 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2350
2351 SDValue ByValSrc;
2352 bool NeedsStackCopy;
2353 if (auto It = ByValTemporaries.find(realArgIdx);
2354 It != ByValTemporaries.end()) {
2355 ByValSrc = It->second;
2356 NeedsStackCopy = true;
2357 } else {
2358 ByValSrc = Arg;
2359 NeedsStackCopy = !isTailCall;
2360 }
2361
2362 // If part of the argument is in registers, load them.
2363 if (CurByValIdx < ByValArgsCount) {
2364 unsigned RegBegin, RegEnd;
2365 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2366
2367 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2368 unsigned int i, j;
2369 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2370 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2371 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2372 SDValue Load =
2373 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2374 DAG.InferPtrAlign(AddArg));
2375 MemOpChains.push_back(Load.getValue(1));
2376 RegsToPass.push_back(std::make_pair(j, Load));
2377 }
2378
2379 // If parameter size outsides register area, "offset" value
2380 // helps us to calculate stack slot for remained part properly.
2381 offset = RegEnd - RegBegin;
2382
2383 CCInfo.nextInRegsParam();
2384 }
2385
2386 // If the memory part of the argument isn't already in the correct place
2387 // (which can happen with tail calls), copy it into the argument area.
2388 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2389 auto PtrVT = getPointerTy(DAG.getDataLayout());
2390 SDValue Dst;
2391 MachinePointerInfo DstInfo;
2392 std::tie(Dst, DstInfo) =
2393 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2394 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2395 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2396 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2397 MVT::i32);
2398 SDValue AlignNode =
2399 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2400
2401 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2402 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2403 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2404 Ops));
2405 }
2406 } else {
2407 assert(VA.isMemLoc());
2408 SDValue DstAddr;
2409 MachinePointerInfo DstInfo;
2410 std::tie(DstAddr, DstInfo) =
2411 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2412
2413 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2414 MemOpChains.push_back(Store);
2415 }
2416 }
2417
2418 if (!MemOpChains.empty())
2419 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2420
2421 // Build a sequence of copy-to-reg nodes chained together with token chain
2422 // and flag operands which copy the outgoing args into the appropriate regs.
2423 SDValue InGlue;
2424 for (const auto &[Reg, N] : RegsToPass) {
2425 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2426 InGlue = Chain.getValue(1);
2427 }
2428
2429 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2430 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2431 // node so that legalize doesn't hack it.
2432 bool isDirect = false;
2433
2434 const TargetMachine &TM = getTargetMachine();
2435 const Triple &TT = TM.getTargetTriple();
2436 const GlobalValue *GVal = nullptr;
2437 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2438 GVal = G->getGlobal();
2439 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && TT.isOSBinFormatMachO();
2440
2441 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2442 bool isLocalARMFunc = false;
2443 auto PtrVt = getPointerTy(DAG.getDataLayout());
2444
2445 if (Subtarget->genLongCalls()) {
2446 assert((!isPositionIndependent() || TT.isOSWindows()) &&
2447 "long-calls codegen is not position independent!");
2448 // Handle a global address or an external symbol. If it's not one of
2449 // those, the target's already in a register, so we don't need to do
2450 // anything extra.
2451 if (isa<GlobalAddressSDNode>(Callee)) {
2452 if (Subtarget->genExecuteOnly()) {
2453 if (Subtarget->useMovt())
2454 ++NumMovwMovt;
2455 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2456 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2457 } else {
2458 // Create a constant pool entry for the callee address
2459 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2460 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2461 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2462
2463 // Get the address of the callee into a register
2464 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2465 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2466 Callee = DAG.getLoad(
2467 PtrVt, dl, DAG.getEntryNode(), Addr,
2469 }
2470 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2471 const char *Sym = S->getSymbol();
2472
2473 if (Subtarget->genExecuteOnly()) {
2474 if (Subtarget->useMovt())
2475 ++NumMovwMovt;
2476 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2477 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2478 } else {
2479 // Create a constant pool entry for the callee address
2480 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2481 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2482 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2483
2484 // Get the address of the callee into a register
2485 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2486 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2487 Callee = DAG.getLoad(
2488 PtrVt, dl, DAG.getEntryNode(), Addr,
2490 }
2491 }
2492 } else if (isa<GlobalAddressSDNode>(Callee)) {
2493 if (!PreferIndirect) {
2494 isDirect = true;
2495 bool isDef = GVal->isStrongDefinitionForLinker();
2496
2497 // ARM call to a local ARM function is predicable.
2498 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2499 // tBX takes a register source operand.
2500 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2501 assert(TT.isOSBinFormatMachO() && "WrapperPIC use on non-MachO?");
2502 Callee = DAG.getNode(
2503 ARMISD::WrapperPIC, dl, PtrVt,
2504 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2505 Callee = DAG.getLoad(
2506 PtrVt, dl, DAG.getEntryNode(), Callee,
2510 } else if (Subtarget->isTargetCOFF()) {
2511 assert(Subtarget->isTargetWindows() &&
2512 "Windows is the only supported COFF target");
2513 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2514 if (GVal->hasDLLImportStorageClass())
2515 TargetFlags = ARMII::MO_DLLIMPORT;
2516 else if (!TM.shouldAssumeDSOLocal(GVal))
2517 TargetFlags = ARMII::MO_COFFSTUB;
2518 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2519 TargetFlags);
2520 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2521 Callee =
2522 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2523 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2525 } else {
2526 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2527 }
2528 }
2529 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2530 isDirect = true;
2531 // tBX takes a register source operand.
2532 const char *Sym = S->getSymbol();
2533 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2534 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2535 ARMConstantPoolValue *CPV =
2537 ARMPCLabelIndex, 4);
2538 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2539 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2540 Callee = DAG.getLoad(
2541 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2543 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2544 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2545 } else {
2546 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2547 }
2548 }
2549
2550 if (isCmseNSCall) {
2551 assert(!isARMFunc && !isDirect &&
2552 "Cannot handle call to ARM function or direct call");
2553 if (NumBytes > 0) {
2554 DAG.getContext()->diagnose(
2555 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2556 "call to non-secure function would require "
2557 "passing arguments on stack",
2558 dl.getDebugLoc()));
2559 }
2560 if (isStructRet) {
2561 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2563 "call to non-secure function would return value through pointer",
2564 dl.getDebugLoc()));
2565 }
2566 }
2567
2568 // FIXME: handle tail calls differently.
2569 unsigned CallOpc;
2570 if (Subtarget->isThumb()) {
2571 if (GuardWithBTI)
2572 CallOpc = ARMISD::t2CALL_BTI;
2573 else if (isCmseNSCall)
2574 CallOpc = ARMISD::tSECALL;
2575 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2576 CallOpc = ARMISD::CALL_NOLINK;
2577 else
2578 CallOpc = ARMISD::CALL;
2579 } else {
2580 if (!isDirect && !Subtarget->hasV5TOps())
2581 CallOpc = ARMISD::CALL_NOLINK;
2582 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2583 // Emit regular call when code size is the priority
2584 !Subtarget->hasMinSize())
2585 // "mov lr, pc; b _foo" to avoid confusing the RSP
2586 CallOpc = ARMISD::CALL_NOLINK;
2587 else
2588 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2589 }
2590
2591 // We don't usually want to end the call-sequence here because we would tidy
2592 // the frame up *after* the call, however in the ABI-changing tail-call case
2593 // we've carefully laid out the parameters so that when sp is reset they'll be
2594 // in the correct location.
2595 if (isTailCall && !isSibCall) {
2596 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2597 InGlue = Chain.getValue(1);
2598 }
2599
2600 std::vector<SDValue> Ops;
2601 Ops.push_back(Chain);
2602 Ops.push_back(Callee);
2603
2604 if (isTailCall) {
2605 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2606 }
2607
2608 // Add argument registers to the end of the list so that they are known live
2609 // into the call.
2610 for (const auto &[Reg, N] : RegsToPass)
2611 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2612
2613 // Add a register mask operand representing the call-preserved registers.
2614 const uint32_t *Mask;
2615 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2616 if (isThisReturn) {
2617 // For 'this' returns, use the R0-preserving mask if applicable
2618 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2619 if (!Mask) {
2620 // Set isThisReturn to false if the calling convention is not one that
2621 // allows 'returned' to be modeled in this way, so LowerCallResult does
2622 // not try to pass 'this' straight through
2623 isThisReturn = false;
2624 Mask = ARI->getCallPreservedMask(MF, CallConv);
2625 }
2626 } else
2627 Mask = ARI->getCallPreservedMask(MF, CallConv);
2628
2629 assert(Mask && "Missing call preserved mask for calling convention");
2630 Ops.push_back(DAG.getRegisterMask(Mask));
2631
2632 if (InGlue.getNode())
2633 Ops.push_back(InGlue);
2634
2635 if (isTailCall) {
2637 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2638 if (CLI.CFIType)
2639 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2640 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2641 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2642 return Ret;
2643 }
2644
2645 // Returns a chain and a flag for retval copy to use.
2646 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2647 if (CLI.CFIType)
2648 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2649 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2650 InGlue = Chain.getValue(1);
2651 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2652
2653 // If we're guaranteeing tail-calls will be honoured, the callee must
2654 // pop its own argument stack on return. But this call is *not* a tail call so
2655 // we need to undo that after it returns to restore the status-quo.
2656 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2657 uint64_t CalleePopBytes =
2658 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2659
2660 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2661 if (!Ins.empty())
2662 InGlue = Chain.getValue(1);
2663
2664 // Handle result values, copying them out of physregs into vregs that we
2665 // return.
2666 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2667 InVals, isThisReturn,
2668 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2669}
2670
2671/// HandleByVal - Every parameter *after* a byval parameter is passed
2672/// on the stack. Remember the next parameter register to allocate,
2673/// and then confiscate the rest of the parameter registers to insure
2674/// this.
2675void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2676 Align Alignment) const {
2677 // Byval (as with any stack) slots are always at least 4 byte aligned.
2678 Alignment = std::max(Alignment, Align(4));
2679
2680 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2681 if (!Reg)
2682 return;
2683
2684 unsigned AlignInRegs = Alignment.value() / 4;
2685 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2686 for (unsigned i = 0; i < Waste; ++i)
2687 Reg = State->AllocateReg(GPRArgRegs);
2688
2689 if (!Reg)
2690 return;
2691
2692 unsigned Excess = 4 * (ARM::R4 - Reg);
2693
2694 // Special case when NSAA != SP and parameter size greater than size of
2695 // all remained GPR regs. In that case we can't split parameter, we must
2696 // send it to stack. We also must set NCRN to R4, so waste all
2697 // remained registers.
2698 const unsigned NSAAOffset = State->getStackSize();
2699 if (NSAAOffset != 0 && Size > Excess) {
2700 while (State->AllocateReg(GPRArgRegs))
2701 ;
2702 return;
2703 }
2704
2705 // First register for byval parameter is the first register that wasn't
2706 // allocated before this method call, so it would be "reg".
2707 // If parameter is small enough to be saved in range [reg, r4), then
2708 // the end (first after last) register would be reg + param-size-in-regs,
2709 // else parameter would be splitted between registers and stack,
2710 // end register would be r4 in this case.
2711 unsigned ByValRegBegin = Reg;
2712 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2713 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2714 // Note, first register is allocated in the beginning of function already,
2715 // allocate remained amount of registers we need.
2716 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2717 State->AllocateReg(GPRArgRegs);
2718 // A byval parameter that is split between registers and memory needs its
2719 // size truncated here.
2720 // In the case where the entire structure fits in registers, we set the
2721 // size in memory to zero.
2722 Size = std::max<int>(Size - Excess, 0);
2723}
2724
2725/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2726/// for tail call optimization. Targets which want to do tail call
2727/// optimization should implement this function. Note that this function also
2728/// processes musttail calls, so when this function returns false on a valid
2729/// musttail call, a fatal backend error occurs.
2730 bool ARMTargetLowering::IsEligibleForTailCallOptimization(
// NOTE(review): the first parameter line is elided in this excerpt; the uses
// of CLI.* and CCInfo below imply parameters (TargetLowering::CallLoweringInfo
// &CLI, CCState &CCInfo, ...) — confirm against the header declaration.
2732 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2733 CallingConv::ID CalleeCC = CLI.CallConv;
2734 SDValue Callee = CLI.Callee;
2735 bool isVarArg = CLI.IsVarArg;
2736 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2737 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2738 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2739 const SelectionDAG &DAG = CLI.DAG;
2740 MachineFunction &MF = DAG.getMachineFunction();
2741 const Function &CallerF = MF.getFunction();
2742 CallingConv::ID CallerCC = CallerF.getCallingConv();
2743
// Callers are expected to have checked target support already.
2744 assert(Subtarget->supportsTailCall());
2745
2746 // Indirect tail-calls require a register to hold the target address. That
2747 // register must be:
2748 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2749 // * Not callee-saved, so must be one of r0-r3 or r12.
2750 // * Not used to hold an argument to the tail-called function, which might be
2751 // in r0-r3.
2752 // * Not used to hold the return address authentication code, which is in r12
2753 // if enabled.
2754 // Sometimes, no register matches all of these conditions, so we can't do a
2755 // tail-call.
2756 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2757 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2758 ARM::R3};
// r12 is usable only when not Thumb1 (not allocatable there) and not
// reserved for the return-address authentication code.
2759 if (!(Subtarget->isThumb1Only() ||
2760 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2761 AddressRegisters.insert(ARM::R12);
// Remove every register already carrying a call argument.
2762 for (const CCValAssign &AL : ArgLocs)
2763 if (AL.isRegLoc())
2764 AddressRegisters.erase(AL.getLocReg());
2765 if (AddressRegisters.empty()) {
2766 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2767 return false;
2768 }
2769 }
2770
2771 // Look for obvious safe cases to perform tail call optimization that do not
2772 // require ABI changes. This is what gcc calls sibcall.
2773
2774 // Exception-handling functions need a special set of instructions to indicate
2775 // a return to the hardware. Tail-calling another function would probably
2776 // break this.
2777 if (CallerF.hasFnAttribute("interrupt")) {
2778 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2779 return false;
2780 }
2781
// Guaranteed-TCO conventions only require matching caller/callee CCs;
// none of the sibcall checks below apply in that mode.
2782 if (canGuaranteeTCO(CalleeCC,
2783 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2784 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2785 << " (guaranteed tail-call CC)\n");
2786 return CalleeCC == CallerCC;
2787 }
2788
2789 // Also avoid sibcall optimization if either caller or callee uses struct
2790 // return semantics.
2791 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2792 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2793 if (isCalleeStructRet != isCallerStructRet) {
2794 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
2795 return false;
2796 }
2797
2798 // Externally-defined functions with weak linkage should not be
2799 // tail-called on ARM when the OS does not support dynamic
2800 // pre-emption of symbols, as the AAELF spec requires normal calls
2801 // to undefined weak functions to be replaced with a NOP or jump to the
2802 // next instruction. The behaviour of branch instructions in this
2803 // situation (as used for tail calls) is implementation-defined, so we
2804 // cannot rely on the linker replacing the tail call with a return.
2805 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2806 const GlobalValue *GV = G->getGlobal();
2807 const Triple &TT = getTargetMachine().getTargetTriple();
2808 if (GV->hasExternalWeakLinkage() &&
2809 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
2810 TT.isOSBinFormatMachO())) {
2811 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
2812 return false;
2813 }
2814 }
2815
2816 // Check that the call results are passed in the same way.
2817 LLVMContext &C = *DAG.getContext();
// NOTE(review): the call head on the next (elided) line appears to be a
// results-compatibility check (presumably CCState::resultsCompatible) taking
// the arguments below — confirm against upstream.
2819 getEffectiveCallingConv(CalleeCC, isVarArg),
2820 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2821 CCAssignFnForReturn(CalleeCC, isVarArg),
2822 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
2823 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
2824 return false;
2825 }
2826 // The callee has to preserve all registers the caller needs to preserve.
2827 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2828 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2829 if (CalleeCC != CallerCC) {
2830 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2831 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
2832 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
2833 return false;
2834 }
2835 }
2836
2837 // If Caller's vararg argument has been split between registers and stack, do
2838 // not perform tail call, since part of the argument is in caller's local
2839 // frame.
2840 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2841 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
2842 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
2843 return false;
2844 }
2845
2846 // If the callee takes no arguments then go on to check the results of the
2847 // call.
2848 const MachineRegisterInfo &MRI = MF.getRegInfo();
2849 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
2850 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
2851 return false;
2852 }
2853
2854 // If the stack arguments for this call do not fit into our own save area then
2855 // the call cannot be made tail.
2856 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
2857 return false;
2858
2859 LLVM_DEBUG(dbgs() << "true\n");
2860 return true;
2861 }
2862
// Return true iff every outgoing return value can be assigned a return
// location by this calling convention's return-assignment function.
2863 bool
2864 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2865 MachineFunction &MF, bool isVarArg,
// NOTE(review): the Outs parameter line and the RVLocs local declaration are
// elided in this excerpt; CheckReturn below implies both.
2867 LLVMContext &Context, const Type *RetTy) const {
2869 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2870 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2871 }
2872
// Build the return node for an "interrupt"-attributed function: insert the
// LR adjustment required by the exception model as operand #1 and emit an
// ARMISD::INTRET_GLUE instead of a normal return.
// NOTE(review): the opening signature line is elided in this excerpt;
// RetOps below is presumably the SmallVectorImpl<SDValue> of return operands
// built by LowerReturn — confirm against upstream.
2874 const SDLoc &DL, SelectionDAG &DAG) {
2875 const MachineFunction &MF = DAG.getMachineFunction();
2876 const Function &F = MF.getFunction();
2877
// The attribute's string value selects the exception kind (may be empty).
2878 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2879
2880 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2881 // version of the "preferred return address". These offsets affect the return
2882 // instruction if this is a return from PL1 without hypervisor extensions.
2883 // IRQ/FIQ: +4 "subs pc, lr, #4"
2884 // SWI: 0 "subs pc, lr, #0"
2885 // ABORT: +4 "subs pc, lr, #4"
2886 // UNDEF: +4/+2 "subs pc, lr, #0"
2887 // UNDEF varies depending on where the exception came from ARM or Thumb
2888 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2889
2890 int64_t LROffset;
2891 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2892 IntKind == "ABORT")
2893 LROffset = 4;
2894 else if (IntKind == "SWI" || IntKind == "UNDEF")
2895 LROffset = 0;
2896 else
2897 report_fatal_error("Unsupported interrupt attribute. If present, value "
2898 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2899
// Slot the offset in right after the chain (operand #0).
2900 RetOps.insert(RetOps.begin() + 1,
2901 DAG.getConstant(LROffset, DL, MVT::i32, false));
2902
2903 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
2904 }
2905
// Lower an IR-level return into the ARM return sequence: assign each return
// value to its location, emit CopyToReg nodes for the result registers
// (splitting f64/v2f64 into GPR pairs where required), then emit the
// appropriate return node (normal, CMSE secure-entry, or interrupt).
2906 SDValue
2907 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2908 bool isVarArg,
// NOTE(review): the Outs parameter line (2909) and the RVLocs/RetOps local
// declarations (2913, 2923) are elided in this excerpt; their uses below
// imply them.
2910 const SmallVectorImpl<SDValue> &OutVals,
2911 const SDLoc &dl, SelectionDAG &DAG) const {
2912 // CCValAssign - represent the assignment of the return value to a location.
2914
2915 // CCState - Info about the registers and stack slots.
2916 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2917 *DAG.getContext());
2918
2919 // Analyze outgoing return values.
2920 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2921
2922 SDValue Glue;
2924 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2925 bool isLittleEndian = Subtarget->isLittle();
2926
2927 MachineFunction &MF = DAG.getMachineFunction();
2928 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2929 AFI->setReturnRegsCount(RVLocs.size());
2930
2931 // Report error if cmse entry function returns structure through first ptr arg.
2932 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
2933 // Note: using an empty SDLoc(), as the first line of the function is a
2934 // better place to report than the last line.
2935 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2937 "secure entry function would return value through pointer",
2938 SDLoc().getDebugLoc()));
2939 }
2940
2941 // Copy the result values into the output registers.
2942 for (unsigned i = 0, realRVLocIdx = 0;
2943 i != RVLocs.size();
2944 ++i, ++realRVLocIdx) {
2945 CCValAssign &VA = RVLocs[i];
2946 assert(VA.isRegLoc() && "Can only return in registers!");
2947
2948 SDValue Arg = OutVals[realRVLocIdx];
2949 bool ReturnF16 = false;
2950
2951 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
2952 // Half-precision return values can be returned like this:
2953 //
2954 // t11 f16 = fadd ...
2955 // t12: i16 = bitcast t11
2956 // t13: i32 = zero_extend t12
2957 // t14: f32 = bitcast t13 <~~~~~~~ Arg
2958 //
2959 // to avoid code generation for bitcasts, we simply set Arg to the node
2960 // that produces the f16 value, t11 in this case.
2961 //
2962 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
2963 SDValue ZE = Arg.getOperand(0);
2964 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
2965 SDValue BC = ZE.getOperand(0);
2966 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
2967 Arg = BC.getOperand(0);
2968 ReturnF16 = true;
2969 }
2970 }
2971 }
2972 }
2973
2974 switch (VA.getLocInfo()) {
2975 default: llvm_unreachable("Unknown loc info!");
2976 case CCValAssign::Full: break;
2977 case CCValAssign::BCvt:
// Skip the bitcast when we peeled the f16 pattern above: Arg is already
// the raw f16 producer.
2978 if (!ReturnF16)
2979 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2980 break;
2981 }
2982
2983 // Mask f16 arguments if this is a CMSE nonsecure entry.
2984 auto RetVT = Outs[realRVLocIdx].ArgVT;
2985 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
2986 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
2987 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2988 } else {
// Zero the bits above the f16 payload so no stale data in the wider
// location register leaks to the nonsecure caller.
2989 auto LocBits = VA.getLocVT().getSizeInBits();
2990 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
2991 SDValue Mask =
2992 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2993 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2994 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2995 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2996 }
2997 }
2998
2999 if (VA.needsCustom() &&
3000 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3001 if (VA.getLocVT() == MVT::v2f64) {
3002 // Extract the first half and return it in two registers.
3003 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3004 DAG.getConstant(0, dl, MVT::i32));
3005 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3006 DAG.getVTList(MVT::i32, MVT::i32), Half);
3007
3008 Chain =
3009 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3010 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3011 Glue = Chain.getValue(1);
3012 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3013 VA = RVLocs[++i]; // skip ahead to next loc
3014 Chain =
3015 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3016 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3017 Glue = Chain.getValue(1);
3018 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3019 VA = RVLocs[++i]; // skip ahead to next loc
3020
3021 // Extract the 2nd half and fall through to handle it as an f64 value.
3022 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3023 DAG.getConstant(1, dl, MVT::i32));
3024 }
3025 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3026 // available.
3027 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3028 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3029 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3030 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3031 Glue = Chain.getValue(1);
3032 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3033 VA = RVLocs[++i]; // skip ahead to next loc
3034 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3035 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3036 } else
3037 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3038
3039 // Guarantee that all emitted copies are
3040 // stuck together, avoiding something bad.
3041 Glue = Chain.getValue(1);
3042 RetOps.push_back(DAG.getRegister(
3043 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3044 }
// Functions returning CSRs via copy must also list those registers as
// return operands so they are live-out.
3045 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3046 const MCPhysReg *I =
3047 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3048 if (I) {
3049 for (; *I; ++I) {
3050 if (ARM::GPRRegClass.contains(*I))
3051 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3052 else if (ARM::DPRRegClass.contains(*I))
// NOTE(review): the DPR push_back line (3053) is elided in this excerpt —
// presumably RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))).
3054 else
3055 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3056 }
3057 }
3058
3059 // Update chain and glue.
3060 RetOps[0] = Chain;
3061 if (Glue.getNode())
3062 RetOps.push_back(Glue);
3063
3064 // CPUs which aren't M-class use a special sequence to return from
3065 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3066 // though we use "subs pc, lr, #N").
3067 //
3068 // M-class CPUs actually use a normal return sequence with a special
3069 // (hardware-provided) value in LR, so the normal code path works.
3070 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3071 !Subtarget->isMClass()) {
3072 if (Subtarget->isThumb1Only())
3073 report_fatal_error("interrupt attribute is not supported in Thumb1");
3074 return LowerInterruptReturn(RetOps, dl, DAG);
3075 }
3076
3077 unsigned RetNode =
3078 AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
3079 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3080 }
3081
3082bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3083 if (N->getNumValues() != 1)
3084 return false;
3085 if (!N->hasNUsesOfValue(1, 0))
3086 return false;
3087
3088 SDValue TCChain = Chain;
3089 SDNode *Copy = *N->user_begin();
3090 if (Copy->getOpcode() == ISD::CopyToReg) {
3091 // If the copy has a glue operand, we conservatively assume it isn't safe to
3092 // perform a tail call.
3093 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3094 return false;
3095 TCChain = Copy->getOperand(0);
3096 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3097 SDNode *VMov = Copy;
3098 // f64 returned in a pair of GPRs.
3099 SmallPtrSet<SDNode*, 2> Copies;
3100 for (SDNode *U : VMov->users()) {
3101 if (U->getOpcode() != ISD::CopyToReg)
3102 return false;
3103 Copies.insert(U);
3104 }
3105 if (Copies.size() > 2)
3106 return false;
3107
3108 for (SDNode *U : VMov->users()) {
3109 SDValue UseChain = U->getOperand(0);
3110 if (Copies.count(UseChain.getNode()))
3111 // Second CopyToReg
3112 Copy = U;
3113 else {
3114 // We are at the top of this chain.
3115 // If the copy has a glue operand, we conservatively assume it
3116 // isn't safe to perform a tail call.
3117 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3118 return false;
3119 // First CopyToReg
3120 TCChain = UseChain;
3121 }
3122 }
3123 } else if (Copy->getOpcode() == ISD::BITCAST) {
3124 // f32 returned in a single GPR.
3125 if (!Copy->hasOneUse())
3126 return false;
3127 Copy = *Copy->user_begin();
3128 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3129 return false;
3130 // If the copy has a glue operand, we conservatively assume it isn't safe to
3131 // perform a tail call.
3132 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3133 return false;
3134 TCChain = Copy->getOperand(0);
3135 } else {
3136 return false;
3137 }
3138
3139 bool HasRet = false;
3140 for (const SDNode *U : Copy->users()) {
3141 if (U->getOpcode() != ARMISD::RET_GLUE &&
3142 U->getOpcode() != ARMISD::INTRET_GLUE)
3143 return false;
3144 HasRet = true;
3145 }
3146
3147 if (!HasRet)
3148 return false;
3149
3150 Chain = TCChain;
3151 return true;
3152}
3153
3154bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3155 if (!Subtarget->supportsTailCall())
3156 return false;
3157
3158 if (!CI->isTailCall())
3159 return false;
3160
3161 return true;
3162}
3163
3164 // Trying to write a 64 bit value so need to split into two 32 bit values first,
3165 // and pass the lower and high parts through.
// NOTE(review): the opening signature line is elided in this excerpt;
// presumably static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
// — confirm against upstream.
3167 SDLoc DL(Op);
// Operand layout of WRITE_REGISTER here: 0 = chain, 1 = register id,
// 2 = value to write.
3168 SDValue WriteValue = Op->getOperand(2);
3169
3170 // This function is only supposed to be called for i64 type argument.
3171 assert(WriteValue.getValueType() == MVT::i64
3172 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3173
// Split the i64 into (Lo, Hi) i32 halves and re-emit the node with both.
3174 SDValue Lo, Hi;
3175 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3176 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3177 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3178 }
3179
3180 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3181 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3182 // one of the above mentioned nodes. It has to be wrapped because otherwise
3183 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3184 // be used to form addressing mode. These wrapped nodes will be selected
3185 // into MOVi.
3186 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3187 SelectionDAG &DAG) const {
3188 EVT PtrVT = Op.getValueType();
3189 // FIXME there is no actual debug info here
3190 SDLoc dl(Op);
3191 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3192 SDValue Res;
3193
3194 // When generating execute-only code Constant Pools must be promoted to the
3195 // global data section. It's a bit ugly that we can't share them across basic
3196 // blocks, but this way we guarantee that execute-only behaves correct with
3197 // position-independent addressing modes.
3198 if (Subtarget->genExecuteOnly()) {
3199 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3200 auto *T = CP->getType();
3201 auto C = const_cast<Constant*>(CP->getConstVal());
3202 auto M = DAG.getMachineFunction().getFunction().getParent();
// Promote the pool entry to a uniquely-named internal constant global.
3203 auto GV = new GlobalVariable(
3204 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3205 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3206 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3207 Twine(AFI->createPICLabelUId())
3208 );
// NOTE(review): a line (3209) is elided in this excerpt — presumably
// SDValue GA = DAG.getTargetGlobalAddress(GV, ...) feeding the arguments
// below; confirm against upstream.
3210 dl, PtrVT);
3211 return LowerGlobalAddress(GA, DAG);
3212 }
3213
3214 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3215 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3216 Align CPAlign = CP->getAlign();
3217 if (Subtarget->isThumb1Only())
3218 CPAlign = std::max(CPAlign, Align(4));
// NOTE(review): the condition line (3219) is elided in this excerpt —
// presumably if (CP->isMachineConstantPoolEntry()), selecting between the
// machine-CP-value and plain-constant forms below.
3220 Res =
3221 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3222 else
3223 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3224 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3225 }
3226
// NOTE(review): the function signature (3227) and the return statements
// (3232-3233) are elided in this excerpt. By content this chooses the
// jump-table encoding — out-of-line block addresses for execute-only targets
// without a 32-bit pc-relative branch, inline otherwise; presumably
// ARMTargetLowering::getJumpTableEncoding — confirm against upstream.
3228 // If we don't have a 32-bit pc-relative branch instruction then the jump
3229 // table consists of block addresses. Usually this is inline, but for
3230 // execute-only it must be placed out-of-line.
3231 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3234 }
3235
// Lower a BlockAddress: materialise it through a constant-pool load, adding
// a PIC label and a PIC_ADD fixup when position independence (or ROPI) is in
// effect.
3236 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3237 SelectionDAG &DAG) const {
// NOTE(review): lines 3238-3239 are elided in this excerpt — presumably the
// MachineFunction/ARMFunctionInfo locals (MF, AFI) used below.
3240 unsigned ARMPCLabelIndex = 0;
3241 SDLoc DL(Op);
3242 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3243 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3244 SDValue CPAddr;
3245 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3246 if (!IsPositionIndependent) {
// Absolute addressing: a plain constant-pool entry holding the address.
3247 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3248 } else {
// PIC: pc-relative entry; PCAdj accounts for the pipeline offset of the
// PC read (4 in Thumb, 8 in ARM state).
3249 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3250 ARMPCLabelIndex = AFI->createPICLabelUId();
// NOTE(review): a line (3251) is elided in this excerpt — presumably
// ARMConstantPoolValue *CPV = feeding the Create(...) call below.
3252 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3253 ARMCP::CPBlockAddress, PCAdj);
3254 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3255 }
3256 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
// Load the address out of the constant pool.
3257 SDValue Result = DAG.getLoad(
3258 PtrVT, DL, DAG.getEntryNode(), CPAddr,
// NOTE(review): the MachinePointerInfo argument line (3259) is elided here.
3260 if (!IsPositionIndependent)
3261 return Result;
// PIC: add the pc-label value to turn the pc-relative entry into an address.
3262 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3263 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3264 }
3265
3266/// Convert a TLS address reference into the correct sequence of loads
3267/// and calls to compute the variable's address for Darwin, and return an
3268/// SDValue containing the final node.
3269
3270/// Darwin only has one TLS scheme which must be capable of dealing with the
3271/// fully general situation, in the worst case. This means:
3272/// + "extern __thread" declaration.
3273/// + Defined in a possibly unknown dynamic library.
3274///
3275/// The general system is that each __thread variable has a [3 x i32] descriptor
3276/// which contains information used by the runtime to calculate the address. The
3277/// only part of this the compiler needs to know about is the first word, which
3278/// contains a function pointer that must be called with the address of the
3279/// entire descriptor in "r0".
3280///
3281/// Since this descriptor may be in a different unit, in general access must
3282/// proceed along the usual ARM rules. A common sequence to produce is:
3283///
3284/// movw rT1, :lower16:_var$non_lazy_ptr
3285/// movt rT1, :upper16:_var$non_lazy_ptr
3286/// ldr r0, [rT1]
3287/// ldr rT2, [r0]
3288/// blx rT2
3289/// [...address now in r0...]
SDValue
ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(getTargetMachine().getTargetTriple().isOSDarwin() &&
         "This function expects a Darwin target");
  SDLoc DL(Op);

  // First step is to get the address of the actual global symbol. This is
  // where the TLS descriptor lives.
  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      MVT::i32, DL, Chain, DescAddr,
  Chain = FuncTLVGet.getValue(1);

  // The helper call below adjusts the stack; record that in the frame info.
  MachineFunction &F = DAG.getMachineFunction();
  MachineFrameInfo &MFI = F.getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
  // silly).
  auto TRI =
  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());

  // Finally, we can make the call. This is just a degenerate version of a
  // normal ARM call node: r0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
  Chain =
      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  // The variable's address comes back in r0.
  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
}
3333
/// Lower a TLS global address for Windows on ARM. The address is computed as
///   TEB->ThreadLocalStoragePointer[_tls_index] + SECREL(global)
/// following the standard Windows TLS access sequence.
SDValue
ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(getTargetMachine().getTargetTriple().isOSWindows() &&
         "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  // Load the current TEB (thread environment block)
  // via MRC p15 with CRn=c13, opc2=2 (the CP15 thread ID register).
  SDValue Ops[] = {Chain,
                   DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                   DAG.getTargetConstant(15, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(13, DL, MVT::i32),
                   DAG.getTargetConstant(0, DL, MVT::i32),
                   DAG.getTargetConstant(2, DL, MVT::i32)};
  SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(MVT::i32, MVT::Other), Ops);

  SDValue TEB = CurrentTEB.getValue(0);
  Chain = CurrentTEB.getValue(1);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x2c from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
  // offset into the TLSArray.

  // Load the TLS index from the C runtime
  SDValue TLSIndex =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
  TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
  TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());

  // Slot = TLSArray[_tls_index] — scale the index by the pointer size (4).
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(2, DL, MVT::i32));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());

  // Get the offset of the start of the .tls section (section base)
  const auto *GA = cast<GlobalAddressSDNode>(Op);
  auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
  SDValue Offset = DAG.getLoad(
      PtrVT, DL, Chain,
      DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                  DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),

  return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
}
3390
// Lower ISD::GlobalTLSAddress using the "general dynamic" model:
// materialize a TLSGD constant-pool entry, fix it up with PIC_ADD, and pass
// its address to the __tls_get_addr runtime helper.
SDValue
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(GA);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Implicit PC offset of the referencing instruction (Thumb: 4, ARM: 8).
  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
  // Pool entry holding the TLSGD descriptor reference for this global.
  ARMConstantPoolValue *CPV =
    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
  Argument = DAG.getLoad(
      PtrVT, dl, DAG.getEntryNode(), Argument,
  SDValue Chain = Argument.getValue(1);

  // Turn the PC-relative delta into an absolute address.
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));

  // FIXME: is there useful debug info available here?
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  // The runtime call returns the variable's address.
  return CallResult.first;
}
3427
// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model. Both compute the final address as
// thread-pointer + offset; they differ in how the offset is obtained.
SDValue
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG,
                                        TLSModel::Model model) const {
  const GlobalValue *GV = GA->getGlobal();
  SDLoc dl(GA);
  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);

  if (model == TLSModel::InitialExec) {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    // Initial exec model.
    // The pool entry is the PC-relative address of the GOT slot holding the
    // thread-pointer-relative offset of the variable.
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
                                      true);
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
    Chain = Offset.getValue(1);

    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);

    // Second load: read the actual offset out of the GOT slot.
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
  } else {
    // local exec model
    // The offset is a link-time constant; load it directly from the pool.
    assert(model == TLSModel::LocalExec);
    ARMConstantPoolValue *CPV =
    Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
    Offset = DAG.getLoad(
        PtrVT, dl, Chain, Offset,
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
3481
/// Dispatch TLS global-address lowering to the OS-appropriate strategy:
/// emulated TLS, Darwin, Windows, or (for ELF) the model chosen by the
/// target machine.
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  // Emulated TLS lowers the access to runtime library calls.
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  const Triple &TT = getTargetMachine().getTargetTriple();
  if (TT.isOSDarwin())
    return LowerGlobalTLSAddressDarwin(Op, DAG);

  if (TT.isOSWindows())
    return LowerGlobalTLSAddressWindows(Op, DAG);

  // TODO: implement the "local dynamic" model
  assert(TT.isOSBinFormatELF() && "Only ELF implemented here");

  switch (model) {
    return LowerToTLSGeneralDynamicModel(GA, DAG);
    return LowerToTLSExecModels(GA, DAG, model);
  }
  llvm_unreachable("bogus TLS model");
}
3509
3510/// Return true if all users of V are within function F, looking through
3511/// ConstantExprs.
3512static bool allUsersAreInFunction(const Value *V, const Function *F) {
3513 SmallVector<const User*,4> Worklist(V->users());
3514 while (!Worklist.empty()) {
3515 auto *U = Worklist.pop_back_val();
3516 if (isa<ConstantExpr>(U)) {
3517 append_range(Worklist, U->users());
3518 continue;
3519 }
3520
3521 auto *I = dyn_cast<Instruction>(U);
3522 if (!I || I->getParent()->getParent() != F)
3523 return false;
3524 }
3525 return true;
3526}
3527
    const GlobalValue *GV, SelectionDAG &DAG,
    EVT PtrVT, const SDLoc &dl) {
  // If we're creating a pool entry for a constant global with unnamed address,
  // and the global is small enough, we can emit it inline into the constant pool
  // to save ourselves an indirection.
  //
  // This is a win if the constant is only used in one function (so it doesn't
  // need to be duplicated) or duplicating the constant wouldn't increase code
  // size (implying the constant is no larger than 4 bytes).
  const Function &F = DAG.getMachineFunction().getFunction();

  // We rely on this decision to inline being idempotent and unrelated to the
  // use-site. We know that if we inline a variable at one use site, we'll
  // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
  // doesn't know about this optimization, so bail out if it's enabled else
  // we could decide to inline here (and thus never emit the GV) but require
  // the GV from fast-isel generated code.
    return SDValue();

  // Only a local, constant GlobalVariable with an initializer and unnamed
  // address is a promotion candidate.
  auto *GVar = dyn_cast<GlobalVariable>(GV);
  if (!GVar || !GVar->hasInitializer() ||
      !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
      !GVar->hasLocalLinkage())
    return SDValue();

  // If we inline a value that contains relocations, we move the relocations
  // from .data to .text. This is not allowed in position-independent code.
  auto *Init = GVar->getInitializer();
  if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
      Init->needsDynamicRelocation())
    return SDValue();

  // The constant islands pass can only really deal with alignment requests
  // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
  // any type wanting greater alignment requirements than 4 bytes. We also
  // can only promote constants that are multiples of 4 bytes in size or
  // are paddable to a multiple of 4. Currently we only try and pad constants
  // that are strings for simplicity.
  auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
  unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
  Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
  // RequiredPadding == 4 means Size is already a multiple of 4 (no padding).
  unsigned RequiredPadding = 4 - (Size % 4);
  bool PaddingPossible =
      RequiredPadding == 4 || (CDAInit && CDAInit->isString());
  if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
      Size == 0)
    return SDValue();

  unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // We can't bloat the constant pool too much, else the ConstantIslands pass
  // may fail to converge. If we haven't promoted this global yet (it may have
  // multiple uses), and promoting it would increase the constant pool size (Sz
  // > 4), ensure we have space to do so up to MaxTotal.
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
    if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
      return SDValue();

  // This is only valid if all users are in a single function; we can't clone
  // the constant in general. The LLVM IR unnamed_addr allows merging
  // constants, but not cloning them.
  //
  // We could potentially allow cloning if we could prove all uses of the
  // constant in the current function don't care about the address, like
  // printf format strings. But that isn't implemented for now.
  if (!allUsersAreInFunction(GVar, &F))
    return SDValue();

  // We're going to inline this global. Pad it out if needed.
  if (RequiredPadding != 4) {
    StringRef S = CDAInit->getAsString();

    std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
    while (RequiredPadding--)
      V.push_back(0);
  }

  // Emit the promoted global and bookkeep the constant-pool size increase the
  // first time this global is promoted.
  auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
  SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
  if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
                                      PaddedSize - 4);
  }
  ++NumConstpoolPromoted;
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
}
3623
  // Look through an alias to the object it ultimately refers to; an alias with
  // no aliasee object is not known read-only.
  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
    if (!(GV = GA->getAliaseeObject()))
      return false;
  // A global variable is read-only iff marked constant; functions are always
  // read-only (they live in the text section).
  if (const auto *V = dyn_cast<GlobalVariable>(GV))
    return V->isConstant();
  return isa<Function>(GV);
}
3632
3633SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3634 SelectionDAG &DAG) const {
3635 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3636 default: llvm_unreachable("unknown object format");
3637 case Triple::COFF:
3638 return LowerGlobalAddressWindows(Op, DAG);
3639 case Triple::ELF:
3640 return LowerGlobalAddressELF(Op, DAG);
3641 case Triple::MachO:
3642 return LowerGlobalAddressDarwin(Op, DAG);
3643 }
3644}
3645
/// Lower a global address for ELF targets, choosing among constant-pool
/// promotion, GOT-indirect PIC, ROPI/RWPI relative addressing, movw/movt
/// immediates, and a plain constant-pool load.
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  bool IsRO = isReadOnly(GV);

  // promoteToConstantPool only if not generating XO text section
  if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
    if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
      return V;

  if (isPositionIndependent()) {
    // DSO-local globals are addressed PC-relative; others go through the GOT.
        GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    if (!GV->isDSOLocal())
      Result =
          DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
    return Result;
  } else if (Subtarget->isROPI() && IsRO) {
    // PC-relative.
    SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
    return Result;
  } else if (Subtarget->isRWPI() && !IsRO) {
    // SB-relative.
    SDValue RelAddr;
    if (Subtarget->useMovt()) {
      ++NumMovwMovt;
      SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
      RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
    } else { // use literal pool for address constant
      ARMConstantPoolValue *CPV =
      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
      RelAddr = DAG.getLoad(
          PtrVT, dl, DAG.getEntryNode(), CPAddr,
    }
    // R9 holds the static base (SB) in RWPI code.
    SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
    return Result;
  }

  // If we have T2 ops, we can materialize the address directly via movt/movw
  // pair. This is always cheaper. If need to generate Execute Only code, and we
  // only have Thumb1 available, we can't use a constant pool and are forced to
  // use immediate relocations.
  if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
    if (Subtarget->useMovt())
      ++NumMovwMovt;
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes.
    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
  } else {
    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    return DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,
  }
}
3712
/// Lower a global address for Darwin (Mach-O) targets using movw/movt (or a
/// wrapper node), with an extra load through the non-lazy pointer for
/// indirect symbols.
SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Darwin");
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  if (Subtarget->useMovt())
    ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into multiple nodes
  unsigned Wrapper =
      isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;

  SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
  SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);

  // Indirect symbols require loading the address out of the non-lazy pointer.
  if (Subtarget->isGVIndirectSymbol(GV))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
  return Result;
}
3737
/// Lower a global address for Windows (COFF) targets using movw/movt, with an
/// extra indirection load for dllimported globals and non-DSO-local globals
/// accessed through a COFF stub.
SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
                                                     SelectionDAG &DAG) const {
  assert(getTargetMachine().getTargetTriple().isOSWindows() &&
         "non-Windows COFF is not supported");
  assert(Subtarget->useMovt() &&
         "Windows on ARM expects to use movw/movt");
  assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
         "ROPI/RWPI not currently supported for Windows");

  const TargetMachine &TM = getTargetMachine();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  // Choose the symbol flavor: __imp_ pointer for dllimport, .refptr stub for
  // non-local globals, plain symbol otherwise.
  ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
  if (GV->hasDLLImportStorageClass())
    TargetFlags = ARMII::MO_DLLIMPORT;
  else if (!TM.shouldAssumeDSOLocal(GV))
    TargetFlags = ARMII::MO_COFFSTUB;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  ++NumMovwMovt;

  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes.
  Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
                       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
                                                  TargetFlags));
  // Stub-based accesses hold a pointer to the real global; load through it.
  if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
  return Result;
}
3770
3771SDValue
3772ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3773 SDLoc dl(Op);
3774 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3775 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3776 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3777 Op.getOperand(1), Val);
3778}
3779
3780SDValue
3781ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3782 SDLoc dl(Op);
3783 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3784 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3785}
3786
3787SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3788 SelectionDAG &DAG) const {
3789 SDLoc dl(Op);
3790 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3791 Op.getOperand(0));
3792}
3793
SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
    SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
  // The intrinsic ID is operand 1 when operand 0 is a chain, operand 0
  // otherwise (the comparison yields 1 or 0 as the operand index).
  unsigned IntNo =
      Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
  switch (IntNo) {
  default:
    return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::arm_gnu_eabi_mcount: {
    MachineFunction &MF = DAG.getMachineFunction();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDLoc dl(Op);
    SDValue Chain = Op.getOperand(0);
    // call "\01__gnu_mcount_nc"
    const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
    const uint32_t *Mask =
    assert(Mask && "Missing call preserved mask for calling convention");
    // Mark LR an implicit live-in.
    Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
    SDValue ReturnAddress =
        DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
    constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
    SDValue Callee =
        DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
    // Thumb needs the predicate operands (AL, no reg) on the call.
    if (Subtarget->isThumb())
      return SDValue(
          DAG.getMachineNode(
              ARM::tBL_PUSHLR, dl, ResultTys,
              {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
               DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
          0);
    return SDValue(
        DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
                           {ReturnAddress, Callee, RegisterMask, Chain}),
        0);
  }
  }
}
3833
SDValue
ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *Subtarget) const {
  unsigned IntNo = Op.getConstantOperandVal(0);
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::arm_cls: {
    // Note: arm_cls and arm_cls64 intrinsics are expanded directly here
    // in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS
    // instruction.
    const SDValue &Operand = Op.getOperand(1);
    const EVT VTy = Op.getValueType();
    return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
  }
  case Intrinsic::arm_cls64: {
    // arm_cls64 returns i32 but takes i64 input.
    // Use ISD::CTLS for i64 and truncate the result.
    SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
  }
  case Intrinsic::arm_neon_vcls:
  case Intrinsic::arm_mve_vcls: {
    // Lower vector CLS intrinsics to ISD::CTLS.
    // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
    const EVT VTy = Op.getValueType();
    return DAG.getNode(ISD::CTLS, dl, VTy, Op.getOperand(1));
  }
  case Intrinsic::eh_sjlj_lsda: {
    // Materialize the address of this function's LSDA through a constant-pool
    // entry, with a PIC_ADD fixup when position independent.
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue CPAddr;
    bool IsPositionIndependent = isPositionIndependent();
    unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
                                      ARMCP::CPLSDA, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
    SDValue Result = DAG.getLoad(
        PtrVT, dl, DAG.getEntryNode(), CPAddr,

    if (IsPositionIndependent) {
      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
    }
    return Result;
  }
  case Intrinsic::arm_neon_vabs:
    return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_neon_vabds:
    // vabds is only lowered here for the integer form.
    if (Op.getValueType().isInteger())
      return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    return SDValue();
  case Intrinsic::arm_neon_vabdu:
    return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::arm_neon_vmulls:
  case Intrinsic::arm_neon_vmullu: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
                                        ? ARMISD::VMULLs : ARMISD::VMULLu;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminnm:
  case Intrinsic::arm_neon_vmaxnm: {
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
                                        ? ISD::FMINNUM : ISD::FMAXNUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vminu:
  case Intrinsic::arm_neon_vmaxu: {
    if (Op.getValueType().isFloatingPoint())
      return SDValue();
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
                                        ? ISD::UMIN : ISD::UMAX;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vmins:
  case Intrinsic::arm_neon_vmaxs: {
    // v{min,max}s is overloaded between signed integers and floats.
    if (!Op.getValueType().isFloatingPoint()) {
      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
                                        ? ISD::SMIN : ISD::SMAX;
      return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    }
    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
                                      ? ISD::FMINIMUM : ISD::FMAXIMUM;
    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  }
  case Intrinsic::arm_neon_vtbl1:
    return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::arm_neon_vtbl2:
    return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_pred_i2v:
  case Intrinsic::arm_mve_pred_v2i:
    return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_mve_vreinterpretq:
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::arm_mve_lsll:
    return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_asrl:
    return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_vsli:
    return DAG.getNode(ARMISD::VSLIIMM, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::arm_mve_vsri:
    return DAG.getNode(ARMISD::VSRIIMM, SDLoc(Op), Op->getVTList(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  }
}
3964
                               const ARMSubtarget *Subtarget) {
  SDLoc dl(Op);
  // A single-thread fence needs no hardware barrier; return the node as-is
  // (the chain is preserved).
  auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
  if (SSID == SyncScope::SingleThread)
    return Op;

  if (!Subtarget->hasDataBarrier()) {
    // Some ARMv6 cpus can support data barriers with an mcr instruction.
    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
    // here.
    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                       DAG.getConstant(0, dl, MVT::i32));
  }

  // Pick the DMB shareability domain based on subtarget and ordering.
  AtomicOrdering Ord =
      static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
  if (Subtarget->isMClass()) {
    // Only a full system barrier exists in the M-class architectures.
  } else if (Subtarget->preferISHSTBarriers() &&
             Ord == AtomicOrdering::Release) {
    // Swift happens to implement ISHST barriers in a way that's compatible with
    // Release semantics but weaker than ISH so we'd be fools not to use
    // it. Beware: other processors probably don't!
  }

  // Emit the dmb intrinsic with the selected domain.
  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
                     DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                     DAG.getConstant(Domain, dl, MVT::i32));
}
4000
                            const ARMSubtarget *Subtarget) {
  // ARM pre v5TE and Thumb1 does not have preload instructions.
  if (!(Subtarget->isThumb2() ||
        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
    // Just preserve the chain.
    return Op.getOperand(0);

  SDLoc dl(Op);
  // Operand 2 encodes read(0)/write(1); invert so isRead is 1 for reads.
  unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
  if (!isRead &&
      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
    // ARMv7 with MP extension has PLDW.
    return Op.getOperand(0);

  unsigned isData = Op.getConstantOperandVal(4);
  if (Subtarget->isThumb()) {
    // Invert the bits.
    isRead = ~isRead & 1;
    isData = ~isData & 1;
  }

  // Emit the target preload node: chain, address, isRead, isData.
  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                     Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
                     DAG.getConstant(isData, dl, MVT::i32));
}
4027
  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();

  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  SDLoc dl(Op);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  // Operand 2 carries the IR value of the va_list pointer for alias info.
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
4041
/// Reassemble an f64 formal argument that the calling convention split across
/// two i32 locations: the first half is always in a register (VA); the second
/// half (NextVA) is either in a register or on the stack. The halves are
/// swapped on big-endian and combined with VMOVDRR.
SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                CCValAssign &NextVA,
                                                SDValue &Root,
                                                SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Thumb1 can only use the low registers for these copies.
  const TargetRegisterClass *RC;
  if (AFI->isThumb1OnlyFunction())
    RC = &ARM::tGPRRegClass;
  else
    RC = &ARM::GPRRegClass;

  // Transform the arguments stored in physical registers into virtual ones.
  Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);

  SDValue ArgValue2;
  if (NextVA.isMemLoc()) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);

    // Create load node to retrieve arguments from the stack.
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
    ArgValue2 = DAG.getLoad(
        MVT::i32, dl, Root, FIN,
  } else {
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
  }
  // Big-endian targets store the high half first.
  if (!Subtarget->isLittle())
    std::swap (ArgValue, ArgValue2);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
}
4078
4079// The remaining GPRs hold either the beginning of variable-argument
4080// data, or the beginning of an aggregate passed by value (usually
4081// byval). Either way, we allocate stack slots adjacent to the data
4082// provided by our caller, and store the unallocated registers there.
4083// If this is a variadic function, the va_list pointer will begin with
4084// these values; otherwise, this reassembles a (byval) structure that
4085// was split between registers and memory.
4086// Return: The frame index registers were stored into.
4087int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4088 const SDLoc &dl, SDValue &Chain,
4089 const Value *OrigArg,
4090 unsigned InRegsParamRecordIdx,
4091 int ArgOffset, unsigned ArgSize) const {
4092 // Currently, two use-cases possible:
4093 // Case #1. Non-var-args function, and we meet first byval parameter.
4094 // Setup first unallocated register as first byval register;
4095 // eat all remained registers
4096 // (these two actions are performed by HandleByVal method).
4097 // Then, here, we initialize stack frame with
4098 // "store-reg" instructions.
4099 // Case #2. Var-args function, that doesn't contain byval parameters.
4100 // The same: eat all remained unallocated registers,
4101 // initialize stack frame.
4102
4103 MachineFunction &MF = DAG.getMachineFunction();
4104 MachineFrameInfo &MFI = MF.getFrameInfo();
4105 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Determine the register range [RBegin, REnd) to spill: either the byval
// span recorded by HandleByVal, or all still-unallocated GPR arg registers.
4106 unsigned RBegin, REnd;
4107 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4108 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4109 } else {
4110 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4111 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4112 REnd = ARM::R4;
4113 }
4114
// When registers are stored, the object starts below the caller-provided
// stack arguments, hence the negative offset relative to the CFA.
4115 if (REnd != RBegin)
4116 ArgOffset = -4 * (ARM::R4 - RBegin);
4117
4118 auto PtrVT = getPointerTy(DAG.getDataLayout());
4119 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4120 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4121
4123 const TargetRegisterClass *RC =
4124 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4125
// Store each live-in argument register into consecutive 4-byte slots.
4126 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4127 Register VReg = MF.addLiveIn(Reg, RC);
4128 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4129 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4130 MachinePointerInfo(OrigArg, 4 * i));
4131 MemOps.push_back(Store);
4132 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4133 }
4134
// Tie all the stores together so later chain users depend on them.
4135 if (!MemOps.empty())
4136 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4137 return FrameIndex;
4138}
4139
4140// Setup stack frame, the va_list pointer will start from.
4141void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4142 const SDLoc &dl, SDValue &Chain,
4143 unsigned ArgOffset,
4144 unsigned TotalArgRegsSaveSize,
4145 bool ForceMutable) const {
4146 MachineFunction &MF = DAG.getMachineFunction();
4147 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4148
4149 // Try to store any remaining integer argument regs
4150 // to their spots on the stack so that they may be loaded by dereferencing
4151 // the result of va_next.
4152 // If there is no regs to be stored, just point address after last
4153 // argument passed via stack.
4154 int FrameIndex = StoreByValRegs(
4155 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4156 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4157 AFI->setVarArgsFrameIndex(FrameIndex);
4158}
4159
4160bool ARMTargetLowering::splitValueIntoRegisterParts(
4161 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4162 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4163 EVT ValueVT = Val.getValueType();
4164 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4165 unsigned ValueBits = ValueVT.getSizeInBits();
4166 unsigned PartBits = PartVT.getSizeInBits();
4167 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4168 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4169 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4170 Parts[0] = Val;
4171 return true;
4172 }
4173 return false;
4174}
4175
4176SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4177 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4178 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4179 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4180 unsigned ValueBits = ValueVT.getSizeInBits();
4181 unsigned PartBits = PartVT.getSizeInBits();
4182 SDValue Val = Parts[0];
4183
4184 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4185 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4186 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4187 return Val;
4188 }
4189 return SDValue();
4190}
4191
// Lower the incoming formal arguments of an ARM function: assign locations
// via the calling convention, materialise register arguments as live-ins /
// CopyFromReg nodes, reload stack arguments from fixed frame objects,
// reassemble split f64/v2f64 values, and spill byval/vararg registers.
// Returns the (possibly updated) entry chain.
4192SDValue ARMTargetLowering::LowerFormalArguments(
4193 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4194 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4195 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4196 MachineFunction &MF = DAG.getMachineFunction();
4197 MachineFrameInfo &MFI = MF.getFrameInfo();
4198
4199 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4200
4201 // Assign locations to all of the incoming arguments.
4203 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4204 *DAG.getContext());
4205 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4206
4208 unsigned CurArgIdx = 0;
4209
4210 // Initially ArgRegsSaveSize is zero.
4211 // Then we increase this value each time we meet byval parameter.
4212 // We also increase this value in case of varargs function.
4213 AFI->setArgRegsSaveSize(0);
4214
4215 // Calculate the amount of stack space that we need to allocate to store
4216 // byval and variadic arguments that are passed in registers.
4217 // We need to know this before we allocate the first byval or variadic
4218 // argument, as they will be allocated a stack slot below the CFA (Canonical
4219 // Frame Address, the stack pointer at entry to the function).
4220 unsigned ArgRegBegin = ARM::R4;
4221 for (const CCValAssign &VA : ArgLocs) {
4222 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4223 break;
4224
4225 unsigned Index = VA.getValNo();
4226 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4227 if (!Flags.isByVal())
4228 continue;
4229
4230 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4231 unsigned RBegin, REnd;
4232 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4233 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4234
4235 CCInfo.nextInRegsParam();
4236 }
4237 CCInfo.rewindByValRegsInfo();
4238
// Variadic functions also save any still-unallocated GPR argument regs, so
// widen the save area down to the first of those as well.
4239 int lastInsIndex = -1;
4240 if (isVarArg && MFI.hasVAStart()) {
4241 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4242 if (RegIdx != std::size(GPRArgRegs))
4243 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4244 }
4245
4246 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4247 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4248 auto PtrVT = getPointerTy(DAG.getDataLayout());
4249
4250 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4251 CCValAssign &VA = ArgLocs[i];
// Keep CurOrigArg/CurArgIdx in sync with the IR argument this loc maps to.
4252 if (Ins[VA.getValNo()].isOrigArg()) {
4253 std::advance(CurOrigArg,
4254 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4255 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4256 }
4257 // Arguments stored in registers.
4258 if (VA.isRegLoc()) {
4259 EVT RegVT = VA.getLocVT();
4260 SDValue ArgValue;
4261
4262 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4263 // f64 and vector types are split up into multiple registers or
4264 // combinations of registers and stack slots.
4265 SDValue ArgValue1 =
4266 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4267 VA = ArgLocs[++i]; // skip ahead to next loc
4268 SDValue ArgValue2;
4269 if (VA.isMemLoc()) {
4270 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4271 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4272 ArgValue2 = DAG.getLoad(
4273 MVT::f64, dl, Chain, FIN,
4275 } else {
4276 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4277 }
// Rebuild the v2f64 from its two f64 halves.
4278 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4279 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4280 ArgValue1, DAG.getIntPtrConstant(0, dl));
4281 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4282 ArgValue2, DAG.getIntPtrConstant(1, dl));
4283 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4284 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4285 } else {
// Plain register argument: pick the register class from the location type.
4286 const TargetRegisterClass *RC;
4287
4288 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4289 RC = &ARM::HPRRegClass;
4290 else if (RegVT == MVT::f32)
4291 RC = &ARM::SPRRegClass;
4292 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4293 RegVT == MVT::v4bf16)
4294 RC = &ARM::DPRRegClass;
4295 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4296 RegVT == MVT::v8bf16)
4297 RC = &ARM::QPRRegClass;
4298 else if (RegVT == MVT::i32)
4299 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4300 : &ARM::GPRRegClass;
4301 else
4302 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4303
4304 // Transform the arguments in physical registers into virtual ones.
4305 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4306 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4307
4308 // If this value is passed in r0 and has the returned attribute (e.g.
4309 // C++ 'structors), record this fact for later use.
4310 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4311 AFI->setPreservesR0();
4312 }
4313 }
4314
4315 // If this is an 8 or 16-bit value, it is really passed promoted
4316 // to 32 bits. Insert an assert[sz]ext to capture this, then
4317 // truncate to the right size.
4318 switch (VA.getLocInfo()) {
4319 default: llvm_unreachable("Unknown loc info!");
4320 case CCValAssign::Full: break;
4321 case CCValAssign::BCvt:
4322 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4323 break;
4324 }
4325
4326 // f16 arguments have their size extended to 4 bytes and passed as if they
4327 // had been copied to the LSBs of a 32-bit register.
4328 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4329 if (VA.needsCustom() &&
4330 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4331 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4332
4333 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4334 // less than 32 bits must be sign- or zero-extended in the callee for
4335 // security reasons. Although the ABI mandates an extension done by the
4336 // caller, the latter cannot be trusted to follow the rules of the ABI.
4337 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4338 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4339 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4340 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4341
4342 InVals.push_back(ArgValue);
4343 } else { // VA.isRegLoc()
4344 // Only arguments passed on the stack should make it here.
4345 assert(VA.isMemLoc());
4346 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4347
4348 int index = VA.getValNo();
4349
4350 // Some Ins[] entries become multiple ArgLoc[] entries.
4351 // Process them only once.
4352 if (index != lastInsIndex)
4353 {
4354 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4355 // FIXME: For now, all byval parameter objects are marked mutable.
4356 // This can be changed with more analysis.
4357 // In case of tail call optimization mark all arguments mutable.
4358 // Since they could be overwritten by lowering of arguments in case of
4359 // a tail call.
4360 if (Flags.isByVal()) {
4361 assert(Ins[index].isOrigArg() &&
4362 "Byval arguments cannot be implicit");
4363 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4364
// Reassemble the register-split part of the byval next to its stack part
// and hand the caller a frame-index pointer to the whole object.
4365 int FrameIndex = StoreByValRegs(
4366 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4367 VA.getLocMemOffset(), Flags.getByValSize());
4368 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4369 CCInfo.nextInRegsParam();
4370 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4371 VA.getValVT() == MVT::bf16)) {
4372 // f16 and bf16 values are passed in the least-significant half of
4373 // a 4 byte stack slot. This is done as-if the extension was done
4374 // in a 32-bit register, so the actual bytes used for the value
4375 // differ between little and big endian.
4376 assert(VA.getLocVT().getSizeInBits() == 32);
4377 unsigned FIOffset = VA.getLocMemOffset();
4378 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4379 FIOffset, true);
4380
4381 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4382 if (DAG.getDataLayout().isBigEndian())
4383 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4384
4385 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4387 DAG.getMachineFunction(), FI)));
4388
4389 } else {
4390 unsigned FIOffset = VA.getLocMemOffset();
4391 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4392 FIOffset, true);
4393
4394 // Create load nodes to retrieve arguments from the stack.
4395 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4396 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4398 DAG.getMachineFunction(), FI)));
4399 }
4400 lastInsIndex = index;
4401 }
4402 }
4403 }
4404
4405 // varargs
4406 if (isVarArg && MFI.hasVAStart()) {
4407 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4408 TotalArgRegsSaveSize);
4409 if (AFI->isCmseNSEntryFunction()) {
4410 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4412 "secure entry function must not be variadic", dl.getDebugLoc()));
4413 }
4414 }
4415
4416 unsigned StackArgSize = CCInfo.getStackSize();
4417 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4418 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4419 // The only way to guarantee a tail call is if the callee restores its
4420 // argument area, but it must also keep the stack aligned when doing so.
4421 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4422 assert(StackAlign && "data layout string is missing stack alignment");
4423 StackArgSize = alignTo(StackArgSize, *StackAlign);
4424
4425 AFI->setArgumentStackToRestore(StackArgSize);
4426 }
4427 AFI->setArgumentStackSize(StackArgSize);
4428
// CMSE entry functions must receive all arguments in registers.
4429 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4430 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4432 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4433 }
4434
4435 return Chain;
4436}
4437
4438/// isFloatingPointZero - Return true if this is +0.0.
// NOTE(review): the function signature and the leading ConstantFPSDNode
// check (listing lines 4439-4440) are missing from this extraction.
4441 return CFP->getValueAPF().isPosZero();
4442 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4443 // Maybe this has already been legalized into the constant pool?
4444 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4445 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4447 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4448 return CFP->getValueAPF().isPosZero();
4449 }
4450 } else if (Op->getOpcode() == ISD::BITCAST &&
4451 Op->getValueType(0) == MVT::f64) {
4452 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4453 // created by LowerConstantFP().
4454 SDValue BitcastOp = Op->getOperand(0);
4455 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4456 isNullConstant(BitcastOp->getOperand(0)))
4457 return true;
4458 }
4459 return false;
4460}
4461
4462/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4463/// the given operands.
4464SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4465 SDValue &ARMcc, SelectionDAG &DAG,
4466 const SDLoc &dl) const {
// If the RHS constant cannot be encoded as a compare immediate, try the
// equivalent comparison against C-1 or C+1, which may encode.
4467 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4468 unsigned C = RHSC->getZExtValue();
4469 if (!isLegalICmpImmediate((int32_t)C)) {
4470 // Constant does not fit, try adjusting it by one.
4471 switch (CC) {
4472 default: break;
4473 case ISD::SETLT:
4474 case ISD::SETGE:
4475 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4476 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4477 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4478 }
4479 break;
4480 case ISD::SETULT:
4481 case ISD::SETUGE:
4482 if (C != 0 && isLegalICmpImmediate(C-1)) {
4483 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4484 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4485 }
4486 break;
4487 case ISD::SETLE:
4488 case ISD::SETGT:
4489 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4490 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4491 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4492 }
4493 break;
4494 case ISD::SETULE:
4495 case ISD::SETUGT:
4496 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4497 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4498 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4499 }
4500 break;
4501 }
4502 }
// NOTE(review): the condition's continuation (listing line 4504) and the
// CC-swap statement (line 4507) are missing from this extraction.
4503 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4505 // In ARM and Thumb-2, the compare instructions can shift their second
4506 // operand.
4508 std::swap(LHS, RHS);
4509 }
4510
4511 // Thumb1 has very limited immediate modes, so turning an "and" into a
4512 // shift can save multiple instructions.
4513 //
4514 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4515 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4516 // own. If it's the operand to an unsigned comparison with an immediate,
4517 // we can eliminate one of the shifts: we transform
4518 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4519 //
4520 // We avoid transforming cases which aren't profitable due to encoding
4521 // details:
4522 //
4523 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4524 // would not; in that case, we're essentially trading one immediate load for
4525 // another.
4526 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4527 // 3. C2 is zero; we have other code for this special case.
4528 //
4529 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4530 // instruction, since the AND is always one instruction anyway, but we could
4531 // use narrow instructions in some cases.
4532 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4533 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4534 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4535 !isSignedIntSetCC(CC)) {
4536 unsigned Mask = LHS.getConstantOperandVal(1);
4537 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4538 uint64_t RHSV = RHSC->getZExtValue();
4539 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4540 unsigned ShiftBits = llvm::countl_zero(Mask);
4541 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4542 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4543 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4544 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4545 }
4546 }
4547 }
4548
4549 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4550 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4551 // way a cmp would.
4552 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4553 // some tweaks to the heuristics for the previous and->shift transform.
4554 // FIXME: Optimize cases where the LHS isn't a shift.
4555 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4556 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4557 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4558 LHS.getConstantOperandVal(1) < 31) {
4559 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4560 SDValue Shift =
4561 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4562 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4563 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4564 return Shift.getValue(1);
4565 }
4566
4568
4569 // If the RHS is a constant zero then the V (overflow) flag will never be
4570 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4571 // simpler for other passes (like the peephole optimiser) to deal with.
// NOTE(review): the CondCode assignments (listing lines 4576 and 4579) are
// missing from this extraction.
4572 if (isNullConstant(RHS)) {
4573 switch (CondCode) {
4574 default: break;
4575 case ARMCC::GE:
4577 break;
4578 case ARMCC::LT:
4580 break;
4581 }
4582 }
4583
// EQ/NE only consult the Z flag, so the cheaper CMPZ node suffices.
4584 unsigned CompareType;
4585 switch (CondCode) {
4586 default:
4587 CompareType = ARMISD::CMP;
4588 break;
4589 case ARMCC::EQ:
4590 case ARMCC::NE:
4591 // Uses only Z Flag
4592 CompareType = ARMISD::CMPZ;
4593 break;
4594 }
4595 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4596 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4597}
4598
4599/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4600SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4601 SelectionDAG &DAG, const SDLoc &dl,
4602 bool Signaling) const {
// f64 compares require the FP64 feature; single precision is always fine.
4603 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4604 SDValue Flags;
// NOTE(review): the guarding `if` (listing line 4605) is missing from this
// extraction; given the `else` branch uses the compare-with-zero forms
// (CMPFPw0/CMPFPEw0), it presumably tests whether RHS is +0.0 — confirm
// against the upstream file.
4606 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4607 LHS, RHS);
4608 else
4609 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4610 FlagsVT, LHS);
// FMSTAT copies the FP status flags into the core flags register.
4611 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4612}
4613
4614// This function returns three things: the arithmetic computation itself
4615// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4616// comparison and the condition code define the case in which the arithmetic
4617// computation *does not* overflow.
4618std::pair<SDValue, SDValue>
4619ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4620 SDValue &ARMcc) const {
4621 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4622
4623 SDValue Value, OverflowCmp;
4624 SDValue LHS = Op.getOperand(0);
4625 SDValue RHS = Op.getOperand(1);
4626 SDLoc dl(Op);
4627
4628 // FIXME: We are currently always generating CMPs because we don't support
4629 // generating CMN through the backend. This is not as good as the natural
4630 // CMP case because it causes a register dependency and cannot be folded
4631 // later.
4632
4633 switch (Op.getOpcode()) {
4634 default:
4635 llvm_unreachable("Unknown overflow instruction!");
4636 case ISD::SADDO:
4637 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4638 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4639 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4640 break;
4641 case ISD::UADDO:
4642 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4643 // We use ADDC here to correspond to its use in LowerALUO.
4644 // We do not use it in the USUBO case as Value may not be used.
4645 Value = DAG.getNode(ARMISD::ADDC, dl,
4646 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4647 .getValue(0);
4648 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4649 break;
4650 case ISD::SSUBO:
4651 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4652 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4653 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4654 break;
4655 case ISD::USUBO:
4656 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4657 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4658 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4659 break;
4660 case ISD::UMULO:
4661 // We generate a UMUL_LOHI and then check if the high word is 0.
4662 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4663 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4664 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4665 LHS, RHS);
4666 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4667 DAG.getConstant(0, dl, MVT::i32));
4668 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4669 break;
4670 case ISD::SMULO:
4671 // We generate a SMUL_LOHI and then check if all the bits of the high word
4672 // are the same as the sign bit of the low word.
4673 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4674 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4675 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4676 LHS, RHS);
4677 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4678 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4679 Value.getValue(0),
4680 DAG.getConstant(31, dl, MVT::i32)));
4681 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4682 break;
4683 } // switch (...)
4684
4685 return std::make_pair(Value, OverflowCmp);
4686}
4687
// Convert a boolean (0/1) carry value into the hardware carry flag.
// NOTE(review): the function signature (listing line 4688) is missing from
// this extraction.
4689 SelectionDAG &DAG) {
4690 SDLoc DL(BoolCarry);
4691 EVT CarryVT = BoolCarry.getValueType();
4692
4693 // This converts the boolean value carry into the carry flag by doing
4694 // ARMISD::SUBC Carry, 1
4695 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4696 DAG.getVTList(CarryVT, MVT::i32),
4697 BoolCarry, DAG.getConstant(1, DL, CarryVT));
// Result 1 of the SUBC node is the flags output.
4698 return Carry.getValue(1);
4699}
4700
// Convert the hardware carry flag back into a boolean (0/1) value of the
// requested type. NOTE(review): the function signature (listing line 4701)
// is missing from this extraction.
4702 SelectionDAG &DAG) {
4703 SDLoc DL(Flags);
4704
4705 // Now convert the carry flag into a boolean carry. We do this
4706 // using ARMISD:ADDE 0, 0, Carry
4707 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4708 DAG.getConstant(0, DL, MVT::i32),
4709 DAG.getConstant(0, DL, MVT::i32), Flags);
4710}
4711
4712SDValue ARMTargetLowering::LowerALUO(SDValue Op, SelectionDAG &DAG) const {
4713 // Let legalize expand this if it isn't a legal type yet.
4714 if (!isTypeLegal(Op.getValueType()))
4715 return SDValue();
4716
4717 SDValue LHS = Op.getOperand(0);
4718 SDValue RHS = Op.getOperand(1);
4719 SDLoc dl(Op);
4720
4721 EVT VT = Op.getValueType();
4722 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4723 SDValue Value;
4724 SDValue Overflow;
4725 switch (Op.getOpcode()) {
4726 case ISD::UADDO:
4727 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4728 // Convert the carry flag into a boolean value.
4729 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4730 break;
4731 case ISD::USUBO:
4732 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4733 // Convert the carry flag into a boolean value.
4734 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4735 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4736 // value. So compute 1 - C.
4737 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4738 DAG.getConstant(1, dl, MVT::i32), Overflow);
4739 break;
4740 default: {
4741 // Handle other operations with getARMXALUOOp
4742 SDValue OverflowCmp, ARMcc;
4743 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4744 // We use 0 and 1 as false and true values.
4745 // ARMcc represents the "no overflow" condition (e.g., VC for signed ops).
4746 // CMOV operand order is (FalseVal, TrueVal), so we put 1 in FalseVal
4747 // position to get Overflow=1 when the "no overflow" condition is false.
4748 Overflow =
4749 DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
4750 DAG.getConstant(1, dl, MVT::i32), // FalseVal: overflow
4751 DAG.getConstant(0, dl, MVT::i32), // TrueVal: no overflow
4752 ARMcc, OverflowCmp);
4753 break;
4754 }
4755 }
4756
4757 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4758}
4759
// Lower i8/i16 saturating add/sub ([US]ADDSAT/[US]SUBSAT) to the ARM DSP
// packed-saturating nodes (QADD8b/QSUB8b/QADD16b/QSUB16b and unsigned
// variants), which require ARMv6+ with DSP and are unavailable in Thumb1.
// NOTE(review): the function signature line (listing line 4760) is missing
// from this extraction.
4761 const ARMSubtarget *Subtarget) {
4762 EVT VT = Op.getValueType();
4763 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4764 return SDValue();
4765 if (!VT.isSimple())
4766 return SDValue();
4767
// Map (type, opcode) to the matching DSP node; bail out for other types.
4768 unsigned NewOpcode;
4769 switch (VT.getSimpleVT().SimpleTy) {
4770 default:
4771 return SDValue();
4772 case MVT::i8:
4773 switch (Op->getOpcode()) {
4774 case ISD::UADDSAT:
4775 NewOpcode = ARMISD::UQADD8b;
4776 break;
4777 case ISD::SADDSAT:
4778 NewOpcode = ARMISD::QADD8b;
4779 break;
4780 case ISD::USUBSAT:
4781 NewOpcode = ARMISD::UQSUB8b;
4782 break;
4783 case ISD::SSUBSAT:
4784 NewOpcode = ARMISD::QSUB8b;
4785 break;
4786 }
4787 break;
4788 case MVT::i16:
4789 switch (Op->getOpcode()) {
4790 case ISD::UADDSAT:
4791 NewOpcode = ARMISD::UQADD16b;
4792 break;
4793 case ISD::SADDSAT:
4794 NewOpcode = ARMISD::QADD16b;
4795 break;
4796 case ISD::USUBSAT:
4797 NewOpcode = ARMISD::UQSUB16b;
4798 break;
4799 case ISD::SSUBSAT:
4800 NewOpcode = ARMISD::QSUB16b;
4801 break;
4802 }
4803 break;
4804 }
4805
// The DSP nodes operate on i32 lanes: sign-extend the narrow operands,
// perform the saturating operation, then truncate back to the source type.
4806 SDLoc dl(Op);
4807 SDValue Add =
4808 DAG.getNode(NewOpcode, dl, MVT::i32,
4809 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4810 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4811 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4812}
4813
4814SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4815 SDValue Cond = Op.getOperand(0);
4816 SDValue SelectTrue = Op.getOperand(1);
4817 SDValue SelectFalse = Op.getOperand(2);
4818 SDLoc dl(Op);
4819 unsigned Opc = Cond.getOpcode();
4820
4821 if (Cond.getResNo() == 1 &&
4822 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4823 Opc == ISD::USUBO)) {
4824 if (!isTypeLegal(Cond->getValueType(0)))
4825 return SDValue();
4826
4827 SDValue Value, OverflowCmp;
4828 SDValue ARMcc;
4829 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4830 EVT VT = Op.getValueType();
4831
4832 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
4833 }
4834
4835 // Convert:
4836 //
4837 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4838 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4839 //
4840 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4841 const ConstantSDNode *CMOVTrue =
4842 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4843 const ConstantSDNode *CMOVFalse =
4844 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4845
4846 if (CMOVTrue && CMOVFalse) {
4847 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4848 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4849
4850 SDValue True;
4851 SDValue False;
4852 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4853 True = SelectTrue;
4854 False = SelectFalse;
4855 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4856 True = SelectFalse;
4857 False = SelectTrue;
4858 }
4859
4860 if (True.getNode() && False.getNode())
4861 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
4862 Cond.getOperand(3), DAG);
4863 }
4864 }
4865
4866 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4867 // undefined bits before doing a full-word comparison with zero.
4868 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4869 DAG.getConstant(1, dl, Cond.getValueType()));
4870
4871 return DAG.getSelectCC(dl, Cond,
4872 DAG.getConstant(0, dl, Cond.getValueType()),
4873 SelectTrue, SelectFalse, ISD::SETNE);
4874}
4875
// Map an ISD floating-point condition code onto the restricted set of
// conditions supported by VSEL (GE/GT/EQ/VS), possibly requesting that the
// compare operands (`swpCmpOps`) and/or the VSEL operands (`swpVselOps`)
// be swapped to make the mapping work. NOTE(review): the function
// signature's first line (listing line 4876) is missing from this
// extraction.
4877 bool &swpCmpOps, bool &swpVselOps) {
4878 // Start by selecting the GE condition code for opcodes that return true for
4879 // 'equality'
4880 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4881 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
4882 CondCode = ARMCC::GE;
4883
4884 // and GT for opcodes that return false for 'equality'.
4885 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4886 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
4887 CondCode = ARMCC::GT;
4888
4889 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4890 // to swap the compare operands.
4891 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4892 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
4893 swpCmpOps = true;
4894
4895 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4896 // If we have an unordered opcode, we need to swap the operands to the VSEL
4897 // instruction (effectively negating the condition).
4898 //
4899 // This also has the effect of swapping which one of 'less' or 'greater'
4900 // returns true, so we also swap the compare operands. It also switches
4901 // whether we return true for 'equality', so we compensate by picking the
4902 // opposite condition code to our original choice.
4903 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4904 CC == ISD::SETUGT) {
4905 swpCmpOps = !swpCmpOps;
4906 swpVselOps = !swpVselOps;
4907 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4908 }
4909
4910 // 'ordered' is 'anything but unordered', so use the VS condition code and
4911 // swap the VSEL operands.
4912 if (CC == ISD::SETO) {
4913 CondCode = ARMCC::VS;
4914 swpVselOps = true;
4915 }
4916
4917 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4918 // code and swap the VSEL operands. Also do this if we don't care about the
4919 // unordered case.
4920 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4921 CondCode = ARMCC::EQ;
4922 swpVselOps = true;
4923 }
4924}
4925
4926SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4927 SDValue TrueVal, SDValue ARMcc,
4928 SDValue Flags, SelectionDAG &DAG) const {
4929 if (!Subtarget->hasFP64() && VT == MVT::f64) {
4930 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4931 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4932 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4933 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4934
4935 SDValue TrueLow = TrueVal.getValue(0);
4936 SDValue TrueHigh = TrueVal.getValue(1);
4937 SDValue FalseLow = FalseVal.getValue(0);
4938 SDValue FalseHigh = FalseVal.getValue(1);
4939
4940 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4941 ARMcc, Flags);
4942 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4943 ARMcc, Flags);
4944
4945 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4946 }
4947 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
4948}
4949
4950static bool isGTorGE(ISD::CondCode CC) {
4951 return CC == ISD::SETGT || CC == ISD::SETGE;
4952}
4953
4954static bool isLTorLE(ISD::CondCode CC) {
4955 return CC == ISD::SETLT || CC == ISD::SETLE;
4956}
4957
4958// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4959// All of these conditions (and their <= and >= counterparts) will do:
4960// x < k ? k : x
4961// x > k ? x : k
4962// k < x ? x : k
4963// k > x ? k : x
4964static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4965 const SDValue TrueVal, const SDValue FalseVal,
4966 const ISD::CondCode CC, const SDValue K) {
4967 return (isGTorGE(CC) &&
4968 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4969 (isLTorLE(CC) &&
4970 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4971}
4972
// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
//
// x < -k ? -k : (x > k ? k : x)
// x < -k ? -k : (x < k ? x : k)
// x > -k ? (x > k ? k : x) : -k
// x < k ? (x < -k ? -k : x) : k
// etc.
//
// LLVM canonicalizes these to either a min(max()) or a max(min())
// pattern. This function tries to match one of these and will return a SSAT
// node if successful.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
// is a power of 2.
  EVT VT = Op.getValueType();
  SDValue V1 = Op.getOperand(0);
  SDValue K1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // The inner select must feed the non-constant arm of the outer one.
  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return SDValue();

  SDValue V2 = Op2.getOperand(0);
  SDValue K2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  SDValue V1Tmp = V1;
  SDValue V2Tmp = V2;

  // Check that the registers and the constants match a max(min()) or min(max())
  // pattern
  if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
      K2 != FalseVal2 ||
      !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
    return SDValue();

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
    return SDValue();

  int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  // The smaller constant must guard the 'less' side, and the upper bound + 1
  // must be a power of two for a saturate instruction to express it.
  if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
      !isPowerOf2_64(PosVal + 1))
    return SDValue();

  // Handle the difference between USAT (unsigned) and SSAT (signed)
  // saturation
  // At this point, PosVal is guaranteed to be positive
  uint64_t K = PosVal;
  SDLoc dl(Op);
  // Bounds [~k, k]: signed saturate to countr_one(K) (+1 sign) bits.
  if (Val1 == ~Val2)
    return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
                       DAG.getConstant(llvm::countr_one(K), dl, VT));
  // Bounds [0, k]: unsigned saturate.
  if (NegVal == 0)
    return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
                       DAG.getConstant(llvm::countr_one(K), dl, VT));

  return SDValue();
}
5047
// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, that's up to the caller
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
                                         SDValue &SatK)
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

                              ? &RHS
                              : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  // KTmp is the constant select arm; V is the other (variable) arm. VTmp is
  // whichever compare operand is not the constant K points at.
  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  // Finally require the compare direction and arm placement to form a
  // lower-saturating pattern before reporting the match to the caller.
  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}
5089
5090bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5091 if (VT == MVT::f32)
5092 return !Subtarget->hasVFP2Base();
5093 if (VT == MVT::f64)
5094 return !Subtarget->hasFP64();
5095 if (VT == MVT::f16)
5096 return !Subtarget->hasFullFP16();
5097 return false;
5098}
5099
/// Lower an ISD::SELECT_CC node. Tries, in order: SSAT/USAT formation,
/// bit-trick lowering of single-sided saturation, integer SMAX/SMIN-vs-zero
/// shift tricks, ARMv8.1-M CSINV/CSNEG/CSINC, softening of unsupported FP
/// types, VSEL on integer-compare ARMv8, and finally generic CMOV(s).
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
    if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
      return SatValue;

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  SDValue SatValue;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // Sign-mask of x: all ones when x < 0, zero otherwise.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      // max(x, 0) == x & ~(x >> 31)
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      // max(x, -1) == x | (x >> 31)
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
  ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
  if (Op.getValueType().isInteger()) {

    // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
    // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
    // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
    // Both require less instructions than compare and conditional select.
    if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
        RHSC->isZero() && CFVal && CFVal->isZero() &&
        LHS.getValueType() == RHS.getValueType()) {
      EVT VT = LHS.getValueType();
      SDValue Shift =
          DAG.getNode(ISD::SRA, dl, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));

      if (CC == ISD::SETGT)
        Shift = DAG.getNOT(dl, Shift, VT);

      return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
    }
  }

  // ARMv8.1-M conditional-select instructions: the false value is derivable
  // from the true value by inversion (CSINV), negation (CSNEG) or
  // incrementing (CSINC), so only one constant needs materializing.
  if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
      LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
    unsigned TVal = CTVal->getZExtValue();
    unsigned FVal = CFVal->getZExtValue();
    unsigned Opcode = 0;

    if (TVal == ~FVal) {
      Opcode = ARMISD::CSINV;
    } else if (TVal == ~FVal + 1) {
      Opcode = ARMISD::CSNEG;
    } else if (TVal + 1 == FVal) {
      Opcode = ARMISD::CSINC;
    } else if (TVal == FVal + 1) {
      Opcode = ARMISD::CSINC;
      std::swap(TrueVal, FalseVal);
      std::swap(TVal, FVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    }

    if (Opcode) {
      // If one of the constants is cheaper than another, materialise the
      // cheaper one and let the csel generate the other.
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Attempt to use ZR checking TVal is 0, possibly inverting the condition
      // to get there. CSINC is not invertible like the other two (~(~a) == a,
      // -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Drops F's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does now support
    if (ARMcc->getAsZExtVal() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
  if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
  // Some FP condition codes need a second compare/select (e.g. ONE, UEQ).
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
  }
  return Result;
}
5281
5282/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5283/// to morph to an integer compare sequence.
5284static bool canChangeToInt(SDValue Op, bool &SeenZero,
5285 const ARMSubtarget *Subtarget) {
5286 SDNode *N = Op.getNode();
5287 if (!N->hasOneUse())
5288 // Otherwise it requires moving the value from fp to integer registers.
5289 return false;
5290 if (!N->getNumValues())
5291 return false;
5292 EVT VT = Op.getValueType();
5293 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5294 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5295 // vmrs are very slow, e.g. cortex-a8.
5296 return false;
5297
5298 if (isFloatingPointZero(Op)) {
5299 SeenZero = true;
5300 return true;
5301 }
5302 return ISD::isNormalLoad(N);
5303}
5304
    // FP zero becomes the integer constant 0 directly.
    return DAG.getConstant(0, SDLoc(Op), MVT::i32);

    // A normal f32 load is re-issued as an i32 load of the same memory.
    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
                       Ld->getPointerInfo(), Ld->getAlign(),
                       Ld->getMemOperand()->getFlags());

  // canChangeToInt() guarantees only zero / normal-load operands reach here.
  llvm_unreachable("Unknown VFP cmp argument!");
}
5316
                           SDValue &RetVal1, SDValue &RetVal2) {
  // Expands an f64 compare operand into its two i32 words: RetVal1 gets the
  // word at the base address, RetVal2 the word at offset +4.
  SDLoc dl(Op);

  // FP zero: both words are the integer constant 0.
  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  // A normal f64 load is re-issued as two i32 loads of the same memory.
  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlign(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4),
                          commonAlignment(Ld->getAlign(), 4),
                          Ld->getMemOperand()->getFlags());
    return;
  }

  // canChangeToInt() guarantees only zero / normal-load operands reach here.
  llvm_unreachable("Unknown VFP cmp argument!");
}
5345
/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  // One side must be an FP zero so that masking off the sign bit below makes
  // +0.0 and -0.0 compare equal under the integer compare.
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // Mask clearing the IEEE sign bit of a 32-bit word.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
                         Cmp);
    }

    // f64: compare the two word pairs, masking the sign bit off the high
    // words only.
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
  }

  return SDValue();
}
5396
5397// Generate CMP + CMOV for integer abs.
5398SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5399 SDLoc DL(Op);
5400
5401 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5402
5403 // Generate CMP & CMOV.
5404 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5405 DAG.getConstant(0, DL, MVT::i32));
5406 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5407 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5408}
5409
/// Lower ISD::BRCOND. Only handles the case where the condition is the
/// overflow bit of a {s|u}{add|sub|mul}.with.overflow node, folding the
/// overflow check directly into the conditional branch; otherwise returns
/// an empty SDValue so generic lowering takes over.
SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = Cond.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  // Result #1 of the XALU node is its overflow flag.
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul)) {
    // Only lower legal XALUO ops.
    if (!isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);

    // Reverse the condition code.
        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
                       OverflowCmp);
  }

  return SDValue();
}
5445
/// Lower ISD::BR_CC: soften unsupported FP types, fold overflow-flag
/// comparisons into the branch, emit an integer compare+branch for i32, try
/// the integer-morph of FP compares, and otherwise emit VFP compare plus one
/// or two conditional branches.
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  // Matches (overflow-flag ==/!= 0/1) where the flag is result #1 of an
  // XALU node, so the overflow compare can drive the branch directly.
  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
      // Reverse the condition code.
          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    }

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
                       OverflowCmp);
  }

  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
  }

  // With nnan and IEEE denormals, some FP equality branches can be morphed
  // into integer compares (see OptimizeVFPBrcond).
  SDNodeFlags Flags = Op->getFlags();
  if (Flags.hasNoNaNs() &&
      DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
      DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
       CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
  // Some FP condition codes need a second branch (e.g. ONE, UEQ).
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
    Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
  }
  return Res;
}
5525
/// Lower ISD::BR_JT (jump-table branch) by computing the address of the
/// selected 4-byte table entry and emitting the appropriate jump form for
/// the target mode (two-level for Thumb2/v8-M, offset-based for PIC/ROPI,
/// absolute otherwise).
SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  // Each jump-table entry is 4 bytes wide.
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
    // which does another jump to the destination. This also makes it easier
    // to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    // PIC/ROPI: table entries hold offsets, so load the entry and add it
    // back to the table base to form the destination address.
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    // Non-PIC: table entries hold absolute destination addresses.
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}
5561
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // i32-element results convert directly from f32 vectors; anything else with
  // i32 elements is unrolled to scalar conversions.
  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();

  // Pick the same-width integer vector type to convert into, then truncate
  // to the narrower requested result type.
  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}
5591
/// Lower FP_TO_SINT/FP_TO_UINT (and their STRICT_ variants). Vectors go to
/// the vector path; unsupported source FP types become libcalls; strict ops
/// are temporarily lowered via their non-strict equivalents.
SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  // Strict ops carry the chain as operand 0; the FP value follows it.
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (isUnsupportedFloatingType(SrcVal.getValueType())) {
    // No native support for the source type: emit the matching libcall.
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT ||
        Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
                              Op.getValueType());
    SDLoc Loc(Op);
    MakeLibCallOptions CallOptions;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
                                          CallOptions, Loc, Chain);
    return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  }

  // FIXME: Remove this when we have strict fp instruction selection patterns
  if (IsStrict) {
    SDLoc Loc(Op);
    SDValue Result =
        Loc, Op.getValueType(), SrcVal);
    return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  }

  return Op;
}
5630
                                  const ARMSubtarget *Subtarget) {
  // Lower FP_TO_{S,U}INT_SAT. Combinations with native saturating-convert
  // support are returned unchanged; MVE vector cases with a narrower
  // saturation width are emitted as a full-width convert followed by
  // explicit min/max clamping.
  EVT VT = Op.getValueType();
  EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  EVT FromVT = Op.getOperand(0).getValueType();

  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
    return Op;
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
      Subtarget->hasFP64())
    return Op;
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
      Subtarget->hasFullFP16())
    return Op;
  if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
      Subtarget->hasMVEFloatOps())
    return Op;
  if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
      Subtarget->hasMVEFloatOps())
    return Op;

  if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
    return SDValue();

  SDLoc DL(Op);
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
  // Usable magnitude bits: signed saturation loses one bit to the sign.
  unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
  SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                            DAG.getValueType(VT.getScalarType()));
  // Clamp to [0, 2^BW - 1] (unsigned) or [-2^BW, 2^BW - 1] (signed).
  SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
                            DAG.getConstant((1 << BW) - 1, DL, VT));
  if (IsSigned)
    Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
                      DAG.getSignedConstant(-(1 << BW), DL, VT));
  return Max;
}
5667
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // i32-element sources convert directly to f32 vectors; other element types
  // with i32-sized lanes are unrolled to scalar conversions.
  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();

  // Pick the integer vector type matching the FP result width; the narrow
  // source is extended to it before converting.
  EVT DestVecType;
  if (VT == MVT::v4f32)
    DestVecType = MVT::v4i32;
  else if (VT == MVT::v4f16 && HasFullFP16)
    DestVecType = MVT::v4i16;
  else if (VT == MVT::v8f16 && HasFullFP16)
    DestVecType = MVT::v8i16;
  else
    return DAG.UnrollVectorOp(Op.getNode());

  // Sign- or zero-extend according to the signedness of the conversion.
  unsigned CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    CastOpc = ISD::SIGN_EXTEND;
    break;
  case ISD::UINT_TO_FP:
    CastOpc = ISD::ZERO_EXTEND;
    break;
  }

  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}
5711
5712SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5713 EVT VT = Op.getValueType();
5714 if (VT.isVector())
5715 return LowerVectorINT_TO_FP(Op, DAG);
5716 if (isUnsupportedFloatingType(VT)) {
5717 RTLIB::Libcall LC;
5718 if (Op.getOpcode() == ISD::SINT_TO_FP)
5719 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5720 Op.getValueType());
5721 else
5722 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5723 Op.getValueType());
5724 MakeLibCallOptions CallOptions;
5725 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5726 CallOptions, SDLoc(Op)).first;
5727 }
5728
5729 return Op;
5730}
5731
/// Lower ISD::FCOPYSIGN. With NEON available (and the magnitude operand not
/// already in GPRs), the sign bit is blended in with vector mask operations;
/// otherwise the sign bit is transplanted with integer AND/OR on the raw
/// bit patterns.
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDValue Tmp0 = Op.getOperand(0);
  SDValue Tmp1 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
               Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
    // Mask holding only the sign bit of each lane.
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    if (VT == MVT::f64)
      // Shift the 32-bit sign mask up into the high word of the 64-bit lane.
      Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      // Move the f64 sign bit down into the f32 sign-bit position.
      Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

                              dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    // Bit-select: sign bit from Tmp1, all remaining bits from Tmp0.
    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32.
  if (SrcVT == MVT::f64)
    // Only the high word (value #1) carries the f64 sign bit.
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}
5813
5814SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5815 MachineFunction &MF = DAG.getMachineFunction();
5816 MachineFrameInfo &MFI = MF.getFrameInfo();
5817 MFI.setReturnAddressIsTaken(true);
5818
5819 EVT VT = Op.getValueType();
5820 SDLoc dl(Op);
5821 unsigned Depth = Op.getConstantOperandVal(0);
5822 if (Depth) {
5823 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5824 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5825 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5826 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5827 MachinePointerInfo());
5828 }
5829
5830 // Return LR, which contains the return address. Mark it an implicit live-in.
5831 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5832 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5833}
5834
5835SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5836 const ARMBaseRegisterInfo &ARI =
5837 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5838 MachineFunction &MF = DAG.getMachineFunction();
5839 MachineFrameInfo &MFI = MF.getFrameInfo();
5840 MFI.setFrameAddressIsTaken(true);
5841
5842 EVT VT = Op.getValueType();
5843 SDLoc dl(Op); // FIXME probably not meaningful
5844 unsigned Depth = Op.getConstantOperandVal(0);
5845 Register FrameReg = ARI.getFrameRegister(MF);
5846 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5847 while (Depth--)
5848 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5849 MachinePointerInfo());
5850 return FrameAddr;
5851}
5852
5853// FIXME? Maybe this could be a TableGen attribute on some registers and
5854// this table could be generated automatically from RegInfo.
5855Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5856 const MachineFunction &MF) const {
5857 return StringSwitch<Register>(RegName)
5858 .Case("sp", ARM::SP)
5859 .Default(Register());
5860}
5861
5862// Result is 64 bit value so split into two 32 bit values and return as a
5863// pair of values.
5865 SelectionDAG &DAG) {
5866 SDLoc DL(N);
5867
5868 // This function is only supposed to be called for i64 type destination.
5869 assert(N->getValueType(0) == MVT::i64
5870 && "ExpandREAD_REGISTER called for non-i64 type result.");
5871
5873 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5874 N->getOperand(0),
5875 N->getOperand(1));
5876
5877 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5878 Read.getValue(1)));
5879 Results.push_back(Read.getValue(2)); // Chain
5880}
5881
5882/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5883/// When \p DstVT, the destination type of \p BC, is on the vector
5884/// register bank and the source of bitcast, \p Op, operates on the same bank,
5885/// it might be possible to combine them, such that everything stays on the
5886/// vector register bank.
5887/// \p return The node that would replace \p BT, if the combine
5888/// is possible.
5890 SelectionDAG &DAG) {
5891 SDValue Op = BC->getOperand(0);
5892 EVT DstVT = BC->getValueType(0);
5893
5894 // The only vector instruction that can produce a scalar (remember,
5895 // since the bitcast was about to be turned into VMOVDRR, the source
5896 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5897 // Moreover, we can do this combine only if there is one use.
5898 // Finally, if the destination type is not a vector, there is not
5899 // much point on forcing everything on the vector bank.
5900 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5901 !Op.hasOneUse())
5902 return SDValue();
5903
5904 // If the index is not constant, we will introduce an additional
5905 // multiply that will stick.
5906 // Give up in that case.
5907 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5908 if (!Index)
5909 return SDValue();
5910 unsigned DstNumElt = DstVT.getVectorNumElements();
5911
5912 // Compute the new index.
5913 const APInt &APIntIndex = Index->getAPIntValue();
5914 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
5915 NewIndex *= APIntIndex;
5916 // Check if the new constant index fits into i32.
5917 if (NewIndex.getBitWidth() > 32)
5918 return SDValue();
5919
5920 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
5921 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
5922 SDLoc dl(Op);
5923 SDValue ExtractSrc = Op.getOperand(0);
5924 EVT VecVT = EVT::getVectorVT(
5925 *DAG.getContext(), DstVT.getScalarType(),
5926 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
5927 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
5928 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
5929 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
5930}
5931
5932/// ExpandBITCAST - If the target supports VFP, this function is called to
5933/// expand a bit convert where either the source or destination type is i64 to
5934/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
5935/// operand type is illegal (e.g., v2f32 for a target that doesn't support
5936/// vectors), since the legalizer won't know what to do with that.
5937SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
5938 const ARMSubtarget *Subtarget) const {
5939 SDLoc dl(N);
5940 SDValue Op = N->getOperand(0);
5941
5942 // This function is only supposed to be called for i16 and i64 types, either
5943 // as the source or destination of the bit convert.
5944 EVT SrcVT = Op.getValueType();
5945 EVT DstVT = N->getValueType(0);
5946
5947 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
5948 (DstVT == MVT::f16 || DstVT == MVT::bf16))
5949 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
5950 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
5951
5952 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
5953 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
5954 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
5955 Op = DAG.getBitcast(MVT::f16, Op);
5956 return DAG.getNode(
5957 ISD::TRUNCATE, SDLoc(N), DstVT,
5958 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
5959 }
5960
5961 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
5962 return SDValue();
5963
5964 // Turn i64->f64 into VMOVDRR.
5965 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
5966 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
5967 // if we can combine the bitcast with its source.
5969 return Val;
5970 SDValue Lo, Hi;
5971 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
5972 return DAG.getNode(ISD::BITCAST, dl, DstVT,
5973 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
5974 }
5975
5976 // Turn f64->i64 into VMOVRRD.
5977 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
5978 SDValue Cvt;
5979 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
5980 SrcVT.getVectorNumElements() > 1)
5981 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5982 DAG.getVTList(MVT::i32, MVT::i32),
5983 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
5984 else
5985 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5986 DAG.getVTList(MVT::i32, MVT::i32), Op);
5987 // Merge the pieces into a single i64 value.
5988 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
5989 }
5990
5991 return SDValue();
5992}
5993
5994/// getZeroVector - Returns a vector of specified type with all zero elements.
5995/// Zero vectors are used to represent vector negation and in those cases
5996/// will be implemented with the NEON VNEG instruction. However, VNEG does
5997/// not support i64 elements, so sometimes the zero vectors will need to be
5998/// explicitly constructed. Regardless, use a canonical VMOV to create the
5999/// zero vector.
6000static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6001 assert(VT.isVector() && "Expected a vector type");
6002 // The canonical modified immediate encoding of a zero vector is....0!
6003 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6004 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6005 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6006 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6007}
6008
/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;
  // SRA_PARTS shifts the high word arithmetically, SRL_PARTS logically.
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  // RevShAmt = VTBits - ShAmt and ExtraShAmt = ShAmt - VTBits.  For shift
  // amounts < VTBits the low result is (Lo >> ShAmt) | (Hi << RevShAmt);
  // otherwise (ExtraShAmt >= 0) it is Hi shifted by ExtraShAmt.
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  // Select between the small- and big-shift results with a CMOV on
  // ExtraShAmt >= 0.
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Lo =
      DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);

  // High result: Hi shifted by ShAmt for small shifts; for big shifts it is
  // the sign fill (SRA by VTBits-1) or zero (SRL).
  SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue HiBigShift = Opc == ISD::SRA
                           ? DAG.getNode(Opc, dl, VT, ShOpHi,
                                         DAG.getConstant(VTBits - 1, dl, VT))
                           : DAG.getConstant(0, dl, VT);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi =
      DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
6051
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue ARMcc;

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  // For shift amounts < VTBits the high result is
  // (Hi << ShAmt) | (Lo >> (VTBits - ShAmt)).
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);

  // For shift amounts >= VTBits (ExtraShAmt >= 0) the high result is
  // Lo << (ShAmt - VTBits); select between the two with a CMOV.
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
                                   DAG.getConstant(VTBits, dl, MVT::i32));
  SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue Hi =
      DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);

  // The low result is Lo << ShAmt for small shifts and zero otherwise.
  SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
                            ISD::SETGE, ARMcc, DAG, dl);
  SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
                           DAG.getConstant(0, dl, VT), ARMcc, CmpLo);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
6089
6090SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6091 SelectionDAG &DAG) const {
6092 // The rounding mode is in bits 23:22 of the FPSCR.
6093 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6094 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
6095 // so that the shift + and get folded into a bitfield extract.
6096 SDLoc dl(Op);
6097 SDValue Chain = Op.getOperand(0);
6098 SDValue Ops[] = {Chain,
6099 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6100
6101 SDValue FPSCR =
6102 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6103 Chain = FPSCR.getValue(1);
6104 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6105 DAG.getConstant(1U << 22, dl, MVT::i32));
6106 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6107 DAG.getConstant(22, dl, MVT::i32));
6108 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6109 DAG.getConstant(3, dl, MVT::i32));
6110 return DAG.getMergeValues({And, Chain}, dl);
6111}
6112
6113SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6114 SelectionDAG &DAG) const {
6115 SDLoc DL(Op);
6116 SDValue Chain = Op->getOperand(0);
6117 SDValue RMValue = Op->getOperand(1);
6118
6119 // The rounding mode is in bits 23:22 of the FPSCR.
6120 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6121 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6122 // ((arg - 1) & 3) << 22).
6123 //
6124 // It is expected that the argument of llvm.set.rounding is within the
6125 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6126 // responsibility of the code generated llvm.set.rounding to ensure this
6127 // condition.
6128
6129 // Calculate new value of FPSCR[23:22].
6130 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6131 DAG.getConstant(1, DL, MVT::i32));
6132 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6133 DAG.getConstant(0x3, DL, MVT::i32));
6134 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6135 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6136
6137 // Get current value of FPSCR.
6138 SDValue Ops[] = {Chain,
6139 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6140 SDValue FPSCR =
6141 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6142 Chain = FPSCR.getValue(1);
6143 FPSCR = FPSCR.getValue(0);
6144
6145 // Put new rounding mode into FPSCR[23:22].
6146 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6147 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6148 DAG.getConstant(RMMask, DL, MVT::i32));
6149 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6150 SDValue Ops2[] = {
6151 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6152 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6153}
6154
6155SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6156 SelectionDAG &DAG) const {
6157 SDLoc DL(Op);
6158 SDValue Chain = Op->getOperand(0);
6159 SDValue Mode = Op->getOperand(1);
6160
6161 // Generate nodes to build:
6162 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6163 SDValue Ops[] = {Chain,
6164 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6165 SDValue FPSCR =
6166 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6167 Chain = FPSCR.getValue(1);
6168 FPSCR = FPSCR.getValue(0);
6169
6170 SDValue FPSCRMasked =
6171 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6172 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6173 SDValue InputMasked =
6174 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6175 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6176 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6177
6178 SDValue Ops2[] = {
6179 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6180 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6181}
6182
6183SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6184 SelectionDAG &DAG) const {
6185 SDLoc DL(Op);
6186 SDValue Chain = Op->getOperand(0);
6187
6188 // To get the default FP mode all control bits are cleared:
6189 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6190 SDValue Ops[] = {Chain,
6191 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6192 SDValue FPSCR =
6193 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6194 Chain = FPSCR.getValue(1);
6195 FPSCR = FPSCR.getValue(0);
6196
6197 SDValue FPSCRMasked = DAG.getNode(
6198 ISD::AND, DL, MVT::i32, FPSCR,
6200 SDValue Ops2[] = {Chain,
6201 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6202 FPSCRMasked};
6203 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6204}
6205
6207 const ARMSubtarget *ST) {
6208 SDLoc dl(N);
6209 EVT VT = N->getValueType(0);
6210 if (VT.isVector() && ST->hasNEON()) {
6211
6212 // Compute the least significant set bit: LSB = X & -X
6213 SDValue X = N->getOperand(0);
6214 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6215 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6216
6217 EVT ElemTy = VT.getVectorElementType();
6218
6219 if (ElemTy == MVT::i8) {
6220 // Compute with: cttz(x) = ctpop(lsb - 1)
6221 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6222 DAG.getTargetConstant(1, dl, ElemTy));
6223 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6224 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6225 }
6226
6227 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6228 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6229 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6230 unsigned NumBits = ElemTy.getSizeInBits();
6231 SDValue WidthMinus1 =
6232 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6233 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6234 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6235 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6236 }
6237
6238 // Compute with: cttz(x) = ctpop(lsb - 1)
6239
6240 // Compute LSB - 1.
6241 SDValue Bits;
6242 if (ElemTy == MVT::i64) {
6243 // Load constant 0xffff'ffff'ffff'ffff to register.
6244 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6245 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6246 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6247 } else {
6248 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6249 DAG.getTargetConstant(1, dl, ElemTy));
6250 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6251 }
6252 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6253 }
6254
6255 if (!ST->hasV6T2Ops())
6256 return SDValue();
6257
6258 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6259 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6260}
6261
6263 const ARMSubtarget *ST) {
6264 EVT VT = N->getValueType(0);
6265 SDLoc DL(N);
6266
6267 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6268 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6269 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6270 "Unexpected type for custom ctpop lowering");
6271
6272 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6273 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6274 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6275 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6276
6277 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6278 unsigned EltSize = 8;
6279 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6280 while (EltSize != VT.getScalarSizeInBits()) {
6282 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6283 TLI.getPointerTy(DAG.getDataLayout())));
6284 Ops.push_back(Res);
6285
6286 EltSize *= 2;
6287 NumElts /= 2;
6288 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6289 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6290 }
6291
6292 return Res;
6293}
6294
6295/// Getvshiftimm - Check if this is a valid build_vector for the immediate
6296/// operand of a vector shift operation, where all the elements of the
6297/// build_vector must have the same constant integer value.
6298static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6299 // Ignore bit_converts.
6300 while (Op.getOpcode() == ISD::BITCAST)
6301 Op = Op.getOperand(0);
6303 APInt SplatBits, SplatUndef;
6304 unsigned SplatBitSize;
6305 bool HasAnyUndefs;
6306 if (!BVN ||
6307 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6308 ElementBits) ||
6309 SplatBitSize > ElementBits)
6310 return false;
6311 Cnt = SplatBits.getSExtValue();
6312 return true;
6313}
6314
6315/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6316/// operand of a vector shift left operation. That value must be in the range:
6317/// 0 <= Value < ElementBits for a left shift; or
6318/// 0 <= Value <= ElementBits for a long left shift.
6319static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6320 assert(VT.isVector() && "vector shift count is not a vector type");
6321 int64_t ElementBits = VT.getScalarSizeInBits();
6322 if (!getVShiftImm(Op, ElementBits, Cnt))
6323 return false;
6324 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6325}
6326
6327/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6328/// operand of a vector shift right operation. For a shift opcode, the value
6329/// is positive, but for an intrinsic the value count must be negative. The
6330/// absolute value must be in the range:
6331/// 1 <= |Value| <= ElementBits for a right shift; or
6332/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6333static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6334 int64_t &Cnt) {
6335 assert(VT.isVector() && "vector shift count is not a vector type");
6336 int64_t ElementBits = VT.getScalarSizeInBits();
6337 if (!getVShiftImm(Op, ElementBits, Cnt))
6338 return false;
6339 if (!isIntrinsic)
6340 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6341 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6342 Cnt = -Cnt;
6343 return true;
6344 }
6345 return false;
6346}
6347
6349 const ARMSubtarget *ST) {
6350 EVT VT = N->getValueType(0);
6351 SDLoc dl(N);
6352 int64_t Cnt;
6353
6354 if (!VT.isVector())
6355 return SDValue();
6356
6357 // We essentially have two forms here. Shift by an immediate and shift by a
6358 // vector register (there are also shift by a gpr, but that is just handled
6359 // with a tablegen pattern). We cannot easily match shift by an immediate in
6360 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6361 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6362 // signed or unsigned, and a negative shift indicates a shift right).
6363 if (N->getOpcode() == ISD::SHL) {
6364 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6365 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6366 DAG.getConstant(Cnt, dl, MVT::i32));
6367 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6368 N->getOperand(1));
6369 }
6370
6371 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6372 "unexpected vector shift opcode");
6373
6374 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6375 unsigned VShiftOpc =
6376 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6377 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6378 DAG.getConstant(Cnt, dl, MVT::i32));
6379 }
6380
6381 // Other right shifts we don't have operations for (we use a shift left by a
6382 // negative number).
6383 EVT ShiftVT = N->getOperand(1).getValueType();
6384 SDValue NegatedCount = DAG.getNode(
6385 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6386 unsigned VShiftOpc =
6387 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6388 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6389}
6390
6392 const ARMSubtarget *ST) {
6393 EVT VT = N->getValueType(0);
6394 SDLoc dl(N);
6395
6396 // We can get here for a node like i32 = ISD::SHL i32, i64
6397 if (VT != MVT::i64)
6398 return SDValue();
6399
6400 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6401 N->getOpcode() == ISD::SHL) &&
6402 "Unknown shift to lower!");
6403
6404 unsigned ShOpc = N->getOpcode();
6405 if (ST->hasMVEIntegerOps()) {
6406 SDValue ShAmt = N->getOperand(1);
6407 unsigned ShPartsOpc = ARMISD::LSLL;
6409
6410 // If the shift amount is greater than 32 or has a greater bitwidth than 64
6411 // then do the default optimisation
6412 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6413 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6414 return SDValue();
6415
6416 // Extract the lower 32 bits of the shift amount if it's not an i32
6417 if (ShAmt->getValueType(0) != MVT::i32)
6418 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6419
6420 if (ShOpc == ISD::SRL) {
6421 if (!Con)
6422 // There is no t2LSRLr instruction so negate and perform an lsll if the
6423 // shift amount is in a register, emulating a right shift.
6424 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6425 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6426 else
6427 // Else generate an lsrl on the immediate shift amount
6428 ShPartsOpc = ARMISD::LSRL;
6429 } else if (ShOpc == ISD::SRA)
6430 ShPartsOpc = ARMISD::ASRL;
6431
6432 // Split Lower/Upper 32 bits of the destination/source
6433 SDValue Lo, Hi;
6434 std::tie(Lo, Hi) =
6435 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6436 // Generate the shift operation as computed above
6437 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6438 ShAmt);
6439 // The upper 32 bits come from the second return value of lsll
6440 Hi = SDValue(Lo.getNode(), 1);
6441 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6442 }
6443
6444 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6445 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6446 return SDValue();
6447
6448 // If we are in thumb mode, we don't have RRX.
6449 if (ST->isThumb1Only())
6450 return SDValue();
6451
6452 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6453 SDValue Lo, Hi;
6454 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6455
6456 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6457 // captures the shifted out bit into a carry flag.
6458 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6459 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6460
6461 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6462 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6463
6464 // Merge the pieces into a single i64 value.
6465 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6466}
6467
6469 const ARMSubtarget *ST) {
6470 bool Invert = false;
6471 bool Swap = false;
6472 unsigned Opc = ARMCC::AL;
6473
6474 SDValue Op0 = Op.getOperand(0);
6475 SDValue Op1 = Op.getOperand(1);
6476 SDValue CC = Op.getOperand(2);
6477 EVT VT = Op.getValueType();
6478 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6479 SDLoc dl(Op);
6480
6481 EVT CmpVT;
6482 if (ST->hasNEON())
6484 else {
6485 assert(ST->hasMVEIntegerOps() &&
6486 "No hardware support for integer vector comparison!");
6487
6488 if (Op.getValueType().getVectorElementType() != MVT::i1)
6489 return SDValue();
6490
6491 // Make sure we expand floating point setcc to scalar if we do not have
6492 // mve.fp, so that we can handle them from there.
6493 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6494 return SDValue();
6495
6496 CmpVT = VT;
6497 }
6498
6499 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6500 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6501 // Special-case integer 64-bit equality comparisons. They aren't legal,
6502 // but they can be lowered with a few vector instructions.
6503 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6504 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6505 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6506 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6507 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6508 DAG.getCondCode(ISD::SETEQ));
6509 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6510 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6511 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6512 if (SetCCOpcode == ISD::SETNE)
6513 Merged = DAG.getNOT(dl, Merged, CmpVT);
6514 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6515 return Merged;
6516 }
6517
6518 if (CmpVT.getVectorElementType() == MVT::i64)
6519 // 64-bit comparisons are not legal in general.
6520 return SDValue();
6521
6522 if (Op1.getValueType().isFloatingPoint()) {
6523 switch (SetCCOpcode) {
6524 default: llvm_unreachable("Illegal FP comparison");
6525 case ISD::SETUNE:
6526 case ISD::SETNE:
6527 if (ST->hasMVEFloatOps()) {
6528 Opc = ARMCC::NE; break;
6529 } else {
6530 Invert = true; [[fallthrough]];
6531 }
6532 case ISD::SETOEQ:
6533 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6534 case ISD::SETOLT:
6535 case ISD::SETLT: Swap = true; [[fallthrough]];
6536 case ISD::SETOGT:
6537 case ISD::SETGT: Opc = ARMCC::GT; break;
6538 case ISD::SETOLE:
6539 case ISD::SETLE: Swap = true; [[fallthrough]];
6540 case ISD::SETOGE:
6541 case ISD::SETGE: Opc = ARMCC::GE; break;
6542 case ISD::SETUGE: Swap = true; [[fallthrough]];
6543 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6544 case ISD::SETUGT: Swap = true; [[fallthrough]];
6545 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6546 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6547 case ISD::SETONE: {
6548 // Expand this to (OLT | OGT).
6549 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6550 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6551 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6552 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6553 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6554 if (Invert)
6555 Result = DAG.getNOT(dl, Result, VT);
6556 return Result;
6557 }
6558 case ISD::SETUO: Invert = true; [[fallthrough]];
6559 case ISD::SETO: {
6560 // Expand this to (OLT | OGE).
6561 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6562 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6563 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6564 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6565 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6566 if (Invert)
6567 Result = DAG.getNOT(dl, Result, VT);
6568 return Result;
6569 }
6570 }
6571 } else {
6572 // Integer comparisons.
6573 switch (SetCCOpcode) {
6574 default: llvm_unreachable("Illegal integer comparison");
6575 case ISD::SETNE:
6576 if (ST->hasMVEIntegerOps()) {
6577 Opc = ARMCC::NE; break;
6578 } else {
6579 Invert = true; [[fallthrough]];
6580 }
6581 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6582 case ISD::SETLT: Swap = true; [[fallthrough]];
6583 case ISD::SETGT: Opc = ARMCC::GT; break;
6584 case ISD::SETLE: Swap = true; [[fallthrough]];
6585 case ISD::SETGE: Opc = ARMCC::GE; break;
6586 case ISD::SETULT: Swap = true; [[fallthrough]];
6587 case ISD::SETUGT: Opc = ARMCC::HI; break;
6588 case ISD::SETULE: Swap = true; [[fallthrough]];
6589 case ISD::SETUGE: Opc = ARMCC::HS; break;
6590 }
6591
6592 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6593 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6594 SDValue AndOp;
6596 AndOp = Op0;
6597 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6598 AndOp = Op1;
6599
6600 // Ignore bitconvert.
6601 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6602 AndOp = AndOp.getOperand(0);
6603
6604 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6605 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6606 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6607 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6608 if (!Invert)
6609 Result = DAG.getNOT(dl, Result, VT);
6610 return Result;
6611 }
6612 }
6613 }
6614
6615 if (Swap)
6616 std::swap(Op0, Op1);
6617
6618 // If one of the operands is a constant vector zero, attempt to fold the
6619 // comparison to a specialized compare-against-zero form.
6621 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6622 Opc == ARMCC::NE)) {
6623 if (Opc == ARMCC::GE)
6624 Opc = ARMCC::LE;
6625 else if (Opc == ARMCC::GT)
6626 Opc = ARMCC::LT;
6627 std::swap(Op0, Op1);
6628 }
6629
6630 SDValue Result;
6632 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6633 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6634 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6635 DAG.getConstant(Opc, dl, MVT::i32));
6636 else
6637 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6638 DAG.getConstant(Opc, dl, MVT::i32));
6639
6640 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6641
6642 if (Invert)
6643 Result = DAG.getNOT(dl, Result, VT);
6644
6645 return Result;
6646}
6647
6649 SDValue LHS = Op.getOperand(0);
6650 SDValue RHS = Op.getOperand(1);
6651 SDValue Carry = Op.getOperand(2);
6652 SDValue Cond = Op.getOperand(3);
6653 SDLoc DL(Op);
6654
6655 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6656
6657 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6658 // have to invert the carry first.
6659 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6660 DAG.getConstant(1, DL, MVT::i32), Carry);
6661 // This converts the boolean value carry into the carry flag.
6662 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6663
6664 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6665 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6666
6667 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6668 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6669 SDValue ARMcc = DAG.getConstant(
6670 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6671 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6672 Cmp.getValue(1));
6673}
6674
/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON or MVE instruction with a "modified
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
///
/// \param SplatBits    the splatted value, right-justified in a uint64_t.
/// \param SplatUndef   mask of splat bits that are undef; such bits may be
///                     treated as either 0 or 1 when choosing an encoding.
/// \param SplatBitSize width of the splatted element in bits (8/16/32/64).
/// \param VT           [out] set to the vector type that matches the chosen
///                     encoding (128- or 64-bit flavour per \p VectorVT).
/// \param VectorVT     type of the vector the constant is being built for.
/// \param type         instruction class the immediate is for; some cmode
///                     encodings are only valid for plain VMOV.
/// \returns the encoded Op/Cmode/Imm as an i32 target constant, or an empty
///          SDValue if no modified-immediate encoding exists.
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 const SDLoc &dl, EVT &VT, EVT VectorVT,
                                 VMOVModImmType type) {
  unsigned OpCmode, Imm;
  bool is128Bits = VectorVT.is128BitVector();

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8. However, NEON modified
  // immediate instructions others than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK.  Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    // "Shifted ones" forms: undef bits may be assumed set (0xff) to match.
    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      break;
    }

    // cmode == 0b1101 is not supported for MVE VMVN
    if (type == MVEVMVNModImm)
      return SDValue();

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32. A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat. But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    // Each immediate bit selects one byte of the result; a byte whose bits
    // are all undef may be encoded either way, so treat it as 0xff.
    uint64_t BitMask = 0xff;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        // Byte is neither all-ones nor all-zeros: not encodable.
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }

    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isVMOVModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}
6812
6813SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6814 const ARMSubtarget *ST) const {
6815 EVT VT = Op.getValueType();
6816 bool IsDouble = (VT == MVT::f64);
6817 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6818 const APFloat &FPVal = CFP->getValueAPF();
6819
6820 // Prevent floating-point constants from using literal loads
6821 // when execute-only is enabled.
6822 if (ST->genExecuteOnly()) {
6823 // We shouldn't trigger this for v6m execute-only
6824 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
6825 "Unexpected architecture");
6826
6827 // If we can represent the constant as an immediate, don't lower it
6828 if (isFPImmLegal(FPVal, VT))
6829 return Op;
6830 // Otherwise, construct as integer, and move to float register
6831 APInt INTVal = FPVal.bitcastToAPInt();
6832 SDLoc DL(CFP);
6833 switch (VT.getSimpleVT().SimpleTy) {
6834 default:
6835 llvm_unreachable("Unknown floating point type!");
6836 break;
6837 case MVT::f64: {
6838 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6839 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6840 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6841 }
6842 case MVT::f32:
6843 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6844 DAG.getConstant(INTVal, DL, MVT::i32));
6845 }
6846 }
6847
6848 if (!ST->hasVFP3Base())
6849 return SDValue();
6850
6851 // Use the default (constant pool) lowering for double constants when we have
6852 // an SP-only FPU
6853 if (IsDouble && !Subtarget->hasFP64())
6854 return SDValue();
6855
6856 // Try splatting with a VMOV.f32...
6857 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6858
6859 if (ImmVal != -1) {
6860 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6861 // We have code in place to select a valid ConstantFP already, no need to
6862 // do any mangling.
6863 return Op;
6864 }
6865
6866 // It's a float and we are trying to use NEON operations where
6867 // possible. Lower it to a splat followed by an extract.
6868 SDLoc DL(Op);
6869 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6870 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6871 NewVal);
6872 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6873 DAG.getConstant(0, DL, MVT::i32));
6874 }
6875
6876 // The rest of our options are NEON only, make sure that's allowed before
6877 // proceeding..
6878 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6879 return SDValue();
6880
6881 EVT VMovVT;
6882 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6883
6884 // It wouldn't really be worth bothering for doubles except for one very
6885 // important value, which does happen to match: 0.0. So make sure we don't do
6886 // anything stupid.
6887 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6888 return SDValue();
6889
6890 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6891 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6892 VMovVT, VT, VMOVModImm);
6893 if (NewVal != SDValue()) {
6894 SDLoc DL(Op);
6895 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6896 NewVal);
6897 if (IsDouble)
6898 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6899
6900 // It's a float: cast and extract a vector element.
6901 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6902 VecConstant);
6903 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6904 DAG.getConstant(0, DL, MVT::i32));
6905 }
6906
6907 // Finally, try a VMVN.i32
6908 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6909 VT, VMVNModImm);
6910 if (NewVal != SDValue()) {
6911 SDLoc DL(Op);
6912 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6913
6914 if (IsDouble)
6915 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6916
6917 // It's a float: cast and extract a vector element.
6918 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6919 VecConstant);
6920 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6921 DAG.getConstant(0, DL, MVT::i32));
6922 }
6923
6924 return SDValue();
6925}
6926
6927// check if an VEXT instruction can handle the shuffle mask when the
6928// vector sources of the shuffle are the same.
6929static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
6930 unsigned NumElts = VT.getVectorNumElements();
6931
6932 // Assume that the first shuffle index is not UNDEF. Fail if it is.
6933 if (M[0] < 0)
6934 return false;
6935
6936 Imm = M[0];
6937
6938 // If this is a VEXT shuffle, the immediate value is the index of the first
6939 // element. The other shuffle indices must be the successive elements after
6940 // the first one.
6941 unsigned ExpectedElt = Imm;
6942 for (unsigned i = 1; i < NumElts; ++i) {
6943 // Increment the expected index. If it wraps around, just follow it
6944 // back to index zero and keep going.
6945 ++ExpectedElt;
6946 if (ExpectedElt == NumElts)
6947 ExpectedElt = 0;
6948
6949 if (M[i] < 0) continue; // ignore UNDEF indices
6950 if (ExpectedElt != static_cast<unsigned>(M[i]))
6951 return false;
6952 }
6953
6954 return true;
6955}
6956
6957static bool isVEXTMask(ArrayRef<int> M, EVT VT,
6958 bool &ReverseVEXT, unsigned &Imm) {
6959 unsigned NumElts = VT.getVectorNumElements();
6960 ReverseVEXT = false;
6961
6962 // Assume that the first shuffle index is not UNDEF. Fail if it is.
6963 if (M[0] < 0)
6964 return false;
6965
6966 Imm = M[0];
6967
6968 // If this is a VEXT shuffle, the immediate value is the index of the first
6969 // element. The other shuffle indices must be the successive elements after
6970 // the first one.
6971 unsigned ExpectedElt = Imm;
6972 for (unsigned i = 1; i < NumElts; ++i) {
6973 // Increment the expected index. If it wraps around, it may still be
6974 // a VEXT but the source vectors must be swapped.
6975 ExpectedElt += 1;
6976 if (ExpectedElt == NumElts * 2) {
6977 ExpectedElt = 0;
6978 ReverseVEXT = true;
6979 }
6980
6981 if (M[i] < 0) continue; // ignore UNDEF indices
6982 if (ExpectedElt != static_cast<unsigned>(M[i]))
6983 return false;
6984 }
6985
6986 // Adjust the index value if the source operands will be swapped.
6987 if (ReverseVEXT)
6988 Imm -= NumElts;
6989
6990 return true;
6991}
6992
6993static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
6994 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
6995 // range, then 0 is placed into the resulting vector. So pretty much any mask
6996 // of 8 elements can work here.
6997 return VT == MVT::v8i8 && M.size() == 8;
6998}
6999
7000static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7001 unsigned Index) {
7002 if (Mask.size() == Elements * 2)
7003 return Index / Elements;
7004 return Mask[Index] == 0 ? 0 : 1;
7005}
7006
7007// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7008// checking that pairs of elements in the shuffle mask represent the same index
7009// in each vector, incrementing the expected index by 2 at each step.
7010// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7011// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7012// v2={e,f,g,h}
7013// WhichResult gives the offset for each element in the mask based on which
7014// of the two results it belongs to.
7015//
7016// The transpose can be represented either as:
7017// result1 = shufflevector v1, v2, result1_shuffle_mask
7018// result2 = shufflevector v1, v2, result2_shuffle_mask
7019// where v1/v2 and the shuffle masks have the same number of elements
7020// (here WhichResult (see below) indicates which result is being checked)
7021//
7022// or as:
7023// results = shufflevector v1, v2, shuffle_mask
7024// where both results are returned in one vector and the shuffle mask has twice
7025// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7026// want to check the low half and high half of the shuffle mask as if it were
7027// the other case
7028static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7029 unsigned EltSz = VT.getScalarSizeInBits();
7030 if (EltSz == 64)
7031 return false;
7032
7033 unsigned NumElts = VT.getVectorNumElements();
7034 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7035 return false;
7036
7037 // If the mask is twice as long as the input vector then we need to check the
7038 // upper and lower parts of the mask with a matching value for WhichResult
7039 // FIXME: A mask with only even values will be rejected in case the first
7040 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7041 // M[0] is used to determine WhichResult
7042 for (unsigned i = 0; i < M.size(); i += NumElts) {
7043 WhichResult = SelectPairHalf(NumElts, M, i);
7044 for (unsigned j = 0; j < NumElts; j += 2) {
7045 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7046 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7047 return false;
7048 }
7049 }
7050
7051 if (M.size() == NumElts*2)
7052 WhichResult = 0;
7053
7054 return true;
7055}
7056
7057/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7058/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7059/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7060static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7061 unsigned EltSz = VT.getScalarSizeInBits();
7062 if (EltSz == 64)
7063 return false;
7064
7065 unsigned NumElts = VT.getVectorNumElements();
7066 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7067 return false;
7068
7069 for (unsigned i = 0; i < M.size(); i += NumElts) {
7070 WhichResult = SelectPairHalf(NumElts, M, i);
7071 for (unsigned j = 0; j < NumElts; j += 2) {
7072 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7073 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7074 return false;
7075 }
7076 }
7077
7078 if (M.size() == NumElts*2)
7079 WhichResult = 0;
7080
7081 return true;
7082}
7083
7084// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7085// that the mask elements are either all even and in steps of size 2 or all odd
7086// and in steps of size 2.
7087// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7088// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7089// v2={e,f,g,h}
7090// Requires similar checks to that of isVTRNMask with
7091// respect the how results are returned.
7092static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7093 unsigned EltSz = VT.getScalarSizeInBits();
7094 if (EltSz == 64)
7095 return false;
7096
7097 unsigned NumElts = VT.getVectorNumElements();
7098 if (M.size() != NumElts && M.size() != NumElts*2)
7099 return false;
7100
7101 for (unsigned i = 0; i < M.size(); i += NumElts) {
7102 WhichResult = SelectPairHalf(NumElts, M, i);
7103 for (unsigned j = 0; j < NumElts; ++j) {
7104 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7105 return false;
7106 }
7107 }
7108
7109 if (M.size() == NumElts*2)
7110 WhichResult = 0;
7111
7112 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7113 if (VT.is64BitVector() && EltSz == 32)
7114 return false;
7115
7116 return true;
7117}
7118
7119/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7120/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7121/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7122static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7123 unsigned EltSz = VT.getScalarSizeInBits();
7124 if (EltSz == 64)
7125 return false;
7126
7127 unsigned NumElts = VT.getVectorNumElements();
7128 if (M.size() != NumElts && M.size() != NumElts*2)
7129 return false;
7130
7131 unsigned Half = NumElts / 2;
7132 for (unsigned i = 0; i < M.size(); i += NumElts) {
7133 WhichResult = SelectPairHalf(NumElts, M, i);
7134 for (unsigned j = 0; j < NumElts; j += Half) {
7135 unsigned Idx = WhichResult;
7136 for (unsigned k = 0; k < Half; ++k) {
7137 int MIdx = M[i + j + k];
7138 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7139 return false;
7140 Idx += 2;
7141 }
7142 }
7143 }
7144
7145 if (M.size() == NumElts*2)
7146 WhichResult = 0;
7147
7148 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7149 if (VT.is64BitVector() && EltSz == 32)
7150 return false;
7151
7152 return true;
7153}
7154
7155// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7156// that pairs of elements of the shufflemask represent the same index in each
7157// vector incrementing sequentially through the vectors.
7158// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7159// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7160// v2={e,f,g,h}
7161// Requires similar checks to that of isVTRNMask with respect the how results
7162// are returned.
7163static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7164 unsigned EltSz = VT.getScalarSizeInBits();
7165 if (EltSz == 64)
7166 return false;
7167
7168 unsigned NumElts = VT.getVectorNumElements();
7169 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7170 return false;
7171
7172 for (unsigned i = 0; i < M.size(); i += NumElts) {
7173 WhichResult = SelectPairHalf(NumElts, M, i);
7174 unsigned Idx = WhichResult * NumElts / 2;
7175 for (unsigned j = 0; j < NumElts; j += 2) {
7176 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7177 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7178 return false;
7179 Idx += 1;
7180 }
7181 }
7182
7183 if (M.size() == NumElts*2)
7184 WhichResult = 0;
7185
7186 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7187 if (VT.is64BitVector() && EltSz == 32)
7188 return false;
7189
7190 return true;
7191}
7192
7193/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7194/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7195/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7196static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7197 unsigned EltSz = VT.getScalarSizeInBits();
7198 if (EltSz == 64)
7199 return false;
7200
7201 unsigned NumElts = VT.getVectorNumElements();
7202 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7203 return false;
7204
7205 for (unsigned i = 0; i < M.size(); i += NumElts) {
7206 WhichResult = SelectPairHalf(NumElts, M, i);
7207 unsigned Idx = WhichResult * NumElts / 2;
7208 for (unsigned j = 0; j < NumElts; j += 2) {
7209 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7210 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7211 return false;
7212 Idx += 1;
7213 }
7214 }
7215
7216 if (M.size() == NumElts*2)
7217 WhichResult = 0;
7218
7219 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7220 if (VT.is64BitVector() && EltSz == 32)
7221 return false;
7222
7223 return true;
7224}
7225
7226/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7227/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7228static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7229 unsigned &WhichResult,
7230 bool &isV_UNDEF) {
7231 isV_UNDEF = false;
7232 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7233 return ARMISD::VTRN;
7234 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7235 return ARMISD::VUZP;
7236 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7237 return ARMISD::VZIP;
7238
7239 isV_UNDEF = true;
7240 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7241 return ARMISD::VTRN;
7242 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7243 return ARMISD::VUZP;
7244 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7245 return ARMISD::VZIP;
7246
7247 return 0;
7248}
7249
7250/// \return true if this is a reverse operation on an vector.
7251static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7252 unsigned NumElts = VT.getVectorNumElements();
7253 // Make sure the mask has the right size.
7254 if (NumElts != M.size())
7255 return false;
7256
7257 // Look for <15, ..., 3, -1, 1, 0>.
7258 for (unsigned i = 0; i != NumElts; ++i)
7259 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7260 return false;
7261
7262 return true;
7263}
7264
7265static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7266 unsigned NumElts = VT.getVectorNumElements();
7267 // Make sure the mask has the right size.
7268 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7269 return false;
7270
7271 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7272 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7273 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7274 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7275 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7276 int Ofs = Top ? 1 : 0;
7277 int Upper = SingleSource ? 0 : NumElts;
7278 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7279 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7280 return false;
7281 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7282 return false;
7283 }
7284 return true;
7285}
7286
7287static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7288 unsigned NumElts = VT.getVectorNumElements();
7289 // Make sure the mask has the right size.
7290 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7291 return false;
7292
7293 // If Top
7294 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7295 // This inserts Input2 into Input1
7296 // else if not Top
7297 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7298 // This inserts Input1 into Input2
7299 unsigned Offset = Top ? 0 : 1;
7300 unsigned N = SingleSource ? 0 : NumElts;
7301 for (unsigned i = 0; i < NumElts; i += 2) {
7302 if (M[i] >= 0 && M[i] != (int)i)
7303 return false;
7304 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7305 return false;
7306 }
7307
7308 return true;
7309}
7310
7311static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7312 unsigned NumElts = ToVT.getVectorNumElements();
7313 if (NumElts != M.size())
7314 return false;
7315
7316 // Test if the Trunc can be convertable to a VMOVN with this shuffle. We are
7317 // looking for patterns of:
7318 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7319 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7320
7321 unsigned Off0 = rev ? NumElts / 2 : 0;
7322 unsigned Off1 = rev ? 0 : NumElts / 2;
7323 for (unsigned i = 0; i < NumElts; i += 2) {
7324 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7325 return false;
7326 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7327 return false;
7328 }
7329
7330 return true;
7331}
7332
7333// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7334// from a pair of inputs. For example:
7335// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7336// FP_ROUND(EXTRACT_ELT(Y, 0),
7337// FP_ROUND(EXTRACT_ELT(X, 1),
7338// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7340 const ARMSubtarget *ST) {
7341 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7342 if (!ST->hasMVEFloatOps())
7343 return SDValue();
7344
7345 SDLoc dl(BV);
7346 EVT VT = BV.getValueType();
7347 if (VT != MVT::v8f16)
7348 return SDValue();
7349
7350 // We are looking for a buildvector of fptrunc elements, where all the
7351 // elements are interleavingly extracted from two sources. Check the first two
7352 // items are valid enough and extract some info from them (they are checked
7353 // properly in the loop below).
7354 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7357 return SDValue();
7358 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7361 return SDValue();
7362 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7363 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7364 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7365 return SDValue();
7366
7367 // Check all the values in the BuildVector line up with our expectations.
7368 for (unsigned i = 1; i < 4; i++) {
7369 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7370 return Trunc.getOpcode() == ISD::FP_ROUND &&
7372 Trunc.getOperand(0).getOperand(0) == Op &&
7373 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7374 };
7375 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7376 return SDValue();
7377 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7378 return SDValue();
7379 }
7380
7381 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7382 DAG.getConstant(0, dl, MVT::i32));
7383 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7384 DAG.getConstant(1, dl, MVT::i32));
7385}
7386
7387// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7388// from a single input on alternating lanes. For example:
7389// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7390// FP_ROUND(EXTRACT_ELT(X, 2),
7391// FP_ROUND(EXTRACT_ELT(X, 4), ...)
7393 const ARMSubtarget *ST) {
7394 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7395 if (!ST->hasMVEFloatOps())
7396 return SDValue();
7397
7398 SDLoc dl(BV);
7399 EVT VT = BV.getValueType();
7400 if (VT != MVT::v4f32)
7401 return SDValue();
7402
7403 // We are looking for a buildvector of fptext elements, where all the
7404 // elements are alternating lanes from a single source. For example <0,2,4,6>
7405 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7406 // info from them (they are checked properly in the loop below).
7407 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7409 return SDValue();
7410 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7412 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7413 return SDValue();
7414
7415 // Check all the values in the BuildVector line up with our expectations.
7416 for (unsigned i = 1; i < 4; i++) {
7417 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7418 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7420 Trunc.getOperand(0).getOperand(0) == Op &&
7421 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7422 };
7423 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7424 return SDValue();
7425 }
7426
7427 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7428 DAG.getConstant(Offset, dl, MVT::i32));
7429}
7430
7431// If N is an integer constant that can be moved into a register in one
7432// instruction, return an SDValue of such a constant (will become a MOV
7433// instruction). Otherwise return null.
7435 const ARMSubtarget *ST, const SDLoc &dl) {
7436 uint64_t Val;
7437 if (!isa<ConstantSDNode>(N))
7438 return SDValue();
7439 Val = N->getAsZExtVal();
7440
7441 if (ST->isThumb1Only()) {
7442 if (Val <= 255 || ~Val <= 255)
7443 return DAG.getConstant(Val, dl, MVT::i32);
7444 } else {
7445 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7446 return DAG.getConstant(Val, dl, MVT::i32);
7447 }
7448 return SDValue();
7449}
7450
// Lower a BUILD_VECTOR of i1 elements (an MVE predicate vector) into a
// PREDICATE_CAST from an i32 bit pattern, inserting any non-constant lanes
// afterwards.
// NOTE(review): the first signature line (original line 7451, declaring Op and
// DAG) is missing from this rendering — confirm against upstream.
7452 const ARMSubtarget *ST) {
7453 SDLoc dl(Op);
7454 EVT VT = Op.getValueType();
7455
7456 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7457
// The 16-bit VPR predicate register devotes 16/NumElts bits to each lane;
// BoolMask is the all-ones pattern for one lane's bit group.
7458 unsigned NumElts = VT.getVectorNumElements();
7459 unsigned BoolMask;
7460 unsigned BitsPerBool;
7461 if (NumElts == 2) {
7462 BitsPerBool = 8;
7463 BoolMask = 0xff;
7464 } else if (NumElts == 4) {
7465 BitsPerBool = 4;
7466 BoolMask = 0xf;
7467 } else if (NumElts == 8) {
7468 BitsPerBool = 2;
7469 BoolMask = 0x3;
7470 } else if (NumElts == 16) {
7471 BitsPerBool = 1;
7472 BoolMask = 0x1;
7473 } else
7474 return SDValue();
7475
7476 // If this is a single value copied into all lanes (a splat), we can just sign
7477 // extend that single value
7478 SDValue FirstOp = Op.getOperand(0);
7479 if (!isa<ConstantSDNode>(FirstOp) &&
7480 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7481 return U.get().isUndef() || U.get() == FirstOp;
7482 })) {
// Sign-extending the i1 in an i32 yields 0 or -1, i.e. an all-zeros or
// all-ones predicate pattern, which PREDICATE_CAST reinterprets as VT.
7483 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7484 DAG.getValueType(MVT::i1));
7485 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7486 }
7487
7488 // First create base with bits set where known
7489 unsigned Bits32 = 0;
7490 for (unsigned i = 0; i < NumElts; ++i) {
7491 SDValue V = Op.getOperand(i);
// Skip lanes whose value is not known at compile time; they are inserted
// dynamically below. Undef lanes are treated as 0.
7492 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7493 continue;
7494 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7495 if (BitSet)
7496 Bits32 |= BoolMask << (i * BitsPerBool);
7497 }
7498
7499 // Add in unknown nodes
7500 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7501 DAG.getConstant(Bits32, dl, MVT::i32));
7502 for (unsigned i = 0; i < NumElts; ++i) {
7503 SDValue V = Op.getOperand(i);
7504 if (isa<ConstantSDNode>(V) || V.isUndef())
7505 continue;
7506 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7507 DAG.getConstant(i, dl, MVT::i32));
7508 }
7509
7510 return Base;
7511}
7512
// Recognize a BUILD_VECTOR of the form <Op0, Op0+N, Op0+2N, ...> (N one of
// 1/2/4/8) and lower it to an MVE VIDUP (vector increment-and-duplicate).
// NOTE(review): the first signature line (original line 7513, declaring Op and
// DAG) is missing from this rendering — confirm against upstream.
7514 const ARMSubtarget *ST) {
7515 if (!ST->hasMVEIntegerOps())
7516 return SDValue();
7517
7518 // We are looking for a buildvector where each element is Op[0] + i*N
7519 EVT VT = Op.getValueType();
7520 SDValue Op0 = Op.getOperand(0);
7521 unsigned NumElts = VT.getVectorNumElements();
7522
7523 // Get the increment value from operand 1
7524 SDValue Op1 = Op.getOperand(1);
// NOTE(review): part of this condition (original line 7526) is missing here —
// presumably it also checked that Op1's addend is a ConstantSDNode; confirm.
7525 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7527 return SDValue();
// VIDUP only supports immediate increments of 1, 2, 4 or 8.
7528 unsigned N = Op1.getConstantOperandVal(1);
7529 if (N != 1 && N != 2 && N != 4 && N != 8)
7530 return SDValue();
7531
7532 // Check that each other operand matches
7533 for (unsigned I = 2; I < NumElts; I++) {
7534 SDValue OpI = Op.getOperand(I);
// NOTE(review): original line 7536 (likely a ConstantSDNode check on the
// addend) is missing from this rendering.
7535 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7537 OpI.getConstantOperandVal(1) != I * N)
7538 return SDValue();
7539 }
7540
// VIDUP also produces the post-incremented scalar as a second result (i32).
7541 SDLoc DL(Op);
7542 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7543 DAG.getConstant(N, DL, MVT::i32));
7544}
7545
7546 // Returns true if the operation N can be treated as qr instruction variant at
7547 // operand Op.
// For commutative ops any operand position works; for subtraction-like ops the
// scalar ("qr") operand must be on the right-hand side.
7548 static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7549 switch (N->getOpcode()) {
7550 case ISD::ADD:
7551 case ISD::MUL:
7552 case ISD::SADDSAT:
7553 case ISD::UADDSAT:
7554 case ISD::AVGFLOORS:
7555 case ISD::AVGFLOORU:
7556 return true;
7557 case ISD::SUB:
7558 case ISD::SSUBSAT:
7559 case ISD::USUBSAT:
// Subtraction is not commutative: only the second operand may be the scalar.
7560 return N->getOperand(1).getNode() == Op;
// NOTE(review): the case label for this inner switch (original line 7561,
// presumably ISD::INTRINSIC_WO_CHAIN) is missing from this rendering.
7562 switch (N->getConstantOperandVal(0)) {
7563 case Intrinsic::arm_mve_add_predicated:
7564 case Intrinsic::arm_mve_mul_predicated:
7565 case Intrinsic::arm_mve_qadd_predicated:
7566 case Intrinsic::arm_mve_vhadd:
7567 case Intrinsic::arm_mve_hadd_predicated:
7568 case Intrinsic::arm_mve_vqdmulh:
7569 case Intrinsic::arm_mve_qdmulh_predicated:
7570 case Intrinsic::arm_mve_vqrdmulh:
7571 case Intrinsic::arm_mve_qrdmulh_predicated:
7572 case Intrinsic::arm_mve_vqdmull:
7573 case Intrinsic::arm_mve_vqdmull_predicated:
7574 return true;
7575 case Intrinsic::arm_mve_sub_predicated:
7576 case Intrinsic::arm_mve_qsub_predicated:
7577 case Intrinsic::arm_mve_vhsub:
7578 case Intrinsic::arm_mve_hsub_predicated:
// Intrinsic operand 0 is the intrinsic ID, so operand 2 is the second
// data operand — same RHS-only restriction as plain subtraction.
7579 return N->getOperand(2).getNode() == Op;
7580 default:
7581 return false;
7582 }
7583 default:
7584 return false;
7585 }
7586}
7587
7588 // If this is a case we can't handle, return null and let the default
7589 // expansion code take care of it.
// Strategy, in order: i1 predicate vectors; VIDUP; constant splats (VDUP /
// VMOV / VMVN immediates); dominant-value splat plus lane inserts; splitting
// 128-bit vectors; ARMISD::BUILD_VECTOR for >=32-bit elements; and finally a
// chain of INSERT_VECTOR_ELTs.
7590 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7591 const ARMSubtarget *ST) const {
7592 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7593 SDLoc dl(Op);
7594 EVT VT = Op.getValueType();
7595
7596 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7597 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7598
7599 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7600 return R;
7601
7602 APInt SplatBits, SplatUndef;
7603 unsigned SplatBitSize;
7604 bool HasAnyUndefs;
7605 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7606 if (SplatUndef.isAllOnes())
7607 return DAG.getUNDEF(VT);
7608
7609 // If all the users of this constant splat are qr instruction variants,
7610 // generate a vdup of the constant.
7611 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7612 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7613 all_of(BVN->users(),
7614 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7615 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7616 : SplatBitSize == 16 ? MVT::v8i16
7617 : MVT::v16i8;
7618 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7619 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7620 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7621 }
7622
7623 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7624 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7625 // Check if an immediate VMOV works.
7626 EVT VmovVT;
7627 SDValue Val =
7628 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7629 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7630
7631 if (Val.getNode()) {
7632 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7633 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7634 }
7635
7636 // Try an immediate VMVN.
7637 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7638 Val = isVMOVModifiedImm(
7639 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7640 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7641 if (Val.getNode()) {
7642 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7643 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7644 }
7645
7646 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7647 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7648 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7649 if (ImmVal != -1) {
7650 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7651 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7652 }
7653 }
7654
7655 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7656 // type.
7657 if (ST->hasMVEIntegerOps() &&
7658 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7659 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7660 : SplatBitSize == 16 ? MVT::v8i16
7661 : MVT::v16i8;
7662 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7663 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7664 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7665 }
7666 }
7667 }
7668
7669 // Scan through the operands to see if only one value is used.
7670 //
7671 // As an optimisation, even if more than one value is used it may be more
7672 // profitable to splat with one value then change some lanes.
7673 //
7674 // Heuristically we decide to do this if the vector has a "dominant" value,
7675 // defined as splatted to more than half of the lanes.
7676 unsigned NumElts = VT.getVectorNumElements();
7677 bool isOnlyLowElement = true;
7678 bool usesOnlyOneValue = true;
7679 bool hasDominantValue = false;
7680 bool isConstant = true;
7681
7682 // Map of the number of times a particular SDValue appears in the
7683 // element list.
7684 DenseMap<SDValue, unsigned> ValueCounts;
7685 SDValue Value;
7686 for (unsigned i = 0; i < NumElts; ++i) {
7687 SDValue V = Op.getOperand(i);
7688 if (V.isUndef())
7689 continue;
7690 if (i > 0)
7691 isOnlyLowElement = false;
// NOTE(review): original line 7692 (presumably the non-constant check
// guarding this assignment) is missing from this rendering — confirm.
7693 isConstant = false;
7694
7695 unsigned &Count = ValueCounts[V];
7696
7697 // Is this value dominant? (takes up more than half of the lanes)
7698 if (++Count > (NumElts / 2)) {
7699 hasDominantValue = true;
7700 Value = V;
7701 }
7702 }
7703 if (ValueCounts.size() != 1)
7704 usesOnlyOneValue = false;
7705 if (!Value.getNode() && !ValueCounts.empty())
7706 Value = ValueCounts.begin()->first;
7707
// All lanes undef: the whole vector is undef.
7708 if (ValueCounts.empty())
7709 return DAG.getUNDEF(VT);
7710
7711 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7712 // Keep going if we are hitting this case.
7713 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7714 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7715
7716 unsigned EltSize = VT.getScalarSizeInBits();
7717
7718 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7719 // i32 and try again.
7720 if (hasDominantValue && EltSize <= 32) {
7721 if (!isConstant) {
7722 SDValue N;
7723
7724 // If we are VDUPing a value that comes directly from a vector, that will
7725 // cause an unnecessary move to and from a GPR, where instead we could
7726 // just use VDUPLANE. We can only do this if the lane being extracted
7727 // is at a constant index, as the VDUP from lane instructions only have
7728 // constant-index forms.
7729 ConstantSDNode *constIndex;
7730 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7731 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7732 // We need to create a new undef vector to use for the VDUPLANE if the
7733 // size of the vector from which we get the value is different than the
7734 // size of the vector that we need to create. We will insert the element
7735 // such that the register coalescer will remove unnecessary copies.
7736 if (VT != Value->getOperand(0).getValueType()) {
// NOTE(review): the right-hand side of this modulo (original line 7738,
// presumably VT.getVectorNumElements()) is missing from this rendering.
7737 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7739 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7740 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7741 Value, DAG.getConstant(index, dl, MVT::i32)),
7742 DAG.getConstant(index, dl, MVT::i32));
7743 } else
7744 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7745 Value->getOperand(0), Value->getOperand(1));
7746 } else
7747 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7748
7749 if (!usesOnlyOneValue) {
7750 // The dominant value was splatted as 'N', but we now have to insert
7751 // all differing elements.
7752 for (unsigned I = 0; I < NumElts; ++I) {
7753 if (Op.getOperand(I) == Value)
7754 continue;
// NOTE(review): the declaration of Ops (original line 7755) is missing
// from this rendering.
7756 Ops.push_back(N);
7757 Ops.push_back(Op.getOperand(I));
7758 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7759 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7760 }
7761 }
7762 return N;
7763 }
// NOTE(review): original lines 7764-7765 (the start of the constant-FP-splat
// branch, including the Ops declaration) are missing from this rendering.
// This path bitcasts each FP element to an integer and retries the lowering.
7766 MVT FVT = VT.getVectorElementType().getSimpleVT();
7767 assert(FVT == MVT::f32 || FVT == MVT::f16);
7768 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7769 for (unsigned i = 0; i < NumElts; ++i)
7770 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7771 Op.getOperand(i)));
7772 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7773 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7774 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7775 if (Val.getNode())
7776 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7777 }
7778 if (usesOnlyOneValue) {
7779 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7780 if (isConstant && Val.getNode())
7781 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7782 }
7783 }
7784
7785 // If all elements are constants and the case above didn't get hit, fall back
7786 // to the default expansion, which will generate a load from the constant
7787 // pool.
7788 if (isConstant)
7789 return SDValue();
7790
7791 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7792 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7793 // length <= 2.
7794 if (NumElts >= 4)
7795 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7796 return shuffle;
7797
7798 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7799 // VCVT's
7800 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7801 return VCVT;
7802 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7803 return VCVT;
7804
7805 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7806 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7807 // into two 64-bit vectors; we might discover a better way to lower it.
7808 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7809 EVT ExtVT = VT.getVectorElementType();
7810 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7811 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
7812 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7813 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7814 SDValue Upper =
7815 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
7816 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7817 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7818 if (Lower && Upper)
7819 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7820 }
7821
7822 // Vectors with 32- or 64-bit elements can be built by directly assigning
7823 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7824 // will be legalized.
7825 if (EltSize >= 32) {
7826 // Do the expansion with floating-point types, since that is what the VFP
7827 // registers are defined to use, and since i64 is not legal.
7828 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7829 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
// NOTE(review): the declaration of Ops (original line 7830) is missing from
// this rendering.
7831 for (unsigned i = 0; i < NumElts; ++i)
7832 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7833 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7834 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7835 }
7836
7837 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7838 // know the default expansion would otherwise fall back on something even
7839 // worse. For a vector with one or two non-undef values, that's
7840 // scalar_to_vector for the elements followed by a shuffle (provided the
7841 // shuffle is valid for the target) and materialization element by element
7842 // on the stack followed by a load for everything else.
7843 if (!isConstant && !usesOnlyOneValue) {
7844 SDValue Vec = DAG.getUNDEF(VT);
7845 for (unsigned i = 0 ; i < NumElts; ++i) {
7846 SDValue V = Op.getOperand(i);
7847 if (V.isUndef())
7848 continue;
7849 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7850 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7851 }
7852 return Vec;
7853 }
7854
7855 return SDValue();
7856}
7857
7858 // Gather data to see if the operation can be modelled as a
7859 // shuffle in combination with VEXTs.
// Attempts to rebuild a BUILD_VECTOR whose lanes are all extractelts from at
// most two source vectors as a single legal VECTOR_SHUFFLE (possibly after
// padding, extracting or VEXT-ing the sources into compatible shapes).
7860 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7861 SelectionDAG &DAG) const {
7862 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7863 SDLoc dl(Op);
7864 EVT VT = Op.getValueType();
7865 unsigned NumElts = VT.getVectorNumElements();
7866
// Bookkeeping for one source vector feeding this BUILD_VECTOR.
7867 struct ShuffleSourceInfo {
7868 SDValue Vec;
7869 unsigned MinElt = std::numeric_limits<unsigned>::max();
7870 unsigned MaxElt = 0;
7871
7872 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7873 // be compatible with the shuffle we intend to construct. As a result
7874 // ShuffleVec will be some sliding window into the original Vec.
7875 SDValue ShuffleVec;
7876
7877 // Code should guarantee that element i in Vec starts at element "WindowBase
7878 // + i * WindowScale in ShuffleVec".
7879 int WindowBase = 0;
7880 int WindowScale = 1;
7881
7882 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7883
7884 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7885 };
7886
7887 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7888 // node.
// NOTE(review): the declaration of Sources (original line 7889, presumably a
// SmallVector<ShuffleSourceInfo, 2>) is missing from this rendering.
7890 for (unsigned i = 0; i < NumElts; ++i) {
7891 SDValue V = Op.getOperand(i);
7892 if (V.isUndef())
7893 continue;
7894 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7895 // A shuffle can only come from building a vector from various
7896 // elements of other vectors.
7897 return SDValue();
7898 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7899 // Furthermore, shuffles require a constant mask, whereas extractelts
7900 // accept variable indices.
7901 return SDValue();
7902 }
7903
7904 // Add this element source to the list if it's not already there.
7905 SDValue SourceVec = V.getOperand(0);
7906 auto Source = llvm::find(Sources, SourceVec);
7907 if (Source == Sources.end())
7908 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7909
7910 // Update the minimum and maximum lane number seen.
7911 unsigned EltNo = V.getConstantOperandVal(1);
7912 Source->MinElt = std::min(Source->MinElt, EltNo);
7913 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7914 }
7915
7916 // Currently only do something sane when at most two source vectors
7917 // are involved.
7918 if (Sources.size() > 2)
7919 return SDValue();
7920
7921 // Find out the smallest element size among result and two sources, and use
7922 // it as element size to build the shuffle_vector.
7923 EVT SmallestEltTy = VT.getVectorElementType();
7924 for (auto &Source : Sources) {
7925 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7926 if (SrcEltTy.bitsLT(SmallestEltTy))
7927 SmallestEltTy = SrcEltTy;
7928 }
7929 unsigned ResMultiplier =
7930 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
// From here on, NumElts counts lanes of the shuffle type, not of VT.
7931 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7932 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
7933
7934 // If the source vector is too wide or too narrow, we may nevertheless be able
7935 // to construct a compatible shuffle either by concatenating it with UNDEF or
7936 // extracting a suitable range of elements.
7937 for (auto &Src : Sources) {
7938 EVT SrcVT = Src.ShuffleVec.getValueType();
7939
7940 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
7941 uint64_t VTSize = VT.getFixedSizeInBits();
7942 if (SrcVTSize == VTSize)
7943 continue;
7944
7945 // This stage of the search produces a source with the same element type as
7946 // the original, but with a total width matching the BUILD_VECTOR output.
7947 EVT EltVT = SrcVT.getVectorElementType();
7948 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
7949 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
7950
7951 if (SrcVTSize < VTSize) {
7952 if (2 * SrcVTSize != VTSize)
7953 return SDValue();
7954 // We can pad out the smaller vector for free, so if it's part of a
7955 // shuffle...
7956 Src.ShuffleVec =
7957 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
7958 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
7959 continue;
7960 }
7961
7962 if (SrcVTSize != 2 * VTSize)
7963 return SDValue();
7964
7965 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
7966 // Span too large for a VEXT to cope
7967 return SDValue();
7968 }
7969
7970 if (Src.MinElt >= NumSrcElts) {
7971 // The extraction can just take the second half
7972 Src.ShuffleVec =
7973 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7974 DAG.getConstant(NumSrcElts, dl, MVT::i32));
7975 Src.WindowBase = -NumSrcElts;
7976 } else if (Src.MaxElt < NumSrcElts) {
7977 // The extraction can just take the first half
7978 Src.ShuffleVec =
7979 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7980 DAG.getConstant(0, dl, MVT::i32));
7981 } else {
7982 // An actual VEXT is needed
7983 SDValue VEXTSrc1 =
7984 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7985 DAG.getConstant(0, dl, MVT::i32));
7986 SDValue VEXTSrc2 =
7987 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7988 DAG.getConstant(NumSrcElts, dl, MVT::i32));
7989
7990 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
7991 VEXTSrc2,
7992 DAG.getConstant(Src.MinElt, dl, MVT::i32));
7993 Src.WindowBase = -Src.MinElt;
7994 }
7995 }
7996
7997 // Another possible incompatibility occurs from the vector element types. We
7998 // can fix this by bitcasting the source vectors to the same type we intend
7999 // for the shuffle.
8000 for (auto &Src : Sources) {
8001 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8002 if (SrcEltTy == SmallestEltTy)
8003 continue;
8004 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8005 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8006 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8007 Src.WindowBase *= Src.WindowScale;
8008 }
8009
8010 // Final check before we try to actually produce a shuffle.
8011 LLVM_DEBUG({
8012 for (auto Src : Sources)
8013 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8014 });
8015
8016 // The stars all align, our next step is to produce the mask for the shuffle.
8017 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8018 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8019 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8020 SDValue Entry = Op.getOperand(i);
8021 if (Entry.isUndef())
8022 continue;
8023
8024 auto Src = llvm::find(Sources, Entry.getOperand(0));
8025 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8026
8027 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8028 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8029 // segment.
8030 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8031 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8032 VT.getScalarSizeInBits());
8033 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8034
8035 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8036 // starting at the appropriate offset.
8037 int *LaneMask = &Mask[i * ResMultiplier];
8038
// The second source's lanes are numbered after the first source's in a
// two-input shuffle mask, hence the NumElts offset.
8039 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8040 ExtractBase += NumElts * (Src - Sources.begin());
8041 for (int j = 0; j < LanesDefined; ++j)
8042 LaneMask[j] = ExtractBase + j;
8043 }
8044
8045
8046 // We can't handle more than two sources. This should have already
8047 // been checked before this point.
8048 assert(Sources.size() <= 2 && "Too many sources!");
8049
8050 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8051 for (unsigned i = 0; i < Sources.size(); ++i)
8052 ShuffleOps[i] = Sources[i].ShuffleVec;
8053
8054 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8055 ShuffleOps[1], Mask, DAG);
8056 if (!Shuffle)
8057 return SDValue();
8058 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8059}
8060
// Opcodes stored in the perfect-shuffle table entries (bits [29:26]).
// NOTE(review): the enum's opening line (original line 8061) and the
// enumerators on original lines 8063-8070 (VREV, the four VDUPs and the three
// VEXTs, going by their uses below) are missing from this rendering — confirm
// against upstream.
8062 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8071 OP_VUZPL, // VUZP, left result
8072 OP_VUZPR, // VUZP, right result
8073 OP_VZIPL, // VZIP, left result
8074 OP_VZIPR, // VZIP, right result
8075 OP_VTRNL, // VTRN, left result
8076 OP_VTRNR // VTRN, right result
8077};
8078
8079static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8080 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8081 switch (OpNum) {
8082 case OP_COPY:
8083 case OP_VREV:
8084 case OP_VDUP0:
8085 case OP_VDUP1:
8086 case OP_VDUP2:
8087 case OP_VDUP3:
8088 return true;
8089 }
8090 return false;
8091}
8092
8093 /// isShuffleMaskLegal - Targets can use this to indicate that they only
8094 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8095 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8096 /// are assumed to be legal.
// NOTE(review): the signature line (original line 8097) is missing from this
// rendering — confirm against upstream.
8098 if (VT.getVectorNumElements() == 4 &&
8099 (VT.is128BitVector() || VT.is64BitVector())) {
// 4-element vectors can consult the perfect-shuffle table; index 8 encodes an
// "undef" lane (valid mask indices for 4 lanes are 0..7).
8100 unsigned PFIndexes[4];
8101 for (unsigned i = 0; i != 4; ++i) {
8102 if (M[i] < 0)
8103 PFIndexes[i] = 8;
8104 else
8105 PFIndexes[i] = M[i];
8106 }
8107
8108 // Compute the index in the perfect shuffle table.
8109 unsigned PFTableIndex =
8110 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8111 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
// The top two bits of an entry hold the cost (number of ops to synthesize).
8112 unsigned Cost = (PFEntry >> 30);
8113
8114 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8115 return true;
8116 }
8117
8118 bool ReverseVEXT, isV_UNDEF;
8119 unsigned Imm, WhichResult;
8120
8121 unsigned EltSize = VT.getScalarSizeInBits();
// NOTE(review): one disjunct of this condition (original line 8123) is
// missing from this rendering — confirm against upstream.
8122 if (EltSize >= 32 ||
8124 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8125 isVREVMask(M, VT, 64) ||
8126 isVREVMask(M, VT, 32) ||
8127 isVREVMask(M, VT, 16))
8128 return true;
8129 else if (Subtarget->hasNEON() &&
8130 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8131 isVTBLMask(M, VT) ||
8132 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8133 return true;
8134 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8135 isReverseMask(M, VT))
8136 return true;
8137 else if (Subtarget->hasMVEIntegerOps() &&
8138 (isVMOVNMask(M, VT, true, false) ||
8139 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8140 return true;
8141 else if (Subtarget->hasMVEIntegerOps() &&
8142 (isTruncMask(M, VT, false, false) ||
8143 isTruncMask(M, VT, false, true) ||
8144 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8145 return true;
8146 else
8147 return false;
8148}
8149
8150 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8151 /// the specified operations to build the shuffle.
// NOTE(review): the first signature line (original line 8152, declaring the
// PFEntry and LHS parameters) is missing from this rendering — confirm.
8153 SDValue RHS, SelectionDAG &DAG,
8154 const SDLoc &dl) {
// Table-entry layout: cost in bits [31:30], opcode in [29:26], then two
// 13-bit operand sub-entries.
8155 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8156 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8157 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8158
8159 if (OpNum == OP_COPY) {
// The identity encodings: <0,1,2,3> selects LHS, <4,5,6,7> selects RHS.
8160 if (LHSID == (1*9+2)*9+3) return LHS;
8161 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8162 return RHS;
8163 }
8164
// Recursively materialize both sub-shuffles, then combine them below.
8165 SDValue OpLHS, OpRHS;
8166 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8167 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8168 EVT VT = OpLHS.getValueType();
8169
8170 switch (OpNum) {
8171 default: llvm_unreachable("Unknown shuffle opcode!");
8172 case OP_VREV:
8173 // VREV divides the vector in half and swaps within the half.
8174 if (VT.getScalarSizeInBits() == 32)
8175 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8176 // vrev <4 x i16> -> VREV32
8177 if (VT.getScalarSizeInBits() == 16)
8178 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8179 // vrev <4 x i8> -> VREV16
8180 assert(VT.getScalarSizeInBits() == 8);
8181 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8182 case OP_VDUP0:
8183 case OP_VDUP1:
8184 case OP_VDUP2:
8185 case OP_VDUP3:
// The lane number is encoded as the distance from OP_VDUP0.
8186 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8187 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8188 case OP_VEXT1:
8189 case OP_VEXT2:
8190 case OP_VEXT3:
// Similarly, the extract amount is 1 + (OpNum - OP_VEXT1).
8191 return DAG.getNode(ARMISD::VEXT, dl, VT,
8192 OpLHS, OpRHS,
8193 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8194 case OP_VUZPL:
8195 case OP_VUZPR:
// VUZP/VZIP/VTRN produce two results; pick left or right via getValue.
8196 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8197 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8198 case OP_VZIPL:
8199 case OP_VZIPR:
8200 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8201 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8202 case OP_VTRNL:
8203 case OP_VTRNR:
8204 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8205 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8206 }
8207}
8208
// Lower a v8i8 shuffle via the NEON table-lookup instructions: VTBL1 when the
// second input is undef, VTBL2 otherwise, with the shuffle mask materialized
// as a v8i8 index vector.
// NOTE(review): the first signature line (original line 8209, declaring Op)
// is missing from this rendering — confirm against upstream.
8210 ArrayRef<int> ShuffleMask,
8211 SelectionDAG &DAG) {
8212 // Check to see if we can use the VTBL instruction.
8213 SDValue V1 = Op.getOperand(0);
8214 SDValue V2 = Op.getOperand(1);
8215 SDLoc DL(Op);
8216
// Signed constants so that undef lanes (-1) survive as out-of-range indices,
// which VTBL defines to produce zero.
8217 SmallVector<SDValue, 8> VTBLMask;
8218 for (int I : ShuffleMask)
8219 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8220
8221 if (V2.getNode()->isUndef())
8222 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8223 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8224
8225 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8226 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8227}
8228
// Lower a full element-reversal shuffle: VREV64 reverses within each double
// word, then a half-swapping shuffle puts the two double words in order.
// NOTE(review): the signature line (original line 8229) is missing from this
// rendering — confirm against upstream.
8230 SDLoc DL(Op);
8231 EVT VT = Op.getValueType();
8232
8233 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8234 "Expect an v8i16/v16i8 type");
8235 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8236 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8237 // extract the first 8 bytes into the top double word and the last 8 bytes
8238 // into the bottom double word, through a new vector shuffle that will be
8239 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8240 std::vector<int> NewMask;
8241 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8242 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8243 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8244 NewMask.push_back(i);
8245 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8246}
8247
// Map an MVE predicate vector type (vNi1) to the 128-bit integer/FP vector
// type with the same lane count.
// NOTE(review): the signature line (original line 8248) is missing from this
// rendering — confirm against upstream.
8249 switch (VT.getSimpleVT().SimpleTy) {
8250 case MVT::v2i1:
8251 return MVT::v2f64;
8252 case MVT::v4i1:
8253 return MVT::v4i32;
8254 case MVT::v8i1:
8255 return MVT::v8i16;
8256 case MVT::v16i1:
8257 return MVT::v16i8;
8258 default:
8259 llvm_unreachable("Unexpected vector predicate type");
8260 }
8261}
8262
// Expand an MVE predicate vector into a full-width integer vector of all-ones
// / all-zeros lanes, suitable for shuffling with ordinary vector shuffles.
// NOTE(review): the first signature line (original line 8263) is missing from
// this rendering — confirm against upstream.
8264 SelectionDAG &DAG) {
8265 // Converting from boolean predicates to integers involves creating a vector
8266 // of all ones or all zeroes and selecting the lanes based upon the real
8267 // predicate.
// NOTE(review): the declaration ("SDValue AllOnes =", original line 8268) is
// missing from this rendering.
8269 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8270 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8271
8272 SDValue AllZeroes =
8273 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8274 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8275
8276 // Get full vector type from predicate type
// NOTE(review): original line 8277 (presumably "EVT NewVT =
// getVectorTyFromPredicateVector(VT);", given the use of NewVT below) is
// missing from this rendering.
8278
8279 SDValue RecastV1;
8280 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8281 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8282 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8283 // since we know in hardware the sizes are really the same.
8284 if (VT != MVT::v16i1)
8285 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8286 else
8287 RecastV1 = Pred;
8288
8289 // Select either all ones or zeroes depending upon the real predicate bits.
8290 SDValue PredAsVector =
8291 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8292
8293 // Recast our new predicate-as-integer v16i8 vector into something
8294 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8295 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8296}
8297
// Lower a VECTOR_SHUFFLE of i1 (predicate) vectors. Reversal masks get a
// bit-level BITREVERSE trick; everything else is promoted to a byte vector,
// shuffled normally, and compared against zero to recover a predicate.
// NOTE(review): the first signature line (original line 8298, declaring Op
// and DAG) is missing from this rendering — confirm against upstream.
8299 const ARMSubtarget *ST) {
8300 EVT VT = Op.getValueType();
// NOTE(review): original line 8301 (presumably the ShuffleVectorSDNode cast
// defining SVN) is missing from this rendering.
8302 ArrayRef<int> ShuffleMask = SVN->getMask();
8303
8304 assert(ST->hasMVEIntegerOps() &&
8305 "No support for vector shuffle of boolean predicates");
8306
8307 SDValue V1 = Op.getOperand(0);
8308 SDValue V2 = Op.getOperand(1);
8309 SDLoc dl(Op);
8310 if (isReverseMask(ShuffleMask, VT)) {
// Reversing the predicate lanes is just reversing the 16 predicate bits:
// move them to a GPR, BITREVERSE, and shift the bits back down from the top.
8311 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8312 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8313 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8314 DAG.getConstant(16, dl, MVT::i32));
8315 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8316 }
8317
8318 // Until we can come up with optimised cases for every single vector
8319 // shuffle in existence we have chosen the least painful strategy. This is
8320 // to essentially promote the boolean predicate to a 8-bit integer, where
8321 // each predicate represents a byte. Then we fall back on a normal integer
8322 // vector shuffle and convert the result back into a predicate vector. In
8323 // many cases the generated code might be even better than scalar code
8324 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8325 // fields in a register into 8 other arbitrary 2-bit fields!
8326 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8327 EVT NewVT = PredAsVector1.getValueType();
8328 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8329 : PromoteMVEPredVector(dl, V2, VT, DAG);
8330 assert(PredAsVector2.getValueType() == NewVT &&
8331 "Expected identical vector type in expanded i1 shuffle!");
8332
8333 // Do the shuffle!
8334 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8335 PredAsVector2, ShuffleMask);
8336
8337 // Now return the result of comparing the shuffled vector with zero,
8338 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8339 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8340 if (VT == MVT::v2i1) {
8341 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8342 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8343 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8344 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8345 }
8346 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8347 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8348}
8349
// Lower a sub-32-bit-element shuffle by assembling the result from four
// 32-bit "parts", each either a whole-lane vmov from an input or an element
// extracted from a residual shuffle of the remaining lanes.
8351 ArrayRef<int> ShuffleMask,
8352 SelectionDAG &DAG) {
8353 // Attempt to lower the vector shuffle using as many whole register movs as
8354 // possible. This is useful for types smaller than 32bits, which would
8355 // often otherwise become a series of GPR movs.
8356 SDLoc dl(Op);
8357 EVT VT = Op.getValueType();
8358 if (VT.getScalarSizeInBits() >= 32)
8359 return SDValue();
8360
8361 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8362 "Unexpected vector type");
8363 int NumElts = VT.getVectorNumElements();
8364 int QuarterSize = NumElts / 4;
8365 // The four final parts of the vector, as i32's
8366 SDValue Parts[4];
8367
8368 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8369 // <u,u,u,u>), returning the vmov lane index
8370 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8371 // Detect which mov lane this would be from the first non-undef element.
8372 int MovIdx = -1;
8373 for (int i = 0; i < Length; i++) {
8374 if (ShuffleMask[Start + i] >= 0) {
8375 if (ShuffleMask[Start + i] % Length != i)
8376 return -1;
8377 MovIdx = ShuffleMask[Start + i] / Length;
8378 break;
8379 }
8380 }
8381 // If all items are undef, leave this for other combines
8382 if (MovIdx == -1)
8383 return -1;
8384 // Check the remaining values are the correct part of the same mov
8385 for (int i = 1; i < Length; i++) {
8386 if (ShuffleMask[Start + i] >= 0 &&
8387 (ShuffleMask[Start + i] / Length != MovIdx ||
8388 ShuffleMask[Start + i] % Length != i))
8389 return -1;
8390 }
8391 return MovIdx;
8392 };
8393
8394 for (int Part = 0; Part < 4; ++Part) {
8395 // Does this part look like a mov
8396 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8397 if (Elt != -1) {
8398 SDValue Input = Op->getOperand(0);
// Lane indices 4..7 refer to the second shuffle operand.
8399 if (Elt >= 4) {
8400 Input = Op->getOperand(1);
8401 Elt -= 4;
8402 }
8403 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8404 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8405 DAG.getConstant(Elt, dl, MVT::i32));
8406 }
8407 }
8408
8409 // Nothing interesting found, just return
8410 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8411 return SDValue();
8412
8413 // The other parts need to be built with the old shuffle vector, cast to a
8414 // v4i32 and extract_vector_elts
8415 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8416 SmallVector<int, 16> NewShuffleMask;
// Lanes already covered by whole-lane movs become undef in the residual
// shuffle mask, so only the leftover lanes are shuffled.
8417 for (int Part = 0; Part < 4; ++Part)
8418 for (int i = 0; i < QuarterSize; i++)
8419 NewShuffleMask.push_back(
8420 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8421 SDValue NewShuffle = DAG.getVectorShuffle(
8422 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8423 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8424
8425 for (int Part = 0; Part < 4; ++Part)
8426 if (!Parts[Part])
8427 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8428 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8429 }
8430 // Build a vector out of the various parts and bitcast it back to the original
8431 // type.
8432 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8433 return DAG.getBitcast(VT, NewVec);
8434}
8435
// Lower a shuffle whose mask is an identity except for one element as a
// single EXTRACT_VECTOR_ELT + INSERT_VECTOR_ELT pair.
8437 ArrayRef<int> ShuffleMask,
8438 SelectionDAG &DAG) {
8439 SDValue V1 = Op.getOperand(0);
8440 SDValue V2 = Op.getOperand(1);
8441 EVT VT = Op.getValueType();
8442 unsigned NumElts = VT.getVectorNumElements();
8443
8444 // A one-off identity mask is one that is mostly an identity mask from a
8445 // single source but contains a single element out-of-place, either from a
8446 // different vector or from another position in the same vector. As opposed to
8447 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8448 // pair directly.
8449 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8450 int &OffElement) {
8451 OffElement = -1;
8452 int NonUndef = 0;
8453 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8454 if (Mask[i] == -1)
8455 continue;
8456 NonUndef++;
8457 if (Mask[i] != i + BaseOffset) {
// Remember the first out-of-place element; a second one disqualifies.
8458 if (OffElement == -1)
8459 OffElement = i;
8460 else
8461 return false;
8462 }
8463 }
// Require enough defined elements for the extract/insert pair to be a win.
8464 return NonUndef > 2 && OffElement != -1;
8465 };
8466 int OffElement;
8467 SDValue VInput;
8468 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8469 VInput = V1;
8470 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8471 VInput = V2;
8472 else
8473 return SDValue();
8474
8475 SDLoc dl(Op);
// i8/i16 element moves go via a 32-bit GPR, so extract as i32.
8476 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8477 ? MVT::i32
8478 : VT.getScalarType();
8479 SDValue Elt = DAG.getNode(
8480 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8481 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8482 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8483 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8484 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8485}
8486
// Main VECTOR_SHUFFLE lowering for NEON/MVE. Tries, in order: MVE i1
// predicate shuffles; VDUP/VDUPLANE splats; VEXT and VREV masks; two-result
// in-place shuffle masks (via isNEONTwoResultShuffleMask), also through a
// CONCAT_VECTORS operand; MVE VMOVN / one-off-identity / truncating masks;
// the perfect-shuffle table for 4-element vectors; BUILD_VECTOR expansion
// for >=32-bit elements; and finally reverse/v8i8/whole-register-mov
// fallbacks before giving up.
8488 const ARMSubtarget *ST) {
8489 SDValue V1 = Op.getOperand(0);
8490 SDValue V2 = Op.getOperand(1);
8491 SDLoc dl(Op);
8492 EVT VT = Op.getValueType();
8494 unsigned EltSize = VT.getScalarSizeInBits();
8495
8496 if (ST->hasMVEIntegerOps() && EltSize == 1)
8497 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8498
8499 // Convert shuffles that are directly supported on NEON to target-specific
8500 // DAG nodes, instead of keeping them as shuffles and matching them again
8501 // during code selection. This is more efficient and avoids the possibility
8502 // of inconsistencies between legalization and selection.
8503 // FIXME: floating-point vectors should be canonicalized to integer vectors
8504 // of the same time so that they get CSEd properly.
8505 ArrayRef<int> ShuffleMask = SVN->getMask();
8506
8507 if (EltSize <= 32) {
8508 if (SVN->isSplat()) {
8509 int Lane = SVN->getSplatIndex();
8510 // If this is undef splat, generate it via "just" vdup, if possible.
8511 if (Lane == -1) Lane = 0;
8512
8513 // Test if V1 is a SCALAR_TO_VECTOR.
8514 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8515 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8516 }
8517 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8518 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8519 // reaches it).
8520 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8522 bool IsScalarToVector = true;
8523 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8524 if (!V1.getOperand(i).isUndef()) {
8525 IsScalarToVector = false;
8526 break;
8527 }
8528 if (IsScalarToVector)
8529 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8530 }
8531 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8532 DAG.getConstant(Lane, dl, MVT::i32));
8533 }
8534
8535 bool ReverseVEXT = false;
8536 unsigned Imm = 0;
8537 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8538 if (ReverseVEXT)
8539 std::swap(V1, V2);
8540 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8541 DAG.getConstant(Imm, dl, MVT::i32));
8542 }
8543
8544 if (isVREVMask(ShuffleMask, VT, 64))
8545 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8546 if (isVREVMask(ShuffleMask, VT, 32))
8547 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8548 if (isVREVMask(ShuffleMask, VT, 16))
8549 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8550
8551 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8552 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8553 DAG.getConstant(Imm, dl, MVT::i32));
8554 }
8555
8556 // Check for Neon shuffles that modify both input vectors in place.
8557 // If both results are used, i.e., if there are two shuffles with the same
8558 // source operands and with masks corresponding to both results of one of
8559 // these operations, DAG memoization will ensure that a single node is
8560 // used for both shuffles.
8561 unsigned WhichResult = 0;
8562 bool isV_UNDEF = false;
8563 if (ST->hasNEON()) {
8564 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8565 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8566 if (isV_UNDEF)
8567 V2 = V1;
8568 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8569 .getValue(WhichResult);
8570 }
8571 }
8572 if (ST->hasMVEIntegerOps()) {
8573 if (isVMOVNMask(ShuffleMask, VT, false, false))
8574 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8575 DAG.getConstant(0, dl, MVT::i32));
8576 if (isVMOVNMask(ShuffleMask, VT, true, false))
8577 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8578 DAG.getConstant(1, dl, MVT::i32));
8579 if (isVMOVNMask(ShuffleMask, VT, true, true))
8580 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8581 DAG.getConstant(1, dl, MVT::i32));
8582 }
8583
8584 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8585 // shuffles that produce a result larger than their operands with:
8586 // shuffle(concat(v1, undef), concat(v2, undef))
8587 // ->
8588 // shuffle(concat(v1, v2), undef)
8589 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8590 //
8591 // This is useful in the general case, but there are special cases where
8592 // native shuffles produce larger results: the two-result ops.
8593 //
8594 // Look through the concat when lowering them:
8595 // shuffle(concat(v1, v2), undef)
8596 // ->
8597 // concat(VZIP(v1, v2):0, :1)
8598 //
8599 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8600 SDValue SubV1 = V1->getOperand(0);
8601 SDValue SubV2 = V1->getOperand(1);
8602 EVT SubVT = SubV1.getValueType();
8603
8604 // We expect these to have been canonicalized to -1.
8605 assert(llvm::all_of(ShuffleMask, [&](int i) {
8606 return i < (int)VT.getVectorNumElements();
8607 }) && "Unexpected shuffle index into UNDEF operand!");
8608
8609 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8610 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8611 if (isV_UNDEF)
8612 SubV2 = SubV1;
8613 assert((WhichResult == 0) &&
8614 "In-place shuffle of concat can only have one result!");
8615 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8616 SubV1, SubV2);
8617 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8618 Res.getValue(1));
8619 }
8620 }
8621 }
8622
8623 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8624 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8625 return V;
8626
// A truncating mask selects every other (bottom or top) half-lane of the
// inputs; reinterpret the inputs as double-width lanes and use MVETRUNC.
8627 for (bool Top : {false, true}) {
8628 for (bool SingleSource : {false, true}) {
8629 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8630 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8631 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8632 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8633 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8634 SingleSource ? V1 : V2);
8635 if (Top) {
8636 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8637 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8638 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8639 }
8640 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8641 }
8642 }
8643 }
8644 }
8645
8646 // If the shuffle is not directly supported and it has 4 elements, use
8647 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8648 unsigned NumElts = VT.getVectorNumElements();
8649 if (NumElts == 4) {
8650 unsigned PFIndexes[4];
8651 for (unsigned i = 0; i != 4; ++i) {
8652 if (ShuffleMask[i] < 0)
8653 PFIndexes[i] = 8;
8654 else
8655 PFIndexes[i] = ShuffleMask[i];
8656 }
8657
8658 // Compute the index in the perfect shuffle table.
8659 unsigned PFTableIndex =
8660 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8661 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8662 unsigned Cost = (PFEntry >> 30);
8663
8664 if (Cost <= 4) {
8665 if (ST->hasNEON())
8666 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
// On MVE, only use the table entry if the entry and both of its operand
// sub-entries expand to shuffle ops that are legal for MVE.
8667 else if (isLegalMVEShuffleOp(PFEntry)) {
8668 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8669 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8670 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8671 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8672 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8673 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8674 }
8675 }
8676 }
8677
8678 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8679 if (EltSize >= 32) {
8680 // Do the expansion with floating-point types, since that is what the VFP
8681 // registers are defined to use, and since i64 is not legal.
8682 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8683 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8684 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8685 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8687 for (unsigned i = 0; i < NumElts; ++i) {
8688 if (ShuffleMask[i] < 0)
8689 Ops.push_back(DAG.getUNDEF(EltVT));
8690 else
8691 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8692 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8693 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8694 dl, MVT::i32)));
8695 }
8696 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8697 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8698 }
8699
8700 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8701 isReverseMask(ShuffleMask, VT))
8702 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8703
8704 if (ST->hasNEON() && VT == MVT::v8i8)
8705 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8706 return NewOp;
8707
8708 if (ST->hasMVEIntegerOps())
8709 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8710 return NewOp;
8711
8712 return SDValue();
8713}
8714
// Lower INSERT_VECTOR_ELT into an MVE predicate (i1) vector: move the
// predicate mask into a GPR, sign-extend the new bit in-register, and BFI it
// into the lane's bit positions before casting back to a predicate.
8716 const ARMSubtarget *ST) {
8717 EVT VecVT = Op.getOperand(0).getValueType();
8718 SDLoc dl(Op);
8719
8720 assert(ST->hasMVEIntegerOps() &&
8721 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8722
8723 SDValue Conv =
8724 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8725 unsigned Lane = Op.getConstantOperandVal(2);
8726 unsigned LaneWidth =
// Mask covers all predicate bits belonging to this lane.
8728 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8729 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8730 Op.getOperand(1), DAG.getValueType(MVT::i1));
8731 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8732 DAG.getConstant(~Mask, dl, MVT::i32));
8733 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8734}
8735
// Custom lowering for INSERT_VECTOR_ELT: handles MVE i1 predicate vectors,
// and reroutes f16-element inserts through the equivalent integer types so
// the scalar operand is not promoted to f32 by the type legalizer.
8736SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8737 SelectionDAG &DAG) const {
8738 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8739 SDValue Lane = Op.getOperand(2);
8740 if (!isa<ConstantSDNode>(Lane))
8741 return SDValue();
8742
8743 SDValue Elt = Op.getOperand(1);
8744 EVT EltVT = Elt.getValueType();
8745
8746 if (Subtarget->hasMVEIntegerOps() &&
8747 Op.getValueType().getScalarSizeInBits() == 1)
8748 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8749
8750 if (getTypeAction(*DAG.getContext(), EltVT) ==
8752 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8753 // but the type system will try to do that if we don't intervene.
8754 // Reinterpret any such vector-element insertion as one with the
8755 // corresponding integer types.
8756
8757 SDLoc dl(Op);
8758
8759 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8760 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8762
8763 SDValue VecIn = Op.getOperand(0);
8764 EVT VecVT = VecIn.getValueType();
8765 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8766 VecVT.getVectorNumElements());
8767
// Bitcast element and vector to integer, insert, then bitcast back.
8768 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8769 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8770 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8771 IVecIn, IElt, Lane);
8772 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8773 }
8774
8775 return Op;
8776}
8777
// Lower EXTRACT_VECTOR_ELT from an MVE predicate (i1) vector: move the
// predicate mask into a GPR via PREDICATE_CAST and shift the requested
// lane's bits down to bit 0. The caller only consumes the low bit(s).
8779 const ARMSubtarget *ST) {
8780 EVT VecVT = Op.getOperand(0).getValueType();
8781 SDLoc dl(Op);
8782
8783 assert(ST->hasMVEIntegerOps() &&
// Fixed copy-paste from the INSERT lowering: name the right function in
// the diagnostic so assertion failures point at this code.
8784 "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
8785
8786 SDValue Conv =
8787 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8788 unsigned Lane = Op.getConstantOperandVal(1);
8789 unsigned LaneWidth =
8791 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8792 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8793 return Shift;
8794}
8795
// Custom lowering for EXTRACT_VECTOR_ELT: handles MVE i1 predicate vectors,
// and uses VGETLANEu when extracting a sub-32-bit element into an i32.
8797 const ARMSubtarget *ST) {
8798 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8799 SDValue Lane = Op.getOperand(1);
8800 if (!isa<ConstantSDNode>(Lane))
8801 return SDValue();
8802
8803 SDValue Vec = Op.getOperand(0);
8804 EVT VT = Vec.getValueType();
8805
8806 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8807 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8808
8809 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8810 SDLoc dl(Op);
8811 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8812 }
8813
8814 return Op;
8815}
8816
// Lower CONCAT_VECTORS of MVE i1 predicate vectors by repeatedly
// concatenating pairs: each pair is promoted to integer vectors, combined
// (via MVETRUNC or per-element extract/insert), then compared against zero
// to re-form a predicate of twice the width.
8818 const ARMSubtarget *ST) {
8819 SDLoc dl(Op);
8820 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8821 "Unexpected custom CONCAT_VECTORS lowering");
8822 assert(isPowerOf2_32(Op.getNumOperands()) &&
8823 "Unexpected custom CONCAT_VECTORS lowering");
8824 assert(ST->hasMVEIntegerOps() &&
8825 "CONCAT_VECTORS lowering only supported for MVE");
8826
// Concatenate two equally-typed predicate vectors into one of double width.
8827 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8828 EVT Op1VT = V1.getValueType();
8829 EVT Op2VT = V2.getValueType();
8830 assert(Op1VT == Op2VT && "Operand types don't match!");
8831 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
8832 "Unexpected i1 concat operations!");
8833 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8834
8835 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8836 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8837
8838 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8839 // promoted to v8i16, etc.
8840 MVT ElType =
8842 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8843
8844 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8845 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
8846 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
8847 // ConcatVT.
8848 SDValue ConVec =
8849 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
8850 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8851 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8852 }
8853
8854 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8855 // to be the right size for the destination. For example, if Op1 is v4i1
8856 // then the promoted vector is v4i32. The result of concatenation gives a
8857 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8858 // needs truncating to i16 and inserting in the result.
8859 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8860 EVT NewVT = NewV.getValueType();
8861 EVT ConcatVT = ConVec.getValueType();
8862 unsigned ExtScale = 1;
// A v2f64 promoted vector is re-read as v4i32, taking every other i32.
8863 if (NewVT == MVT::v2f64) {
8864 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
8865 ExtScale = 2;
8866 }
8867 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8868 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8869 DAG.getIntPtrConstant(i * ExtScale, dl));
8870 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8871 DAG.getConstant(j, dl, MVT::i32));
8872 }
8873 return ConVec;
8874 };
8875 unsigned j = 0;
8876 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8877 ConVec = ExtractInto(NewV1, ConVec, j);
8878 ConVec = ExtractInto(NewV2, ConVec, j);
8879
8880 // Now return the result of comparing the subvector with zero, which will
8881 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8882 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8883 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8884 };
8885
8886 // Concat each pair of subvectors and pack into the lower half of the array.
8887 SmallVector<SDValue> ConcatOps(Op->ops());
8888 while (ConcatOps.size() > 1) {
8889 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8890 SDValue V1 = ConcatOps[I];
8891 SDValue V2 = ConcatOps[I + 1];
8892 ConcatOps[I / 2] = ConcatPair(V1, V2);
8893 }
8894 ConcatOps.resize(ConcatOps.size() / 2);
8895 }
8896 return ConcatOps[0];
8897}
8898
// Lower CONCAT_VECTORS: i1 predicate vectors take the MVE path; otherwise
// two 64-bit halves are inserted as f64 elements of a v2f64 and bitcast to
// the requested 128-bit result type.
8900 const ARMSubtarget *ST) {
8901 EVT VT = Op->getValueType(0);
8902 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8903 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8904
8905 // The only time a CONCAT_VECTORS operation can have legal types is when
8906 // two 64-bit vectors are concatenated to a 128-bit vector.
8907 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8908 "unexpected CONCAT_VECTORS");
8909 SDLoc dl(Op);
8910 SDValue Val = DAG.getUNDEF(MVT::v2f64);
8911 SDValue Op0 = Op.getOperand(0);
8912 SDValue Op1 = Op.getOperand(1);
// Undef halves are skipped; the corresponding lane stays undef.
8913 if (!Op0.isUndef())
8914 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8915 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8916 DAG.getIntPtrConstant(0, dl));
8917 if (!Op1.isUndef())
8918 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8919 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8920 DAG.getIntPtrConstant(1, dl));
8921 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8922}
8923
// Lower EXTRACT_SUBVECTOR of an MVE i1 predicate vector: promote the source
// predicate to an integer vector, copy the selected elements into a smaller
// integer vector, and compare against zero to re-form the predicate.
8925 const ARMSubtarget *ST) {
8926 SDValue V1 = Op.getOperand(0);
8927 SDValue V2 = Op.getOperand(1);
8928 SDLoc dl(Op);
8929 EVT VT = Op.getValueType();
8930 EVT Op1VT = V1.getValueType();
8931 unsigned NumElts = VT.getVectorNumElements();
// Operand 1 is the constant start index of the subvector.
8932 unsigned Index = V2->getAsZExtVal();
8933
8934 assert(VT.getScalarSizeInBits() == 1 &&
8935 "Unexpected custom EXTRACT_SUBVECTOR lowering");
8936 assert(ST->hasMVEIntegerOps() &&
8937 "EXTRACT_SUBVECTOR lowering only supported for MVE");
8938
8939 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8940
8941 // We now have Op1 promoted to a vector of integers, where v8i1 gets
8942 // promoted to v8i16, etc.
8943
8945
// v2i1 results are built as a v4i1 compare (each i1 duplicated into two
// i32 lanes) and then cast down, mirroring LowerVECTOR_SHUFFLE_i1.
8946 if (NumElts == 2) {
8947 EVT SubVT = MVT::v4i32;
8948 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8949 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
8950 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8951 DAG.getIntPtrConstant(i, dl));
8952 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8953 DAG.getConstant(j, dl, MVT::i32));
8954 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8955 DAG.getConstant(j + 1, dl, MVT::i32));
8956 }
8957 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
8958 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8959 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8960 }
8961
8962 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
8963 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8964 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
8965 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8966 DAG.getIntPtrConstant(i, dl));
8967 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8968 DAG.getConstant(j, dl, MVT::i32));
8969 }
8970
8971 // Now return the result of comparing the subvector with zero,
8972 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8973 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
8974 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8975}
8976
8977// Turn a truncate to a predicate (an i1 vector) into icmp(and(x, 1), 0).
8979 const ARMSubtarget *ST) {
8980 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
8981 EVT VT = N->getValueType(0);
8982 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
8983 "Expected a vector i1 type!");
8984 SDValue Op = N->getOperand(0);
8985 EVT FromVT = Op.getValueType();
8986 SDLoc DL(N);
8987
// Only the low bit of each lane matters for the i1 result, so mask it and
// compare the masked value against zero to produce the predicate.
8988 SDValue And =
8989 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
8990 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
8991 DAG.getCondCode(ISD::SETNE));
8992}
8993
// Custom MVE lowering for TRUNCATE: i1 results go through LowerTruncatei1;
// v8i32->v8i16 and v16i16->v16i8 truncates are split and turned into an
// ARMISD::MVETRUNC, which is later optimized or expanded via stack slots.
8995 const ARMSubtarget *Subtarget) {
8996 if (!Subtarget->hasMVEIntegerOps())
8997 return SDValue();
8998
8999 EVT ToVT = N->getValueType(0);
9000 if (ToVT.getScalarType() == MVT::i1)
9001 return LowerTruncatei1(N, DAG, Subtarget);
9002
9003 // MVE does not have a single instruction to perform the truncation of a v4i32
9004 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9005 // Most of the instructions in MVE follow the 'Beats' system, where moving
9006 // values from different lanes is usually something that the instructions
9007 // avoid.
9008 //
9009 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9010 // which take the top/bottom half of a larger lane and extend it (or do the
9011 // opposite, truncating into the top/bottom lane from a larger lane). Note
9012 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9013 // bottom 16bits from each vector lane. This works really well with T/B
9014 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9015 // to move order.
9016 //
9017 // But truncates and sext/zext are always going to be fairly common from llvm.
9018 // We have several options for how to deal with them:
9019 // - Wherever possible combine them into an instruction that makes them
9020 // "free". This includes loads/stores, which can perform the trunc as part
9021 // of the memory operation. Or certain shuffles that can be turned into
9022 // VMOVN/VMOVL.
9023 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9024 // trunc(mul(sext(a), sext(b))) may become
9025 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9026 // this case can use VMULL). This is performed in the
9027 // MVELaneInterleavingPass.
9028 // - Otherwise we have an option. By default we would expand the
9029 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9030 // registers. One for each vector lane in the vector. This can obviously be
9031 // very expensive.
9032 // - The other option is to use the fact that loads/store can extend/truncate
9033 // to turn a trunc into two truncating stack stores and a stack reload. This
9034 // becomes 3 back-to-back memory operations, but at least that is less than
9035 // all the insert/extracts.
9036 //
9037 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9038 // are either optimized where they can be, or eventually lowered into stack
9039 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9040 // too early, where other instructions would be better, and stops us from
9041 // having to reconstruct multiple buildvector shuffles into loads/stores.
9042 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9043 return SDValue();
9044 EVT FromVT = N->getOperand(0).getValueType();
9045 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9046 return SDValue();
9047
9048 SDValue Lo, Hi;
9049 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9050 SDLoc DL(N);
9051 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9052}
9053
// Custom MVE lowering for vector sign/zero extension: split the source into
// a two-result MVEEXT node (bottom/top halves), optionally extend i8->i32 in
// two steps via v8i16, and concatenate the halves into the full result.
9055 const ARMSubtarget *Subtarget) {
9056 if (!Subtarget->hasMVEIntegerOps())
9057 return SDValue();
9058
9059 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9060
9061 EVT ToVT = N->getValueType(0);
9062 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9063 return SDValue();
9064 SDValue Op = N->getOperand(0);
9065 EVT FromVT = Op.getValueType();
9066 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9067 return SDValue();
9068
9069 SDLoc DL(N);
9070 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
// i8 -> i32 is done in two stages; the MVEEXT first produces v8i16 halves.
9071 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9072 ExtVT = MVT::v8i16;
9073
9074 unsigned Opcode =
9076 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9077 SDValue Ext1 = Ext.getValue(1);
9078
// Second extension stage for i8 sources widening all the way to i32.
9079 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9080 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9081 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9082 }
9083
9084 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9085}
9086
9087/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9088/// element has been zero/sign-extended, depending on the isSigned parameter,
9089/// from an integer type half its size.
9091 bool isSigned) {
9092 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9093 EVT VT = N->getValueType(0);
9094 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9095 SDNode *BVN = N->getOperand(0).getNode();
9096 if (BVN->getValueType(0) != MVT::v4i32 ||
9097 BVN->getOpcode() != ISD::BUILD_VECTOR)
9098 return false;
// Which i32 of each i64 pair is the low half depends on endianness.
9099 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9100 unsigned HiElt = 1 - LoElt;
9105 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9106 return false;
9107 if (isSigned) {
// Each high i32 must be the sign-extension of its low i32.
9108 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9109 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9110 return true;
9111 } else {
9112 if (Hi0->isZero() && Hi1->isZero())
9113 return true;
9114 }
9115 return false;
9116 }
9117
9118 if (N->getOpcode() != ISD::BUILD_VECTOR)
9119 return false;
9120
// General case: every constant element must fit in half the element width.
9121 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9122 SDNode *Elt = N->getOperand(i).getNode();
9124 unsigned EltSize = VT.getScalarSizeInBits();
9125 unsigned HalfSize = EltSize / 2;
9126 if (isSigned) {
9127 if (!isIntN(HalfSize, C->getSExtValue()))
9128 return false;
9129 } else {
9130 if (!isUIntN(HalfSize, C->getZExtValue()))
9131 return false;
9132 }
9133 continue;
9134 }
9135 return false;
9136 }
9137
9138 return true;
9139}
9140
9141/// isSignExtended - Check if a node is a vector value that is sign-extended
9142/// or a constant BUILD_VECTOR with sign-extended elements.
9144 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9145 return true;
9146 if (isExtendedBUILD_VECTOR(N, DAG, true))
9147 return true;
9148 return false;
9149}
9150
9151/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9152/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9154 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9156 return true;
9157 if (isExtendedBUILD_VECTOR(N, DAG, false))
9158 return true;
9159 return false;
9160}
9161
9162static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9163 if (OrigVT.getSizeInBits() >= 64)
9164 return OrigVT;
9165
9166 assert(OrigVT.isSimple() && "Expecting a simple value type");
9167
9168 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9169 switch (OrigSimpleTy) {
9170 default: llvm_unreachable("Unexpected Vector Type");
9171 case MVT::v2i8:
9172 case MVT::v2i16:
9173 return MVT::v2i32;
9174 case MVT::v4i8:
9175 return MVT::v4i16;
9176 }
9177}
9178
9179/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9180/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9181/// We insert the required extension here to get the vector to fill a D register.
9183 const EVT &OrigTy,
9184 const EVT &ExtTy,
9185 unsigned ExtOpcode) {
9186 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9187 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9188 // 64-bits we need to insert a new extension so that it will be 64-bits.
9189 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9190 if (OrigTy.getSizeInBits() >= 64)
9191 return N;
9192
9193 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9194 EVT NewVT = getExtensionTo64Bits(OrigTy);
9195
9196 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9197}
9198
9199/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9200/// does not do any sign/zero extension. If the original vector is less
9201/// than 64 bits, an appropriate extension will be added after the load to
9202/// reach a total size of 64 bits. We have to add the extension separately
9203/// because ARM does not have a sign/zero extending load for vectors.
9205 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9206
9207 // The load already has the right type.
9208 if (ExtendedTy == LD->getMemoryVT())
9209 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9210 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9211 LD->getMemOperand()->getFlags());
9212
9213 // We need to create a zextload/sextload. We cannot just create a load
9214 // followed by a zext/zext node because LowerMUL is also run during normal
9215 // operation legalization where we can't create illegal types.
9216 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9217 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9218 LD->getMemoryVT(), LD->getAlign(),
9219 LD->getMemOperand()->getFlags());
9220}
9221
9222/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9223/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9224/// the unextended value. The unextended vector should be 64 bits so that it can
9225/// be used as an operand to a VMULL instruction. If the original vector size
9226/// before extension is less than 64 bits we add a an extension to resize
9227/// the vector to 64 bits.
9229 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9230 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9231 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9232 N->getOperand(0)->getValueType(0),
9233 N->getValueType(0),
9234 N->getOpcode());
9235
9236 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9237 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9238 "Expected extending load");
9239
9240 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9241 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9242 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9243 SDValue extLoad =
9244 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9245 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9246
9247 return newLoad;
9248 }
9249
9250 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9251 // have been legalized as a BITCAST from v4i32.
9252 if (N->getOpcode() == ISD::BITCAST) {
9253 SDNode *BVN = N->getOperand(0).getNode();
9255 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9256 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9257 return DAG.getBuildVector(
9258 MVT::v2i32, SDLoc(N),
9259 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9260 }
9261 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9262 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9263 EVT VT = N->getValueType(0);
9264 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9265 unsigned NumElts = VT.getVectorNumElements();
9266 MVT TruncVT = MVT::getIntegerVT(EltSize);
9268 SDLoc dl(N);
9269 for (unsigned i = 0; i != NumElts; ++i) {
9270 const APInt &CInt = N->getConstantOperandAPInt(i);
9271 // Element types smaller than 32 bits are not legal, so use i32 elements.
9272 // The values are implicitly truncated so sext vs. zext doesn't matter.
9273 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9274 }
9275 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9276}
9277
9278static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9279 unsigned Opcode = N->getOpcode();
9280 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9281 SDNode *N0 = N->getOperand(0).getNode();
9282 SDNode *N1 = N->getOperand(1).getNode();
9283 return N0->hasOneUse() && N1->hasOneUse() &&
9284 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9285 }
9286 return false;
9287}
9288
9289static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9290 unsigned Opcode = N->getOpcode();
9291 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9292 SDNode *N0 = N->getOperand(0).getNode();
9293 SDNode *N1 = N->getOperand(1).getNode();
9294 return N0->hasOneUse() && N1->hasOneUse() &&
9295 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9296 }
9297 return false;
9298}
9299
9301 // Multiplications are only custom-lowered for 128-bit vectors so that
9302 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9303 EVT VT = Op.getValueType();
9304 assert(VT.is128BitVector() && VT.isInteger() &&
9305 "unexpected type for custom-lowering ISD::MUL");
9306 SDNode *N0 = Op.getOperand(0).getNode();
9307 SDNode *N1 = Op.getOperand(1).getNode();
9308 unsigned NewOpc = 0;
9309 bool isMLA = false;
9310 bool isN0SExt = isSignExtended(N0, DAG);
9311 bool isN1SExt = isSignExtended(N1, DAG);
9312 if (isN0SExt && isN1SExt)
9313 NewOpc = ARMISD::VMULLs;
9314 else {
9315 bool isN0ZExt = isZeroExtended(N0, DAG);
9316 bool isN1ZExt = isZeroExtended(N1, DAG);
9317 if (isN0ZExt && isN1ZExt)
9318 NewOpc = ARMISD::VMULLu;
9319 else if (isN1SExt || isN1ZExt) {
9320 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9321 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9322 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9323 NewOpc = ARMISD::VMULLs;
9324 isMLA = true;
9325 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9326 NewOpc = ARMISD::VMULLu;
9327 isMLA = true;
9328 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9329 std::swap(N0, N1);
9330 NewOpc = ARMISD::VMULLu;
9331 isMLA = true;
9332 }
9333 }
9334
9335 if (!NewOpc) {
9336 if (VT == MVT::v2i64)
9337 // Fall through to expand this. It is not legal.
9338 return SDValue();
9339 else
9340 // Other vector multiplications are legal.
9341 return Op;
9342 }
9343 }
9344
9345 // Legalize to a VMULL instruction.
9346 SDLoc DL(Op);
9347 SDValue Op0;
9348 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9349 if (!isMLA) {
9350 Op0 = SkipExtensionForVMULL(N0, DAG);
9352 Op1.getValueType().is64BitVector() &&
9353 "unexpected types for extended operands to VMULL");
9354 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9355 }
9356
9357 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9358 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9359 // vmull q0, d4, d6
9360 // vmlal q0, d5, d6
9361 // is faster than
9362 // vaddl q0, d4, d5
9363 // vmovl q1, d6
9364 // vmul q0, q0, q1
9365 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9366 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9367 EVT Op1VT = Op1.getValueType();
9368 return DAG.getNode(N0->getOpcode(), DL, VT,
9369 DAG.getNode(NewOpc, DL, VT,
9370 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9371 DAG.getNode(NewOpc, DL, VT,
9372 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9373}
9374
9376 SelectionDAG &DAG) {
9377 // TODO: Should this propagate fast-math-flags?
9378
9379 // Convert to float
9380 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9381 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9382 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9383 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9384 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9385 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9386 // Get reciprocal estimate.
9387 // float4 recip = vrecpeq_f32(yf);
9388 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9389 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9390 Y);
9391 // Because char has a smaller range than uchar, we can actually get away
9392 // without any newton steps. This requires that we use a weird bias
9393 // of 0xb000, however (again, this has been exhaustively tested).
9394 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9395 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9396 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9397 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9398 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9399 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9400 // Convert back to short.
9401 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9402 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9403 return X;
9404}
9405
9407 SelectionDAG &DAG) {
9408 // TODO: Should this propagate fast-math-flags?
9409
9410 SDValue N2;
9411 // Convert to float.
9412 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9413 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9414 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9415 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9416 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9417 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9418
9419 // Use reciprocal estimate and one refinement step.
9420 // float4 recip = vrecpeq_f32(yf);
9421 // recip *= vrecpsq_f32(yf, recip);
9422 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9423 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9424 N1);
9425 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9426 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9427 N1, N2);
9428 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9429 // Because short has a smaller range than ushort, we can actually get away
9430 // with only a single newton step. This requires that we use a weird bias
9431 // of 89, however (again, this has been exhaustively tested).
9432 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9433 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9434 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9435 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9436 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9437 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9438 // Convert back to integer and return.
9439 // return vmovn_s32(vcvt_s32_f32(result));
9440 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9441 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9442 return N0;
9443}
9444
9446 const ARMSubtarget *ST) {
9447 EVT VT = Op.getValueType();
9448 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9449 "unexpected type for custom-lowering ISD::SDIV");
9450
9451 SDLoc dl(Op);
9452 SDValue N0 = Op.getOperand(0);
9453 SDValue N1 = Op.getOperand(1);
9454 SDValue N2, N3;
9455
9456 if (VT == MVT::v8i8) {
9457 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9458 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9459
9460 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9461 DAG.getIntPtrConstant(4, dl));
9462 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9463 DAG.getIntPtrConstant(4, dl));
9464 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9465 DAG.getIntPtrConstant(0, dl));
9466 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9467 DAG.getIntPtrConstant(0, dl));
9468
9469 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9470 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9471
9472 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9473 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9474
9475 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9476 return N0;
9477 }
9478 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9479}
9480
9482 const ARMSubtarget *ST) {
9483 // TODO: Should this propagate fast-math-flags?
9484 EVT VT = Op.getValueType();
9485 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9486 "unexpected type for custom-lowering ISD::UDIV");
9487
9488 SDLoc dl(Op);
9489 SDValue N0 = Op.getOperand(0);
9490 SDValue N1 = Op.getOperand(1);
9491 SDValue N2, N3;
9492
9493 if (VT == MVT::v8i8) {
9494 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9495 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9496
9497 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9498 DAG.getIntPtrConstant(4, dl));
9499 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9500 DAG.getIntPtrConstant(4, dl));
9501 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9502 DAG.getIntPtrConstant(0, dl));
9503 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9504 DAG.getIntPtrConstant(0, dl));
9505
9506 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9507 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9508
9509 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9510 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9511
9512 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9513 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9514 MVT::i32),
9515 N0);
9516 return N0;
9517 }
9518
9519 // v4i16 sdiv ... Convert to float.
9520 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9521 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9522 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9523 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9524 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9525 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9526
9527 // Use reciprocal estimate and two refinement steps.
9528 // float4 recip = vrecpeq_f32(yf);
9529 // recip *= vrecpsq_f32(yf, recip);
9530 // recip *= vrecpsq_f32(yf, recip);
9531 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9532 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9533 BN1);
9534 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9535 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9536 BN1, N2);
9537 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9538 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9539 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9540 BN1, N2);
9541 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9542 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9543 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9544 // and that it will never cause us to return an answer too large).
9545 // float4 result = as_float4(as_int4(xf*recip) + 2);
9546 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9547 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9548 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9549 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9550 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9551 // Convert back to integer and return.
9552 // return vmovn_u32(vcvt_s32_f32(result));
9553 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9554 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9555 return N0;
9556}
9557
9559 SDNode *N = Op.getNode();
9560 EVT VT = N->getValueType(0);
9561 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9562
9563 SDValue Carry = Op.getOperand(2);
9564
9565 SDLoc DL(Op);
9566
9567 SDValue Result;
9568 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9569 // This converts the boolean value carry into the carry flag.
9570 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9571
9572 // Do the addition proper using the carry flag we wanted.
9573 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9574 Op.getOperand(1), Carry);
9575
9576 // Now convert the carry flag into a boolean value.
9577 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9578 } else {
9579 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
9580 // have to invert the carry first.
9581 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9582 DAG.getConstant(1, DL, MVT::i32), Carry);
9583 // This converts the boolean value carry into the carry flag.
9584 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9585
9586 // Do the subtraction proper using the carry flag we wanted.
9587 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9588 Op.getOperand(1), Carry);
9589
9590 // Now convert the carry flag into a boolean value.
9591 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9592 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9593 // by ISD::USUBO_CARRY, so compute 1 - C.
9594 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9595 DAG.getConstant(1, DL, MVT::i32), Carry);
9596 }
9597
9598 // Return both values.
9599 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9600}
9601
9602SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9603 bool Signed,
9604 SDValue &Chain) const {
9605 EVT VT = Op.getValueType();
9606 assert((VT == MVT::i32 || VT == MVT::i64) &&
9607 "unexpected type for custom lowering DIV");
9608 SDLoc dl(Op);
9609
9610 const auto &DL = DAG.getDataLayout();
9611 RTLIB::Libcall LC;
9612 if (Signed)
9613 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9614 else
9615 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9616
9617 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
9618 SDValue ES = DAG.getExternalSymbol(LCImpl, getPointerTy(DL));
9619
9621
9622 for (auto AI : {1, 0}) {
9623 SDValue Operand = Op.getOperand(AI);
9624 Args.emplace_back(Operand,
9625 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9626 }
9627
9628 CallLoweringInfo CLI(DAG);
9629 CLI.setDebugLoc(dl).setChain(Chain).setCallee(
9631 VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args));
9632
9633 return LowerCallTo(CLI).first;
9634}
9635
9636// This is a code size optimisation: return the original SDIV node to
9637// DAGCombiner when we don't want to expand SDIV into a sequence of
9638// instructions, and an empty node otherwise which will cause the
9639// SDIV to be expanded in DAGCombine.
9640SDValue
9641ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9642 SelectionDAG &DAG,
9643 SmallVectorImpl<SDNode *> &Created) const {
9644 // TODO: Support SREM
9645 if (N->getOpcode() != ISD::SDIV)
9646 return SDValue();
9647
9648 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9649 const bool MinSize = ST.hasMinSize();
9650 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9651 : ST.hasDivideInARMMode();
9652
9653 // Don't touch vector types; rewriting this may lead to scalarizing
9654 // the int divs.
9655 if (N->getOperand(0).getValueType().isVector())
9656 return SDValue();
9657
9658 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9659 // hwdiv support for this to be really profitable.
9660 if (!(MinSize && HasDivide))
9661 return SDValue();
9662
9663 // ARM mode is a bit simpler than Thumb: we can handle large power
9664 // of 2 immediates with 1 mov instruction; no further checks required,
9665 // just return the sdiv node.
9666 if (!ST.isThumb())
9667 return SDValue(N, 0);
9668
9669 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9670 // and thus lose the code size benefits of a MOVS that requires only 2.
9671 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9672 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9673 if (Divisor.sgt(128))
9674 return SDValue();
9675
9676 return SDValue(N, 0);
9677}
9678
9679SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9680 bool Signed) const {
9681 assert(Op.getValueType() == MVT::i32 &&
9682 "unexpected type for custom lowering DIV");
9683 SDLoc dl(Op);
9684
9685 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9686 DAG.getEntryNode(), Op.getOperand(1));
9687
9688 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9689}
9690
9692 SDLoc DL(N);
9693 SDValue Op = N->getOperand(1);
9694 if (N->getValueType(0) == MVT::i32)
9695 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9696 SDValue Lo, Hi;
9697 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9698 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9699 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9700}
9701
9702void ARMTargetLowering::ExpandDIV_Windows(
9703 SDValue Op, SelectionDAG &DAG, bool Signed,
9705 const auto &DL = DAG.getDataLayout();
9706
9707 assert(Op.getValueType() == MVT::i64 &&
9708 "unexpected type for custom lowering DIV");
9709 SDLoc dl(Op);
9710
9711 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9712
9713 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9714
9715 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9716 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9717 DAG.getConstant(32, dl, getPointerTy(DL)));
9718 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9719
9720 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9721}
9722
9723std::pair<SDValue, SDValue>
9724ARMTargetLowering::LowerAEABIUnalignedLoad(SDValue Op,
9725 SelectionDAG &DAG) const {
9726 // If we have an unaligned load from a i32 or i64 that would normally be
9727 // split into separate ldrb's, we can use the __aeabi_uread4/__aeabi_uread8
9728 // functions instead.
9729 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9730 EVT MemVT = LD->getMemoryVT();
9731 if (MemVT != MVT::i32 && MemVT != MVT::i64)
9732 return std::make_pair(SDValue(), SDValue());
9733
9734 const auto &MF = DAG.getMachineFunction();
9735 unsigned AS = LD->getAddressSpace();
9736 Align Alignment = LD->getAlign();
9737 const DataLayout &DL = DAG.getDataLayout();
9738 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9739
9740 if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
9741 Alignment <= llvm::Align(2)) {
9742
9743 RTLIB::Libcall LC =
9744 (MemVT == MVT::i32) ? RTLIB::AEABI_UREAD4 : RTLIB::AEABI_UREAD8;
9745
9746 MakeLibCallOptions Opts;
9747 SDLoc dl(Op);
9748
9749 auto Pair = makeLibCall(DAG, LC, MemVT.getSimpleVT(), LD->getBasePtr(),
9750 Opts, dl, LD->getChain());
9751
9752 // If necessary, extend the node to 64bit
9753 if (LD->getExtensionType() != ISD::NON_EXTLOAD) {
9754 unsigned ExtType = LD->getExtensionType() == ISD::SEXTLOAD
9757 SDValue EN = DAG.getNode(ExtType, dl, LD->getValueType(0), Pair.first);
9758 Pair.first = EN;
9759 }
9760 return Pair;
9761 }
9762
9763 // Default expand to individual loads
9764 if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
9765 return expandUnalignedLoad(LD, DAG);
9766 return std::make_pair(SDValue(), SDValue());
9767}
9768
9769SDValue ARMTargetLowering::LowerAEABIUnalignedStore(SDValue Op,
9770 SelectionDAG &DAG) const {
9771 // If we have an unaligned store to a i32 or i64 that would normally be
9772 // split into separate ldrb's, we can use the __aeabi_uwrite4/__aeabi_uwrite8
9773 // functions instead.
9774 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9775 EVT MemVT = ST->getMemoryVT();
9776 if (MemVT != MVT::i32 && MemVT != MVT::i64)
9777 return SDValue();
9778
9779 const auto &MF = DAG.getMachineFunction();
9780 unsigned AS = ST->getAddressSpace();
9781 Align Alignment = ST->getAlign();
9782 const DataLayout &DL = DAG.getDataLayout();
9783 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9784
9785 if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
9786 Alignment <= llvm::Align(2)) {
9787
9788 SDLoc dl(Op);
9789
9790 // If necessary, trunc the value to 32bit
9791 SDValue StoreVal = ST->getOperand(1);
9792 if (ST->isTruncatingStore())
9793 StoreVal = DAG.getNode(ISD::TRUNCATE, dl, MemVT, ST->getOperand(1));
9794
9795 RTLIB::Libcall LC =
9796 (MemVT == MVT::i32) ? RTLIB::AEABI_UWRITE4 : RTLIB::AEABI_UWRITE8;
9797
9798 MakeLibCallOptions Opts;
9799 auto CallResult =
9800 makeLibCall(DAG, LC, MVT::isVoid, {StoreVal, ST->getBasePtr()}, Opts,
9801 dl, ST->getChain());
9802
9803 return CallResult.second;
9804 }
9805
9806 // Default expand to individual stores
9807 if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
9808 return expandUnalignedStore(ST, DAG);
9809 return SDValue();
9810}
9811
9813 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9814 EVT MemVT = LD->getMemoryVT();
9815 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9816 MemVT == MVT::v16i1) &&
9817 "Expected a predicate type!");
9818 assert(MemVT == Op.getValueType());
9819 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9820 "Expected a non-extending load");
9821 assert(LD->isUnindexed() && "Expected a unindexed load");
9822
9823 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
9824 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9825 // need to make sure that 8/4/2 bits are actually loaded into the correct
9826 // place, which means loading the value and then shuffling the values into
9827 // the bottom bits of the predicate.
9828 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
9829 // for BE).
9830 // Speaking of BE, apparently the rest of llvm will assume a reverse order to
9831 // a natural VMSR(load), so needs to be reversed.
9832
9833 SDLoc dl(Op);
9834 SDValue Load = DAG.getExtLoad(
9835 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
9837 LD->getMemOperand());
9838 SDValue Val = Load;
9839 if (DAG.getDataLayout().isBigEndian())
9840 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
9841 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
9842 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
9843 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
9844 if (MemVT != MVT::v16i1)
9845 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9846 DAG.getConstant(0, dl, MVT::i32));
9847 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9848}
9849
9850void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9851 SelectionDAG &DAG) const {
9852 LoadSDNode *LD = cast<LoadSDNode>(N);
9853 EVT MemVT = LD->getMemoryVT();
9854
9855 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9856 !Subtarget->isThumb1Only() && LD->isVolatile() &&
9857 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9858 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9859 SDLoc dl(N);
9861 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9862 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
9863 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
9864 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
9865 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
9866 Results.append({Pair, Result.getValue(2)});
9867 } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
9868 auto Pair = LowerAEABIUnalignedLoad(SDValue(N, 0), DAG);
9869 if (Pair.first) {
9870 Results.push_back(Pair.first);
9871 Results.push_back(Pair.second);
9872 }
9873 }
9874}
9875
9877 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9878 EVT MemVT = ST->getMemoryVT();
9879 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9880 MemVT == MVT::v16i1) &&
9881 "Expected a predicate type!");
9882 assert(MemVT == ST->getValue().getValueType());
9883 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
9884 assert(ST->isUnindexed() && "Expected a unindexed store");
9885
9886 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
9887 // top bits unset and a scalar store.
9888 SDLoc dl(Op);
9889 SDValue Build = ST->getValue();
9890 if (MemVT != MVT::v16i1) {
9892 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
9893 unsigned Elt = DAG.getDataLayout().isBigEndian()
9894 ? MemVT.getVectorNumElements() - I - 1
9895 : I;
9896 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
9897 DAG.getConstant(Elt, dl, MVT::i32)));
9898 }
9899 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
9900 Ops.push_back(DAG.getUNDEF(MVT::i32));
9901 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
9902 }
9903 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
9904 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
9905 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
9906 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
9907 DAG.getConstant(16, dl, MVT::i32));
9908 return DAG.getTruncStore(
9909 ST->getChain(), dl, GRP, ST->getBasePtr(),
9911 ST->getMemOperand());
9912}
9913
9914SDValue ARMTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG,
9915 const ARMSubtarget *Subtarget) const {
9916 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9917 EVT MemVT = ST->getMemoryVT();
9918
9919 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9920 !Subtarget->isThumb1Only() && ST->isVolatile() &&
9921 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9922 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
9923 SDNode *N = Op.getNode();
9924 SDLoc dl(N);
9925
9926 SDValue Lo = DAG.getNode(
9927 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9928 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
9929 MVT::i32));
9930 SDValue Hi = DAG.getNode(
9931 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9932 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
9933 MVT::i32));
9934
9935 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
9936 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
9937 MemVT, ST->getMemOperand());
9938 } else if (Subtarget->hasMVEIntegerOps() &&
9939 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9940 MemVT == MVT::v16i1))) {
9941 return LowerPredicateStore(Op, DAG);
9942 } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
9943 return LowerAEABIUnalignedStore(Op, DAG);
9944 }
9945 return SDValue();
9946}
9947
9948static bool isZeroVector(SDValue N) {
9949 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9950 (N->getOpcode() == ARMISD::VMOVIMM &&
9951 isNullConstant(N->getOperand(0))));
9952}
9953
9956 MVT VT = Op.getSimpleValueType();
9957 SDValue Mask = N->getMask();
9958 SDValue PassThru = N->getPassThru();
9959 SDLoc dl(Op);
9960
9961 if (isZeroVector(PassThru))
9962 return Op;
9963
9964 // MVE Masked loads use zero as the passthru value. Here we convert undef to
9965 // zero too, and other values are lowered to a select.
9966 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
9967 DAG.getTargetConstant(0, dl, MVT::i32));
9968 SDValue NewLoad = DAG.getMaskedLoad(
9969 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
9970 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
9971 N->getExtensionType(), N->isExpandingLoad());
9972 SDValue Combo = NewLoad;
9973 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
9974 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
9975 isZeroVector(PassThru->getOperand(0));
9976 if (!PassThru.isUndef() && !PassThruIsCastZero)
9977 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
9978 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
9979}
9980
9982 const ARMSubtarget *ST) {
9983 if (!ST->hasMVEIntegerOps())
9984 return SDValue();
9985
9986 SDLoc dl(Op);
9987 unsigned BaseOpcode = 0;
9988 switch (Op->getOpcode()) {
9989 default: llvm_unreachable("Expected VECREDUCE opcode");
9990 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
9991 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
9992 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
9993 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
9994 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
9995 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
9996 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
9997 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
9998 }
9999
10000 SDValue Op0 = Op->getOperand(0);
10001 EVT VT = Op0.getValueType();
10002 EVT EltVT = VT.getVectorElementType();
10003 unsigned NumElts = VT.getVectorNumElements();
10004 unsigned NumActiveLanes = NumElts;
10005
10006 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10007 NumActiveLanes == 2) &&
10008 "Only expected a power 2 vector size");
10009
10010 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
10011 // allows us to easily extract vector elements from the lanes.
10012 while (NumActiveLanes > 4) {
10013 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10014 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10015 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10016 NumActiveLanes /= 2;
10017 }
10018
10019 SDValue Res;
10020 if (NumActiveLanes == 4) {
10021 // The remaining 4 elements are summed sequentially
10022 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10023 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10024 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10025 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10026 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10027 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10028 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10029 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10030 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10031 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10032 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10033 } else {
10034 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10035 DAG.getConstant(0, dl, MVT::i32));
10036 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10037 DAG.getConstant(1, dl, MVT::i32));
10038 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10039 }
10040
10041 // Result type may be wider than element type.
10042 if (EltVT != Op->getValueType(0))
10043 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10044 return Res;
10045}
10046
10048 const ARMSubtarget *ST) {
10049 if (!ST->hasMVEFloatOps())
10050 return SDValue();
10051 return LowerVecReduce(Op, DAG, ST);
10052}
10053
10055 const ARMSubtarget *ST) {
10056 if (!ST->hasNEON())
10057 return SDValue();
10058
10059 SDLoc dl(Op);
10060 SDValue Op0 = Op->getOperand(0);
10061 EVT VT = Op0.getValueType();
10062 EVT EltVT = VT.getVectorElementType();
10063
10064 unsigned PairwiseIntrinsic = 0;
10065 switch (Op->getOpcode()) {
10066 default:
10067 llvm_unreachable("Expected VECREDUCE opcode");
10069 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10070 break;
10072 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10073 break;
10075 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10076 break;
10078 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10079 break;
10080 }
10081 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10082
10083 unsigned NumElts = VT.getVectorNumElements();
10084 unsigned NumActiveLanes = NumElts;
10085
10086 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10087 NumActiveLanes == 2) &&
10088 "Only expected a power 2 vector size");
10089
10090 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10091 if (VT.is128BitVector()) {
10092 SDValue Lo, Hi;
10093 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10094 VT = Lo.getValueType();
10095 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10096 NumActiveLanes /= 2;
10097 }
10098
10099 // Use pairwise reductions until one lane remains
10100 while (NumActiveLanes > 1) {
10101 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10102 NumActiveLanes /= 2;
10103 }
10104
10105 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10106 DAG.getConstant(0, dl, MVT::i32));
10107
10108 // Result type may be wider than element type.
10109 if (EltVT != Op.getValueType()) {
10110 unsigned Extend = 0;
10111 switch (Op->getOpcode()) {
10112 default:
10113 llvm_unreachable("Expected VECREDUCE opcode");
10116 Extend = ISD::ZERO_EXTEND;
10117 break;
10120 Extend = ISD::SIGN_EXTEND;
10121 break;
10122 }
10123 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10124 }
10125 return Res;
10126}
10127
10129 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10130 // Acquire/Release load/store is not legal for targets without a dmb or
10131 // equivalent available.
10132 return SDValue();
10133
10134 // Monotonic load/store is legal for all targets.
10135 return Op;
10136}
10137
10140 SelectionDAG &DAG,
10141 const ARMSubtarget *Subtarget) {
10142 SDLoc DL(N);
10143 // Under Power Management extensions, the cycle-count is:
10144 // mrc p15, #0, <Rt>, c9, c13, #0
10145 SDValue Ops[] = { N->getOperand(0), // Chain
10146 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10147 DAG.getTargetConstant(15, DL, MVT::i32),
10148 DAG.getTargetConstant(0, DL, MVT::i32),
10149 DAG.getTargetConstant(9, DL, MVT::i32),
10150 DAG.getTargetConstant(13, DL, MVT::i32),
10151 DAG.getTargetConstant(0, DL, MVT::i32)
10152 };
10153
10154 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10155 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10156 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10157 DAG.getConstant(0, DL, MVT::i32)));
10158 Results.push_back(Cycles32.getValue(1));
10159}
10160
10162 SDValue V1) {
10163 SDLoc dl(V0.getNode());
10164 SDValue RegClass =
10165 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10166 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10167 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10168 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10169 return SDValue(
10170 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10171}
10172
10174 SDLoc dl(V.getNode());
10175 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10176 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10177 if (isBigEndian)
10178 std::swap(VLo, VHi);
10179 return createGPRPairNode2xi32(DAG, VLo, VHi);
10180}
10181
10184 SelectionDAG &DAG) {
10185 assert(N->getValueType(0) == MVT::i64 &&
10186 "AtomicCmpSwap on types less than 64 should be legal");
10187 SDValue Ops[] = {
10188 createGPRPairNode2xi32(DAG, N->getOperand(1),
10189 DAG.getUNDEF(MVT::i32)), // pointer, temp
10190 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10191 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10192 N->getOperand(0), // chain in
10193 };
10194 SDNode *CmpSwap = DAG.getMachineNode(
10195 ARM::CMP_SWAP_64, SDLoc(N),
10196 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10197
10198 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10199 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10200
10201 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10202
10203 SDValue Lo =
10204 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10205 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10206 SDValue Hi =
10207 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10208 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10209 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10210 Results.push_back(SDValue(CmpSwap, 2));
10211}
10212
10213SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10214 SDLoc dl(Op);
10215 EVT VT = Op.getValueType();
10216 SDValue Chain = Op.getOperand(0);
10217 SDValue LHS = Op.getOperand(1);
10218 SDValue RHS = Op.getOperand(2);
10219 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10220 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10221
10222 // If we don't have instructions of this float type then soften to a libcall
10223 // and use SETCC instead.
10224 if (isUnsupportedFloatingType(LHS.getValueType())) {
10225 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10226 Chain, IsSignaling);
10227 if (!RHS.getNode()) {
10228 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10229 CC = ISD::SETNE;
10230 }
10231 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10232 DAG.getCondCode(CC));
10233 return DAG.getMergeValues({Result, Chain}, dl);
10234 }
10235
10236 ARMCC::CondCodes CondCode, CondCode2;
10237 FPCCToARMCC(CC, CondCode, CondCode2);
10238
10239 SDValue True = DAG.getConstant(1, dl, VT);
10240 SDValue False = DAG.getConstant(0, dl, VT);
10241 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10242 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10243 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10244 if (CondCode2 != ARMCC::AL) {
10245 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10246 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10247 }
10248 return DAG.getMergeValues({Result, Chain}, dl);
10249}
10250
10251SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10252 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10253
10254 EVT VT = getPointerTy(DAG.getDataLayout());
10255 int FI = MFI.CreateFixedObject(4, 0, false);
10256 return DAG.getFrameIndex(FI, VT);
10257}
10258
10259SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10260 SelectionDAG &DAG) const {
10261 SDLoc DL(Op);
10262 MakeLibCallOptions CallOptions;
10263 MVT SVT = Op.getOperand(0).getSimpleValueType();
10264 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10265 SDValue Res =
10266 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10267 return DAG.getBitcast(MVT::i32, Res);
10268}
10269
10270SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10271 SDLoc dl(Op);
10272 SDValue LHS = Op.getOperand(0);
10273 SDValue RHS = Op.getOperand(1);
10274
10275 // Determine if this is signed or unsigned comparison
10276 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10277
10278 // Special case for Thumb1 UCMP only
10279 if (!IsSigned && Subtarget->isThumb1Only()) {
10280 // For Thumb unsigned comparison, use this sequence:
10281 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10282 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10283 // cmp r1, r0 ; compare RHS with LHS
10284 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10285 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10286
10287 // First subtraction: LHS - RHS
10288 SDValue Sub1WithFlags = DAG.getNode(
10289 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10290 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10291 SDValue Flags1 = Sub1WithFlags.getValue(1);
10292
10293 // SUBE: Sub1Result - Sub1Result - !carry
10294 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10295 SDValue Sbc1 =
10296 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10297 Sub1Result, Sub1Result, Flags1);
10298 SDValue Sbc1Result = Sbc1.getValue(0);
10299
10300 // Second comparison: RHS vs LHS (reverse comparison)
10301 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10302
10303 // SUBE: RHS - RHS - !carry
10304 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10305 SDValue Sbc2 = DAG.getNode(
10306 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10307 SDValue Sbc2Result = Sbc2.getValue(0);
10308
10309 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10310 SDValue Result =
10311 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10312 if (Op.getValueType() != MVT::i32)
10313 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10314
10315 return Result;
10316 }
10317
10318 // For the ARM assembly pattern:
10319 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10320 // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for
10321 // unsigned) mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for
10322 // signed, LO for unsigned)
10323 // ; if LHS == RHS, result remains 0 from the subs
10324
10325 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10326 unsigned Opcode = ARMISD::SUBC;
10327
10328 // Check if RHS is a subtraction against 0: (0 - X)
10329 if (RHS.getOpcode() == ISD::SUB) {
10330 SDValue SubLHS = RHS.getOperand(0);
10331 SDValue SubRHS = RHS.getOperand(1);
10332
10333 // Check if it's 0 - X
10334 if (isNullConstant(SubLHS)) {
10335 bool CanUseAdd = false;
10336 if (IsSigned) {
10337 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10338 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10340 .isMinSignedValue()) {
10341 CanUseAdd = true;
10342 }
10343 } else {
10344 // For UCMP: only if X is known to never be zero
10345 if (DAG.isKnownNeverZero(SubRHS)) {
10346 CanUseAdd = true;
10347 }
10348 }
10349
10350 if (CanUseAdd) {
10351 Opcode = ARMISD::ADDC;
10352 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10353 // LHS - (0 - X)
10354 }
10355 }
10356 }
10357
10358 // Generate the operation with flags
10359 SDValue OpWithFlags =
10360 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10361
10362 SDValue OpResult = OpWithFlags.getValue(0);
10363 SDValue Flags = OpWithFlags.getValue(1);
10364
10365 // Constants for conditional moves
10366 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10367 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10368
10369 // Select condition codes based on signed vs unsigned
10370 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10371 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10372
10373 // First conditional move: if greater than, set to 1
10374 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10375 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10376 GTCondValue, Flags);
10377
10378 // Second conditional move: if less than, set to -1
10379 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10380 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10381 LTCondValue, Flags);
10382
10383 if (Op.getValueType() != MVT::i32)
10384 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10385
10386 return Result2;
10387}
10388
10390 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10391 switch (Op.getOpcode()) {
10392 default: llvm_unreachable("Don't know how to custom lower this!");
10393 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10394 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10395 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10396 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10397 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10398 case ISD::SELECT: return LowerSELECT(Op, DAG);
10399 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10400 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10401 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10402 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10403 case ISD::VASTART: return LowerVASTART(Op, DAG);
10404 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10405 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10406 case ISD::SINT_TO_FP:
10407 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10410 case ISD::FP_TO_SINT:
10411 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10413 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10414 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10415 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10416 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10417 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10418 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10419 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10420 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10421 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10422 Subtarget);
10423 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10424 case ISD::SHL:
10425 case ISD::SRL:
10426 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10427 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10428 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10429 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10430 case ISD::SRL_PARTS:
10431 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10432 case ISD::CTTZ:
10433 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10434 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10435 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10436 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10437 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10438 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10439 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10440 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10441 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10442 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10443 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10444 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10445 case ISD::SIGN_EXTEND:
10446 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10447 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10448 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10449 case ISD::SET_FPMODE:
10450 return LowerSET_FPMODE(Op, DAG);
10451 case ISD::RESET_FPMODE:
10452 return LowerRESET_FPMODE(Op, DAG);
10453 case ISD::MUL: return LowerMUL(Op, DAG);
10454 case ISD::SDIV:
10455 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10456 !Op.getValueType().isVector())
10457 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10458 return LowerSDIV(Op, DAG, Subtarget);
10459 case ISD::UDIV:
10460 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10461 !Op.getValueType().isVector())
10462 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10463 return LowerUDIV(Op, DAG, Subtarget);
10464 case ISD::UADDO_CARRY:
10465 case ISD::USUBO_CARRY:
10466 return LowerUADDSUBO_CARRY(Op, DAG);
10467 case ISD::UADDO:
10468 case ISD::USUBO:
10469 case ISD::UMULO:
10470 case ISD::SADDO:
10471 case ISD::SSUBO:
10472 case ISD::SMULO:
10473 return LowerALUO(Op, DAG);
10474 case ISD::SADDSAT:
10475 case ISD::SSUBSAT:
10476 case ISD::UADDSAT:
10477 case ISD::USUBSAT:
10478 return LowerADDSUBSAT(Op, DAG, Subtarget);
10479 case ISD::LOAD: {
10480 auto *LD = cast<LoadSDNode>(Op);
10481 EVT MemVT = LD->getMemoryVT();
10482 if (Subtarget->hasMVEIntegerOps() &&
10483 (MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10484 MemVT == MVT::v16i1))
10485 return LowerPredicateLoad(Op, DAG);
10486
10487 auto Pair = LowerAEABIUnalignedLoad(Op, DAG);
10488 if (Pair.first)
10489 return DAG.getMergeValues({Pair.first, Pair.second}, SDLoc(Pair.first));
10490 return SDValue();
10491 }
10492 case ISD::STORE:
10493 return LowerSTORE(Op, DAG, Subtarget);
10494 case ISD::MLOAD:
10495 return LowerMLOAD(Op, DAG);
10496 case ISD::VECREDUCE_MUL:
10497 case ISD::VECREDUCE_AND:
10498 case ISD::VECREDUCE_OR:
10499 case ISD::VECREDUCE_XOR:
10500 return LowerVecReduce(Op, DAG, Subtarget);
10505 return LowerVecReduceF(Op, DAG, Subtarget);
10510 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10511 case ISD::ATOMIC_LOAD:
10512 case ISD::ATOMIC_STORE:
10513 return LowerAtomicLoadStore(Op, DAG);
10514 case ISD::SDIVREM:
10515 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10517 if (getTargetMachine().getTargetTriple().isOSWindows())
10518 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10519 llvm_unreachable("Don't know how to custom lower this!");
10521 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10523 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10524 case ISD::STRICT_FSETCC:
10525 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10526 case ISD::SPONENTRY:
10527 return LowerSPONENTRY(Op, DAG);
10528 case ISD::FP_TO_BF16:
10529 return LowerFP_TO_BF16(Op, DAG);
10530 case ARMISD::WIN__DBZCHK: return SDValue();
10531 case ISD::UCMP:
10532 case ISD::SCMP:
10533 return LowerCMP(Op, DAG);
10534 case ISD::ABS:
10535 return LowerABS(Op, DAG);
10536 case ISD::STRICT_LROUND:
10538 case ISD::STRICT_LRINT:
10539 case ISD::STRICT_LLRINT: {
10540 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10541 Op.getOperand(1).getValueType() == MVT::bf16) &&
10542 "Expected custom lowering of rounding operations only for f16");
10543 SDLoc DL(Op);
10544 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10545 {Op.getOperand(0), Op.getOperand(1)});
10546 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10547 {Ext.getValue(1), Ext.getValue(0)});
10548 }
10549 }
10550}
10551
10553 SelectionDAG &DAG) {
10554 unsigned IntNo = N->getConstantOperandVal(0);
10555 unsigned Opc = 0;
10556 if (IntNo == Intrinsic::arm_smlald)
10557 Opc = ARMISD::SMLALD;
10558 else if (IntNo == Intrinsic::arm_smlaldx)
10559 Opc = ARMISD::SMLALDX;
10560 else if (IntNo == Intrinsic::arm_smlsld)
10561 Opc = ARMISD::SMLSLD;
10562 else if (IntNo == Intrinsic::arm_smlsldx)
10563 Opc = ARMISD::SMLSLDX;
10564 else
10565 return;
10566
10567 SDLoc dl(N);
10568 SDValue Lo, Hi;
10569 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10570
10571 SDValue LongMul = DAG.getNode(Opc, dl,
10572 DAG.getVTList(MVT::i32, MVT::i32),
10573 N->getOperand(1), N->getOperand(2),
10574 Lo, Hi);
10575 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10576 LongMul.getValue(0), LongMul.getValue(1)));
10577}
10578
10579/// ReplaceNodeResults - Replace the results of node with an illegal result
10580/// type with new values built out of custom code.
10583 SelectionDAG &DAG) const {
10584 SDValue Res;
10585 switch (N->getOpcode()) {
10586 default:
10587 llvm_unreachable("Don't know how to custom expand this!");
10588 case ISD::READ_REGISTER:
10590 break;
10591 case ISD::BITCAST:
10592 Res = ExpandBITCAST(N, DAG, Subtarget);
10593 break;
10594 case ISD::SRL:
10595 case ISD::SRA:
10596 case ISD::SHL:
10597 Res = Expand64BitShift(N, DAG, Subtarget);
10598 break;
10599 case ISD::SREM:
10600 case ISD::UREM:
10601 Res = LowerREM(N, DAG);
10602 break;
10603 case ISD::SDIVREM:
10604 case ISD::UDIVREM:
10605 Res = LowerDivRem(SDValue(N, 0), DAG);
10606 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10607 Results.push_back(Res.getValue(0));
10608 Results.push_back(Res.getValue(1));
10609 return;
10610 case ISD::SADDSAT:
10611 case ISD::SSUBSAT:
10612 case ISD::UADDSAT:
10613 case ISD::USUBSAT:
10614 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10615 break;
10617 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10618 return;
10619 case ISD::UDIV:
10620 case ISD::SDIV:
10621 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
10622 "can only expand DIV on Windows");
10623 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10624 Results);
10627 return;
10629 return ReplaceLongIntrinsic(N, Results, DAG);
10630 case ISD::LOAD:
10631 LowerLOAD(N, Results, DAG);
10632 break;
10633 case ISD::STORE:
10634 Res = LowerAEABIUnalignedStore(SDValue(N, 0), DAG);
10635 break;
10636 case ISD::TRUNCATE:
10637 Res = LowerTruncate(N, DAG, Subtarget);
10638 break;
10639 case ISD::SIGN_EXTEND:
10640 case ISD::ZERO_EXTEND:
10641 Res = LowerVectorExtend(N, DAG, Subtarget);
10642 break;
10645 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10646 break;
10647 }
10648 if (Res.getNode())
10649 Results.push_back(Res);
10650}
10651
10652//===----------------------------------------------------------------------===//
10653// ARM Scheduler Hooks
10654//===----------------------------------------------------------------------===//
10655
10656/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10657/// registers the function context.
10658void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10660 MachineBasicBlock *DispatchBB,
10661 int FI) const {
10662 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10663 "ROPI/RWPI not currently supported with SjLj");
10664 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10665 DebugLoc dl = MI.getDebugLoc();
10666 MachineFunction *MF = MBB->getParent();
10670 const Function &F = MF->getFunction();
10671
10672 bool isThumb = Subtarget->isThumb();
10673 bool isThumb2 = Subtarget->isThumb2();
10674
10675 unsigned PCLabelId = AFI->createPICLabelUId();
10676 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10678 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10679 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10680
10681 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10682 : &ARM::GPRRegClass;
10683
10684 // Grab constant pool and fixed stack memory operands.
10685 MachineMemOperand *CPMMO =
10688
10689 MachineMemOperand *FIMMOSt =
10692
10693 // Load the address of the dispatch MBB into the jump buffer.
10694 if (isThumb2) {
10695 // Incoming value: jbuf
10696 // ldr.n r5, LCPI1_1
10697 // orr r5, r5, #1
10698 // add r5, pc
10699 // str r5, [$jbuf, #+4] ; &jbuf[1]
10700 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10701 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10703 .addMemOperand(CPMMO)
10705 // Set the low bit because of thumb mode.
10706 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10707 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10708 .addReg(NewVReg1, RegState::Kill)
10709 .addImm(0x01)
10711 .add(condCodeOp());
10712 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10713 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10714 .addReg(NewVReg2, RegState::Kill)
10715 .addImm(PCLabelId);
10716 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10717 .addReg(NewVReg3, RegState::Kill)
10718 .addFrameIndex(FI)
10719 .addImm(36) // &jbuf[1] :: pc
10720 .addMemOperand(FIMMOSt)
10722 } else if (isThumb) {
10723 // Incoming value: jbuf
10724 // ldr.n r1, LCPI1_4
10725 // add r1, pc
10726 // mov r2, #1
10727 // orrs r1, r2
10728 // add r2, $jbuf, #+4 ; &jbuf[1]
10729 // str r1, [r2]
10730 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10731 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10733 .addMemOperand(CPMMO)
10735 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10736 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10737 .addReg(NewVReg1, RegState::Kill)
10738 .addImm(PCLabelId);
10739 // Set the low bit because of thumb mode.
10740 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10741 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10742 .addReg(ARM::CPSR, RegState::Define)
10743 .addImm(1)
10745 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10746 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10747 .addReg(ARM::CPSR, RegState::Define)
10748 .addReg(NewVReg2, RegState::Kill)
10749 .addReg(NewVReg3, RegState::Kill)
10751 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10752 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10753 .addFrameIndex(FI)
10754 .addImm(36); // &jbuf[1] :: pc
10755 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10756 .addReg(NewVReg4, RegState::Kill)
10757 .addReg(NewVReg5, RegState::Kill)
10758 .addImm(0)
10759 .addMemOperand(FIMMOSt)
10761 } else {
10762 // Incoming value: jbuf
10763 // ldr r1, LCPI1_1
10764 // add r1, pc, r1
10765 // str r1, [$jbuf, #+4] ; &jbuf[1]
10766 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10767 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10769 .addImm(0)
10770 .addMemOperand(CPMMO)
10772 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10773 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10774 .addReg(NewVReg1, RegState::Kill)
10775 .addImm(PCLabelId)
10777 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10778 .addReg(NewVReg2, RegState::Kill)
10779 .addFrameIndex(FI)
10780 .addImm(36) // &jbuf[1] :: pc
10781 .addMemOperand(FIMMOSt)
10783 }
10784}
10785
10786void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10787 MachineBasicBlock *MBB) const {
10788 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10789 DebugLoc dl = MI.getDebugLoc();
10790 MachineFunction *MF = MBB->getParent();
10791 MachineRegisterInfo *MRI = &MF->getRegInfo();
10792 MachineFrameInfo &MFI = MF->getFrameInfo();
10793 int FI = MFI.getFunctionContextIndex();
10794
10795 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10796 : &ARM::GPRnopcRegClass;
10797
10798 // Get a mapping of the call site numbers to all of the landing pads they're
10799 // associated with.
10800 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10801 unsigned MaxCSNum = 0;
10802 for (MachineBasicBlock &BB : *MF) {
10803 if (!BB.isEHPad())
10804 continue;
10805
10806 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10807 // pad.
10808 for (MachineInstr &II : BB) {
10809 if (!II.isEHLabel())
10810 continue;
10811
10812 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10813 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10814
10815 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10816 for (unsigned Idx : CallSiteIdxs) {
10817 CallSiteNumToLPad[Idx].push_back(&BB);
10818 MaxCSNum = std::max(MaxCSNum, Idx);
10819 }
10820 break;
10821 }
10822 }
10823
10824 // Get an ordered list of the machine basic blocks for the jump table.
10825 std::vector<MachineBasicBlock*> LPadList;
10826 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10827 LPadList.reserve(CallSiteNumToLPad.size());
10828 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10829 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10830 for (MachineBasicBlock *MBB : MBBList) {
10831 LPadList.push_back(MBB);
10832 InvokeBBs.insert_range(MBB->predecessors());
10833 }
10834 }
10835
10836 assert(!LPadList.empty() &&
10837 "No landing pad destinations for the dispatch jump table!");
10838
10839 // Create the jump table and associated information.
10840 MachineJumpTableInfo *JTI =
10841 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10842 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10843
10844 // Create the MBBs for the dispatch code.
10845
10846 // Shove the dispatch's address into the return slot in the function context.
10847 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10848 DispatchBB->setIsEHPad();
10849
10850 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10851
10852 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10853 DispatchBB->addSuccessor(TrapBB);
10854
10855 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10856 DispatchBB->addSuccessor(DispContBB);
10857
10858 // Insert and MBBs.
10859 MF->insert(MF->end(), DispatchBB);
10860 MF->insert(MF->end(), DispContBB);
10861 MF->insert(MF->end(), TrapBB);
10862
10863 // Insert code into the entry block that creates and registers the function
10864 // context.
10865 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10866
10867 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10870
10871 MachineInstrBuilder MIB;
10872 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10873
10874 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10875 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10876
10877 // Add a register mask with no preserved registers. This results in all
10878 // registers being marked as clobbered. This can't work if the dispatch block
10879 // is in a Thumb1 function and is linked with ARM code which uses the FP
10880 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10882
10883 bool IsPositionIndependent = isPositionIndependent();
10884 unsigned NumLPads = LPadList.size();
10885 if (Subtarget->isThumb2()) {
10886 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10887 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10888 .addFrameIndex(FI)
10889 .addImm(4)
10890 .addMemOperand(FIMMOLd)
10892
10893 if (NumLPads < 256) {
10894 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10895 .addReg(NewVReg1)
10896 .addImm(LPadList.size())
10898 } else {
10899 Register VReg1 = MRI->createVirtualRegister(TRC);
10900 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10901 .addImm(NumLPads & 0xFFFF)
10903
10904 unsigned VReg2 = VReg1;
10905 if ((NumLPads & 0xFFFF0000) != 0) {
10906 VReg2 = MRI->createVirtualRegister(TRC);
10907 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10908 .addReg(VReg1)
10909 .addImm(NumLPads >> 16)
10911 }
10912
10913 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10914 .addReg(NewVReg1)
10915 .addReg(VReg2)
10917 }
10918
10919 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10920 .addMBB(TrapBB)
10922 .addReg(ARM::CPSR);
10923
10924 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10925 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10926 .addJumpTableIndex(MJTI)
10928
10929 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10930 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10931 .addReg(NewVReg3, RegState::Kill)
10932 .addReg(NewVReg1)
10935 .add(condCodeOp());
10936
10937 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10938 .addReg(NewVReg4, RegState::Kill)
10939 .addReg(NewVReg1)
10940 .addJumpTableIndex(MJTI);
10941 } else if (Subtarget->isThumb()) {
10942 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10943 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10944 .addFrameIndex(FI)
10945 .addImm(1)
10946 .addMemOperand(FIMMOLd)
10948
10949 if (NumLPads < 256) {
10950 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
10951 .addReg(NewVReg1)
10952 .addImm(NumLPads)
10954 } else {
10955 MachineConstantPool *ConstantPool = MF->getConstantPool();
10956 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10957 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10958
10959 // MachineConstantPool wants an explicit alignment.
10960 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10961 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10962
10963 Register VReg1 = MRI->createVirtualRegister(TRC);
10964 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
10965 .addReg(VReg1, RegState::Define)
10968 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
10969 .addReg(NewVReg1)
10970 .addReg(VReg1)
10972 }
10973
10974 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
10975 .addMBB(TrapBB)
10977 .addReg(ARM::CPSR);
10978
10979 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10980 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
10981 .addReg(ARM::CPSR, RegState::Define)
10982 .addReg(NewVReg1)
10983 .addImm(2)
10985
10986 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10987 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
10988 .addJumpTableIndex(MJTI)
10990
10991 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10992 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
10993 .addReg(ARM::CPSR, RegState::Define)
10994 .addReg(NewVReg2, RegState::Kill)
10995 .addReg(NewVReg3)
10997
10998 MachineMemOperand *JTMMOLd =
10999 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11001
11002 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11003 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11004 .addReg(NewVReg4, RegState::Kill)
11005 .addImm(0)
11006 .addMemOperand(JTMMOLd)
11008
11009 unsigned NewVReg6 = NewVReg5;
11010 if (IsPositionIndependent) {
11011 NewVReg6 = MRI->createVirtualRegister(TRC);
11012 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11013 .addReg(ARM::CPSR, RegState::Define)
11014 .addReg(NewVReg5, RegState::Kill)
11015 .addReg(NewVReg3)
11017 }
11018
11019 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11020 .addReg(NewVReg6, RegState::Kill)
11021 .addJumpTableIndex(MJTI);
11022 } else {
11023 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11024 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11025 .addFrameIndex(FI)
11026 .addImm(4)
11027 .addMemOperand(FIMMOLd)
11029
11030 if (NumLPads < 256) {
11031 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11032 .addReg(NewVReg1)
11033 .addImm(NumLPads)
11035 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11036 Register VReg1 = MRI->createVirtualRegister(TRC);
11037 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11038 .addImm(NumLPads & 0xFFFF)
11040
11041 unsigned VReg2 = VReg1;
11042 if ((NumLPads & 0xFFFF0000) != 0) {
11043 VReg2 = MRI->createVirtualRegister(TRC);
11044 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11045 .addReg(VReg1)
11046 .addImm(NumLPads >> 16)
11048 }
11049
11050 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11051 .addReg(NewVReg1)
11052 .addReg(VReg2)
11054 } else {
11055 MachineConstantPool *ConstantPool = MF->getConstantPool();
11056 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11057 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11058
11059 // MachineConstantPool wants an explicit alignment.
11060 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11061 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11062
11063 Register VReg1 = MRI->createVirtualRegister(TRC);
11064 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11065 .addReg(VReg1, RegState::Define)
11067 .addImm(0)
11069 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11070 .addReg(NewVReg1)
11071 .addReg(VReg1, RegState::Kill)
11073 }
11074
11075 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11076 .addMBB(TrapBB)
11078 .addReg(ARM::CPSR);
11079
11080 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11081 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11082 .addReg(NewVReg1)
11085 .add(condCodeOp());
11086 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11087 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11088 .addJumpTableIndex(MJTI)
11090
11091 MachineMemOperand *JTMMOLd =
11092 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11094 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11095 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11096 .addReg(NewVReg3, RegState::Kill)
11097 .addReg(NewVReg4)
11098 .addImm(0)
11099 .addMemOperand(JTMMOLd)
11101
11102 if (IsPositionIndependent) {
11103 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11104 .addReg(NewVReg5, RegState::Kill)
11105 .addReg(NewVReg4)
11106 .addJumpTableIndex(MJTI);
11107 } else {
11108 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11109 .addReg(NewVReg5, RegState::Kill)
11110 .addJumpTableIndex(MJTI);
11111 }
11112 }
11113
11114 // Add the jump table entries as successors to the MBB.
11115 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11116 for (MachineBasicBlock *CurMBB : LPadList) {
11117 if (SeenMBBs.insert(CurMBB).second)
11118 DispContBB->addSuccessor(CurMBB);
11119 }
11120
11121 // N.B. the order the invoke BBs are processed in doesn't matter here.
11122 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11124 for (MachineBasicBlock *BB : InvokeBBs) {
11125
11126 // Remove the landing pad successor from the invoke block and replace it
11127 // with the new dispatch block.
11128 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11129 while (!Successors.empty()) {
11130 MachineBasicBlock *SMBB = Successors.pop_back_val();
11131 if (SMBB->isEHPad()) {
11132 BB->removeSuccessor(SMBB);
11133 MBBLPads.push_back(SMBB);
11134 }
11135 }
11136
11137 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11138 BB->normalizeSuccProbs();
11139
11140 // Find the invoke call and mark all of the callee-saved registers as
11141 // 'implicit defined' so that they're spilled. This prevents code from
11142 // moving instructions to before the EH block, where they will never be
11143 // executed.
11145 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11146 if (!II->isCall()) continue;
11147
11148 DenseSet<unsigned> DefRegs;
11150 OI = II->operands_begin(), OE = II->operands_end();
11151 OI != OE; ++OI) {
11152 if (!OI->isReg()) continue;
11153 DefRegs.insert(OI->getReg());
11154 }
11155
11156 MachineInstrBuilder MIB(*MF, &*II);
11157
11158 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11159 unsigned Reg = SavedRegs[i];
11160 if (Subtarget->isThumb2() &&
11161 !ARM::tGPRRegClass.contains(Reg) &&
11162 !ARM::hGPRRegClass.contains(Reg))
11163 continue;
11164 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11165 continue;
11166 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11167 continue;
11168 if (!DefRegs.contains(Reg))
11170 }
11171
11172 break;
11173 }
11174 }
11175
11176 // Mark all former landing pads as non-landing pads. The dispatch is the only
11177 // landing pad now.
11178 for (MachineBasicBlock *MBBLPad : MBBLPads)
11179 MBBLPad->setIsEHPad(false);
11180
11181 // The instruction is gone now.
11182 MI.eraseFromParent();
11183}
11184
// Return the lone successor of MBB that is not Succ. The caller guarantees
// that MBB ends with exactly two successors, hence the unreachable if no
// other successor is found.
static
  for (MachineBasicBlock *S : MBB->successors())
    if (S != Succ)
      return S;
  llvm_unreachable("Expecting a BB with two successors!");
}
11192
11193/// Return the load opcode for a given load size. If load size >= 8,
11194/// neon opcode will be returned.
11195static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11196 if (LdSize >= 8)
11197 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11198 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11199 if (IsThumb1)
11200 return LdSize == 4 ? ARM::tLDRi
11201 : LdSize == 2 ? ARM::tLDRHi
11202 : LdSize == 1 ? ARM::tLDRBi : 0;
11203 if (IsThumb2)
11204 return LdSize == 4 ? ARM::t2LDR_POST
11205 : LdSize == 2 ? ARM::t2LDRH_POST
11206 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11207 return LdSize == 4 ? ARM::LDR_POST_IMM
11208 : LdSize == 2 ? ARM::LDRH_POST
11209 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11210}
11211
11212/// Return the store opcode for a given store size. If store size >= 8,
11213/// neon opcode will be returned.
11214static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11215 if (StSize >= 8)
11216 return StSize == 16 ? ARM::VST1q32wb_fixed
11217 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11218 if (IsThumb1)
11219 return StSize == 4 ? ARM::tSTRi
11220 : StSize == 2 ? ARM::tSTRHi
11221 : StSize == 1 ? ARM::tSTRBi : 0;
11222 if (IsThumb2)
11223 return StSize == 4 ? ARM::t2STR_POST
11224 : StSize == 2 ? ARM::t2STRH_POST
11225 : StSize == 1 ? ARM::t2STRB_POST : 0;
11226 return StSize == 4 ? ARM::STR_POST_IMM
11227 : StSize == 2 ? ARM::STRH_POST
11228 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11229}
11230
/// Emit a post-increment load operation with given size. The instructions
/// will be added to BB at Pos.
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned LdSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  // Pick the width- and mode-specific opcode; getLdOpcode returns 0 only for
  // unsupported sizes, which the assert rules out.
  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
  assert(LdOpc != 0 && "Should have a load opcode");
  if (LdSize >= 8) {
    // NEON VLD1 with fixed writeback: AddrOut is defined as the updated base.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(0)
  } else if (IsThumb1) {
    // load + update AddrIn
    // Thumb1 loads have no post-increment form, so advance the address with a
    // separate tADDi8 after the load.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrIn)
        .addImm(0)
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(LdSize)
  } else if (IsThumb2) {
    // Thumb2 post-indexed load: the updated address is produced in AddrOut.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addImm(LdSize)
  } else { // arm
    // ARM-mode post-indexed load; the extra .addReg(0) fills the register
    // offset operand of the _POST instruction forms.
    BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
        .addReg(AddrOut, RegState::Define)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(LdSize)
  }
}
11271
/// Emit a post-increment store operation with given size. The instructions
/// will be added to BB at Pos.
                       const TargetInstrInfo *TII, const DebugLoc &dl,
                       unsigned StSize, unsigned Data, unsigned AddrIn,
                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
  // Pick the width- and mode-specific opcode; getStOpcode returns 0 only for
  // unsupported sizes, which the assert rules out.
  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
  assert(StOpc != 0 && "Should have a store opcode");
  if (StSize >= 8) {
    // NEON VST1 with fixed writeback: AddrOut is defined as the updated base.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(AddrIn)
        .addImm(0)
        .addReg(Data)
  } else if (IsThumb1) {
    // store + update AddrIn
    // Thumb1 stores have no post-increment form, so advance the address with
    // a separate tADDi8 after the store.
    BuildMI(*BB, Pos, dl, TII->get(StOpc))
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(0)
    BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
        .add(t1CondCodeOp())
        .addReg(AddrIn)
        .addImm(StSize)
  } else if (IsThumb2) {
    // Thumb2 post-indexed store: the updated address is produced in AddrOut.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addImm(StSize)
  } else { // arm
    // ARM-mode post-indexed store; the extra .addReg(0) fills the register
    // offset operand of the _POST instruction forms.
    BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
        .addReg(Data)
        .addReg(AddrIn)
        .addReg(0)
        .addImm(StSize)
  }
}
11313
/// Lower the struct-byval copy pseudo (dst, src, size, alignment): copy
/// `size` bytes from src to dest. Small copies are fully unrolled into
/// post-increment load/store pairs; larger ones are expanded into a counted
/// loop with a byte-wise epilogue for the remainder.
ARMTargetLowering::EmitStructByval(MachineInstr &MI,
                                   MachineBasicBlock *BB) const {
  // This pseudo instruction has 3 operands: dst, src, size
  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
  // Otherwise, we will generate unrolled scalar copies.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();

  Register dest = MI.getOperand(0).getReg();
  Register src = MI.getOperand(1).getReg();
  unsigned SizeVal = MI.getOperand(2).getImm();
  unsigned Alignment = MI.getOperand(3).getImm();
  DebugLoc dl = MI.getDebugLoc();

  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned UnitSize = 0;
  const TargetRegisterClass *TRC = nullptr;
  const TargetRegisterClass *VecTRC = nullptr;

  bool IsThumb1 = Subtarget->isThumb1Only();
  bool IsThumb2 = Subtarget->isThumb2();
  bool IsThumb = Subtarget->isThumb();

  // Choose the widest copy unit the alignment allows: bytes or halfwords for
  // under-aligned copies, 16/8-byte NEON units when available, otherwise
  // 4-byte words.
  if (Alignment & 1) {
    UnitSize = 1;
  } else if (Alignment & 2) {
    UnitSize = 2;
  } else {
    // Check whether we can use NEON instructions.
    if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
        Subtarget->hasNEON()) {
      if ((Alignment % 16 == 0) && SizeVal >= 16)
        UnitSize = 16;
      else if ((Alignment % 8 == 0) && SizeVal >= 8)
        UnitSize = 8;
    }
    // Can't use NEON instructions.
    if (UnitSize == 0)
      UnitSize = 4;
  }

  // Select the correct opcode and register class for unit size load/store
  bool IsNeon = UnitSize >= 8;
  TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
  if (IsNeon)
    VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
             : UnitSize == 8 ? &ARM::DPRRegClass
                             : nullptr;

  unsigned BytesLeft = SizeVal % UnitSize;
  unsigned LoopSize = SizeVal - BytesLeft;

  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
    // Use LDR and STR to copy.
    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
    // [destOut] = STR_POST(scratch, destIn, UnitSize)
    unsigned srcIn = src;
    unsigned destIn = dest;
    // Fully unrolled copy: each iteration threads the post-incremented
    // src/dest virtual registers into the next load/store pair.
    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
      Register srcOut = MRI.createVirtualRegister(TRC);
      Register destOut = MRI.createVirtualRegister(TRC);
      Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }

    // Handle the leftover bytes with LDRB and STRB.
    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
    // [destOut] = STRB_POST(scratch, destIn, 1)
    for (unsigned i = 0; i < BytesLeft; i++) {
      Register srcOut = MRI.createVirtualRegister(TRC);
      Register destOut = MRI.createVirtualRegister(TRC);
      Register scratch = MRI.createVirtualRegister(TRC);
      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
                 IsThumb1, IsThumb2);
      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
                 IsThumb1, IsThumb2);
      srcIn = srcOut;
      destIn = destOut;
    }
    MI.eraseFromParent(); // The instruction is gone now.
    return BB;
  }

  // Expand the pseudo op to a loop.
  // thisMBB:
  //   ...
  //   movw varEnd, # --> with thumb2
  //   movt varEnd, #
  //   ldrcp varEnd, idx --> without thumb2
  //   fallthrough --> loopMBB
  // loopMBB:
  //   PHI varPhi, varEnd, varLoop
  //   PHI srcPhi, src, srcLoop
  //   PHI destPhi, dst, destLoop
  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  //   subs varLoop, varPhi, #UnitSize
  //   bne loopMBB
  //   fallthrough --> exitMBB
  // exitMBB:
  //   epilogue to handle left-over bytes
  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, loopMBB);
  MF->insert(It, exitMBB);

  // Set the call frame size on entry to the new basic blocks.
  unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
  loopMBB->setCallFrameSize(CallFrameSize);
  exitMBB->setCallFrameSize(CallFrameSize);

  // Transfer the remainder of BB and its successor edges to exitMBB.
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());

  // Load an immediate to varEnd.
  Register varEnd = MRI.createVirtualRegister(TRC);
  if (Subtarget->useMovt()) {
    BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
            varEnd)
        .addImm(LoopSize);
  } else if (Subtarget->genExecuteOnly()) {
    assert(IsThumb && "Non-thumb expected to have used movt");
    BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
  } else {
    // No movw/movt and not execute-only: materialize LoopSize from the
    // constant pool instead.
    MachineConstantPool *ConstantPool = MF->getConstantPool();
    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);

    // MachineConstantPool wants an explicit alignment.
    Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
    MachineMemOperand *CPMMO =

    if (IsThumb)
      BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
          .addReg(varEnd, RegState::Define)
          .addMemOperand(CPMMO);
    else
      BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
          .addReg(varEnd, RegState::Define)
          .addImm(0)
          .addMemOperand(CPMMO);
  }
  BB->addSuccessor(loopMBB);

  // Generate the loop body:
  //   varPhi = PHI(varLoop, varEnd)
  //   srcPhi = PHI(srcLoop, src)
  //   destPhi = PHI(destLoop, dst)
  MachineBasicBlock *entryBB = BB;
  BB = loopMBB;
  Register varLoop = MRI.createVirtualRegister(TRC);
  Register varPhi = MRI.createVirtualRegister(TRC);
  Register srcLoop = MRI.createVirtualRegister(TRC);
  Register srcPhi = MRI.createVirtualRegister(TRC);
  Register destLoop = MRI.createVirtualRegister(TRC);
  Register destPhi = MRI.createVirtualRegister(TRC);

  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
      .addReg(varLoop).addMBB(loopMBB)
      .addReg(varEnd).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
      .addReg(srcLoop).addMBB(loopMBB)
      .addReg(src).addMBB(entryBB);
  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
      .addReg(destLoop).addMBB(loopMBB)
      .addReg(dest).addMBB(entryBB);

  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
  Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
             IsThumb1, IsThumb2);
  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
             IsThumb1, IsThumb2);

  // Decrement loop variable by UnitSize.
  if (IsThumb1) {
    BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
        .add(t1CondCodeOp())
        .addReg(varPhi)
        .addImm(UnitSize)
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, BB->end(), dl,
                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
    MIB.addReg(varPhi)
       .addImm(UnitSize)
       .add(condCodeOp());
    // Rewrite the optional cc-out operand to define CPSR, turning the SUB
    // into a flag-setting SUBS so the conditional branch below can test NE.
    MIB->getOperand(5).setReg(ARM::CPSR);
    MIB->getOperand(5).setIsDef(true);
  }
  BuildMI(*BB, BB->end(), dl,
          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);

  // loopMBB can loop back to loopMBB or fall through to exitMBB.
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  // Add epilogue to handle BytesLeft.
  BB = exitMBB;
  auto StartOfExit = exitMBB->begin();

  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
  //   [destOut] = STRB_POST(scratch, destLoop, 1)
  unsigned srcIn = srcLoop;
  unsigned destIn = destLoop;
  for (unsigned i = 0; i < BytesLeft; i++) {
    Register srcOut = MRI.createVirtualRegister(TRC);
    Register destOut = MRI.createVirtualRegister(TRC);
    Register scratch = MRI.createVirtualRegister(TRC);
    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
               IsThumb1, IsThumb2);
    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
               IsThumb1, IsThumb2);
    srcIn = srcOut;
    destIn = destOut;
  }

  MI.eraseFromParent(); // The instruction is gone now.
  return BB;
}
11557
/// Lower the Windows __chkstk pseudo: call the stack probe function and then
/// subtract the returned byte adjustment (in R4) from SP.
ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  const TargetMachine &TM = getTargetMachine();
  const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(TM.getTargetTriple().isOSWindows() &&
         "__chkstk is only supported on Windows");
  assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");

  // __chkstk takes the number of words to allocate on the stack in R4, and
  // returns the stack adjustment in number of bytes in R4. This will not
  // clobber any other registers (other than the obvious lr).
  //
  // Although, technically, IP should be considered a register which may be
  // clobbered, the call itself will not touch it. Windows on ARM is a pure
  // thumb-2 environment, so there is no interworking required. As a result, we
  // do not expect a veneer to be emitted by the linker, clobbering IP.
  //
  // Each module receives its own copy of __chkstk, so no import thunk is
  // required, again, ensuring that IP is not clobbered.
  //
  // Finally, although some linkers may theoretically provide a trampoline for
  // out of range calls (which is quite common due to a 32M range limitation of
  // branches for Thumb), we can generate the long-call version via
  // -mcmodel=large, alleviating the need for the trampoline which may clobber
  // IP.

  // Resolve the target's stack-probe implementation; fail loudly when no
  // libcall is available for this configuration.
  RTLIB::LibcallImpl ChkStkLibcall = getLibcallImpl(RTLIB::STACK_PROBE);
  if (ChkStkLibcall == RTLIB::Unsupported)
    reportFatalUsageError("no available implementation of __chkstk");

  const char *ChkStk = getLibcallImplName(ChkStkLibcall).data();
  switch (TM.getCodeModel()) {
  case CodeModel::Tiny:
    llvm_unreachable("Tiny code model not available on ARM.");
  case CodeModel::Small:
  case CodeModel::Medium:
  case CodeModel::Kernel:
    // Small-ish code models: a direct BL to the probe function is in range.
    BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
        .addExternalSymbol(ChkStk)
        .addReg(ARM::R12,
        .addReg(ARM::CPSR,
    break;
  case CodeModel::Large: {
    // Large code model: materialize the symbol address in a register and
    // perform an indirect call, avoiding branch-range limits.
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);

    BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
        .addExternalSymbol(ChkStk);
        .addReg(ARM::R12,
        .addReg(ARM::CPSR,
    break;
  }
  }

  // R4 now holds the adjustment in bytes; apply it to the stack pointer.
  BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
      .addReg(ARM::SP, RegState::Kill)
      .addReg(ARM::R4, RegState::Kill)
      .add(condCodeOp());

  MI.eraseFromParent();
  return MBB;
}
11637
/// Lower the Windows divide-by-zero check pseudo: compare the divisor
/// (operand 0) against zero and branch to a trap block that raises
/// __brkdiv0; otherwise execution continues in the split-off continuation
/// block.
ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();

  // Split off a continuation block: everything after the pseudo moves into
  // ContBB, which becomes the non-trapping successor.
  MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
  MF->insert(++MBB->getIterator(), ContBB);
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  MBB->addSuccessor(ContBB);

  // Trap block containing only the __brkdiv0 breakpoint instruction.
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
  MF->push_back(TrapBB);
  MBB->addSuccessor(TrapBB);

  // Compare the checked value against zero, then conditionally branch to the
  // trap block on the resulting flags.
  BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
      .addReg(MI.getOperand(0).getReg())
      .addImm(0)
  BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
      .addMBB(TrapBB)
      .addReg(ARM::CPSR);

  MI.eraseFromParent();
  return ContBB;
}
11669
// The CPSR operand of SelectItr might be missing a kill marker
// because there were multiple uses of CPSR, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
                                   const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of CPSR.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    // A later reader of CPSR means SelectItr must not kill it.
    if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
      return false;
    // A redefinition without an intervening read: the old value dies here.
    if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether CPSR is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock *Succ : BB->successors())
      if (Succ->isLiveIn(ARM::CPSR))
        return false;
  }

  // We found a def, or hit the end of the basic block and CPSR wasn't live
  // out. SelectMI should have a kill flag on CPSR.
  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
  return true;
}
11701
/// Adds logic in loop entry MBB to calculate loop iteration count and adds
/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
                               MachineBasicBlock *TpLoopBody,
                               MachineBasicBlock *TpExit, Register OpSizeReg,
                               const TargetInstrInfo *TII, DebugLoc Dl,
  // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
  Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
      .addUse(OpSizeReg)
      .addImm(15)
      .addReg(0);

  // Shift right by 4 to divide by 16, the number of bytes handled per
  // tail-predicated iteration (see the ceil(n/16) note above).
  Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
      .addUse(AddDestReg, RegState::Kill)
      .addImm(4)
      .addReg(0);

  // WLS pair: t2WhileLoopSetup produces the iteration count in an LR-class
  // register; t2WhileLoopStart carries the loop-exit target block.
  Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
      .addUse(LsrDestReg, RegState::Kill);

  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
      .addUse(TotalIterationsReg)
      .addMBB(TpExit);

  // Unconditional branch into the loop body.
  BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
      .addMBB(TpLoopBody)

  return TotalIterationsReg;
}
11738
/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
/// t2DoLoopEnd. These are used by later passes to generate tail predicated
/// loops.
static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
                          MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
                          const TargetInstrInfo *TII, DebugLoc Dl,
                          MachineRegisterInfo &MRI, Register OpSrcReg,
                          Register OpDestReg, Register ElementCountReg,
                          Register TotalIterationsReg, bool IsMemcpy) {
  // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
  // array, loop iteration counter, predication counter.
  // Each PHI merges the initial value from TpEntry with the updated value
  // produced inside TpLoopBody itself (the single back edge).

  Register SrcPhiReg, CurrSrcReg;
  if (IsMemcpy) {
    // Current position in the src array
    SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
    CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
    BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
        .addUse(OpSrcReg)
        .addMBB(TpEntry)
        .addUse(CurrSrcReg)
        .addMBB(TpLoopBody);
  }

  // Current position in the dest array
  Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
      .addUse(OpDestReg)
      .addMBB(TpEntry)
      .addUse(CurrDestReg)
      .addMBB(TpLoopBody);

  // Current loop counter
  Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  Register RemainingLoopIterationsReg =
      MRI.createVirtualRegister(&ARM::GPRlrRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
      .addUse(TotalIterationsReg)
      .addMBB(TpEntry)
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  // Predication counter
  Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
      .addUse(ElementCountReg)
      .addMBB(TpEntry)
      .addUse(RemainingElementsReg)
      .addMBB(TpLoopBody);

  // Pass predication counter to VCTP
  // MVE_VCTP8 converts the remaining element count into a lane predicate
  // (VCCR) so the final, partial iteration stores only the valid bytes.
  Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
      .addUse(PredCounterPhiReg)
      .addReg(0)
      .addReg(0);

  // Decrement the predication counter by 16, the number of byte elements
  // processed per iteration.
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
      .addUse(PredCounterPhiReg)
      .addImm(16)
      .addReg(0);

  // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
  Register SrcValueReg;
  if (IsMemcpy) {
    SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
    BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
        .addDef(CurrSrcReg)
        .addDef(SrcValueReg)
        .addReg(SrcPhiReg)
        .addImm(16)
        .addUse(VccrReg)
        .addReg(0);
  } else
    SrcValueReg = OpSrcReg;

  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
      .addDef(CurrDestReg)
      .addUse(SrcValueReg)
      .addReg(DestPhiReg)
      .addImm(16)
      .addUse(VccrReg)
      .addReg(0);

  // Add the pseudoInstrs for decrementing the loop counter and marking the
  // end: t2DoLoopDec and t2DoLoopEnd
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
      .addUse(LoopCounterPhiReg)
      .addImm(1);

  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
      .addUse(RemainingLoopIterationsReg)
      .addMBB(TpLoopBody);

  // Branch to the exit block once the loop has finished.
  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
      .addMBB(TpExit)
}
11843
11845 // KCFI is supported in all ARM/Thumb modes
11846 return true;
11847}
11848
11852 const TargetInstrInfo *TII) const {
11853 assert(MBBI->isCall() && MBBI->getCFIType() &&
11854 "Invalid call instruction for a KCFI check");
11855
11856 MachineOperand *TargetOp = nullptr;
11857 switch (MBBI->getOpcode()) {
11858 // ARM mode opcodes
11859 case ARM::BLX:
11860 case ARM::BLX_pred:
11861 case ARM::BLX_noip:
11862 case ARM::BLX_pred_noip:
11863 case ARM::BX_CALL:
11864 TargetOp = &MBBI->getOperand(0);
11865 break;
11866 case ARM::TCRETURNri:
11867 case ARM::TCRETURNrinotr12:
11868 case ARM::TAILJMPr:
11869 case ARM::TAILJMPr4:
11870 TargetOp = &MBBI->getOperand(0);
11871 break;
11872 // Thumb mode opcodes (Thumb1 and Thumb2)
11873 // Note: Most Thumb call instructions have predicate operands before the
11874 // target register Format: tBLXr pred, predreg, target_register, ...
11875 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
11876 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
11877 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
11878 TargetOp = &MBBI->getOperand(2);
11879 break;
11880 // Tail call instructions don't have predicates, target is operand 0
11881 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
11882 TargetOp = &MBBI->getOperand(0);
11883 break;
11884 default:
11885 llvm_unreachable("Unexpected CFI call opcode");
11886 }
11887
11888 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
11889 TargetOp->setIsRenamable(false);
11890
11891 // Select the appropriate KCFI_CHECK variant based on the instruction set
11892 unsigned KCFICheckOpcode;
11893 if (Subtarget->isThumb()) {
11894 if (Subtarget->isThumb2()) {
11895 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
11896 } else {
11897 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
11898 }
11899 } else {
11900 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
11901 }
11902
11903 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
11904 .addReg(TargetOp->getReg())
11905 .addImm(MBBI->getCFIType())
11906 .getInstr();
11907}
11908
11911 MachineBasicBlock *BB) const {
11912 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11913 DebugLoc dl = MI.getDebugLoc();
11914 bool isThumb2 = Subtarget->isThumb2();
11915 switch (MI.getOpcode()) {
11916 default: {
11917 MI.print(errs());
11918 llvm_unreachable("Unexpected instr type to insert");
11919 }
11920
11921 // Thumb1 post-indexed loads are really just single-register LDMs.
11922 case ARM::tLDR_postidx: {
11923 MachineOperand Def(MI.getOperand(1));
11924 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11925 .add(Def) // Rn_wb
11926 .add(MI.getOperand(2)) // Rn
11927 .add(MI.getOperand(3)) // PredImm
11928 .add(MI.getOperand(4)) // PredReg
11929 .add(MI.getOperand(0)) // Rt
11930 .cloneMemRefs(MI);
11931 MI.eraseFromParent();
11932 return BB;
11933 }
11934
11935 case ARM::MVE_MEMCPYLOOPINST:
11936 case ARM::MVE_MEMSETLOOPINST: {
11937
11938 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
11939 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
11940 // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and
11941 // adds the relevant instructions in the TP loop Body for generation of a
11942 // WLSTP loop.
11943
11944 // Below is relevant portion of the CFG after the transformation.
11945 // The Machine Basic Blocks are shown along with branch conditions (in
11946 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11947 // portion of the CFG and may not necessarily be the entry/exit of the
11948 // function.
11949
11950 // (Relevant) CFG after transformation:
11951 // TP entry MBB
11952 // |
11953 // |-----------------|
11954 // (n <= 0) (n > 0)
11955 // | |
11956 // | TP loop Body MBB<--|
11957 // | | |
11958 // \ |___________|
11959 // \ /
11960 // TP exit MBB
11961
11962 MachineFunction *MF = BB->getParent();
11963 MachineFunctionProperties &Properties = MF->getProperties();
11965
11966 Register OpDestReg = MI.getOperand(0).getReg();
11967 Register OpSrcReg = MI.getOperand(1).getReg();
11968 Register OpSizeReg = MI.getOperand(2).getReg();
11969
11970 // Allocate the required MBBs and add to parent function.
11971 MachineBasicBlock *TpEntry = BB;
11972 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
11973 MachineBasicBlock *TpExit;
11974
11975 MF->push_back(TpLoopBody);
11976
11977 // If any instructions are present in the current block after
11978 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11979 // move the instructions into the newly created exit block. If there are no
11980 // instructions add an explicit branch to the FallThrough block and then
11981 // split.
11982 //
11983 // The split is required for two reasons:
11984 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
11985 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
11986 // need to be updated. splitAt() already handles this.
11987 TpExit = BB->splitAt(MI, false);
11988 if (TpExit == BB) {
11989 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
11990 "block containing memcpy/memset Pseudo");
11991 TpExit = BB->getFallThrough();
11992 BuildMI(BB, dl, TII->get(ARM::t2B))
11993 .addMBB(TpExit)
11995 TpExit = BB->splitAt(MI, false);
11996 }
11997
11998 // Add logic for iteration count
11999 Register TotalIterationsReg =
12000 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12001
12002 // Add the vectorized (and predicated) loads/store instructions
12003 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12004 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12005 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12006
12007 // Required to avoid conflict with the MachineVerifier during testing.
12008 Properties.resetNoPHIs();
12009
12010 // Connect the blocks
12011 TpEntry->addSuccessor(TpLoopBody);
12012 TpLoopBody->addSuccessor(TpLoopBody);
12013 TpLoopBody->addSuccessor(TpExit);
12014
12015 // Reorder for a more natural layout
12016 TpLoopBody->moveAfter(TpEntry);
12017 TpExit->moveAfter(TpLoopBody);
12018
12019 // Finally, remove the memcpy Pseudo Instruction
12020 MI.eraseFromParent();
12021
12022 // Return the exit block as it may contain other instructions requiring a
12023 // custom inserter
12024 return TpExit;
12025 }
12026
12027 // The Thumb2 pre-indexed stores have the same MI operands, they just
12028 // define them differently in the .td files from the isel patterns, so
12029 // they need pseudos.
12030 case ARM::t2STR_preidx:
12031 MI.setDesc(TII->get(ARM::t2STR_PRE));
12032 return BB;
12033 case ARM::t2STRB_preidx:
12034 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12035 return BB;
12036 case ARM::t2STRH_preidx:
12037 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12038 return BB;
12039
12040 case ARM::STRi_preidx:
12041 case ARM::STRBi_preidx: {
12042 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12043 : ARM::STRB_PRE_IMM;
12044 // Decode the offset.
12045 unsigned Offset = MI.getOperand(4).getImm();
12046 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12048 if (isSub)
12049 Offset = -Offset;
12050
12051 MachineMemOperand *MMO = *MI.memoperands_begin();
12052 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12053 .add(MI.getOperand(0)) // Rn_wb
12054 .add(MI.getOperand(1)) // Rt
12055 .add(MI.getOperand(2)) // Rn
12056 .addImm(Offset) // offset (skip GPR==zero_reg)
12057 .add(MI.getOperand(5)) // pred
12058 .add(MI.getOperand(6))
12059 .addMemOperand(MMO);
12060 MI.eraseFromParent();
12061 return BB;
12062 }
12063 case ARM::STRr_preidx:
12064 case ARM::STRBr_preidx:
12065 case ARM::STRH_preidx: {
12066 unsigned NewOpc;
12067 switch (MI.getOpcode()) {
12068 default: llvm_unreachable("unexpected opcode!");
12069 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12070 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12071 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12072 }
12073 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12074 for (const MachineOperand &MO : MI.operands())
12075 MIB.add(MO);
12076 MI.eraseFromParent();
12077 return BB;
12078 }
12079
12080 case ARM::tMOVCCr_pseudo: {
12081 // To "insert" a SELECT_CC instruction, we actually have to insert the
12082 // diamond control-flow pattern. The incoming instruction knows the
12083 // destination vreg to set, the condition code register to branch on, the
12084 // true/false values to select between, and a branch opcode to use.
12085 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12087
12088 // thisMBB:
12089 // ...
12090 // TrueVal = ...
12091 // cmpTY ccX, r1, r2
12092 // bCC copy1MBB
12093 // fallthrough --> copy0MBB
12094 MachineBasicBlock *thisMBB = BB;
12095 MachineFunction *F = BB->getParent();
12096 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12097 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12098 F->insert(It, copy0MBB);
12099 F->insert(It, sinkMBB);
12100
12101 // Set the call frame size on entry to the new basic blocks.
12102 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12103 copy0MBB->setCallFrameSize(CallFrameSize);
12104 sinkMBB->setCallFrameSize(CallFrameSize);
12105
12106 // Check whether CPSR is live past the tMOVCCr_pseudo.
12107 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12108 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12109 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12110 copy0MBB->addLiveIn(ARM::CPSR);
12111 sinkMBB->addLiveIn(ARM::CPSR);
12112 }
12113
12114 // Transfer the remainder of BB and its successor edges to sinkMBB.
12115 sinkMBB->splice(sinkMBB->begin(), BB,
12116 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12118
12119 BB->addSuccessor(copy0MBB);
12120 BB->addSuccessor(sinkMBB);
12121
12122 BuildMI(BB, dl, TII->get(ARM::tBcc))
12123 .addMBB(sinkMBB)
12124 .addImm(MI.getOperand(3).getImm())
12125 .addReg(MI.getOperand(4).getReg());
12126
12127 // copy0MBB:
12128 // %FalseValue = ...
12129 // # fallthrough to sinkMBB
12130 BB = copy0MBB;
12131
12132 // Update machine-CFG edges
12133 BB->addSuccessor(sinkMBB);
12134
12135 // sinkMBB:
12136 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12137 // ...
12138 BB = sinkMBB;
12139 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12140 .addReg(MI.getOperand(1).getReg())
12141 .addMBB(copy0MBB)
12142 .addReg(MI.getOperand(2).getReg())
12143 .addMBB(thisMBB);
12144
12145 MI.eraseFromParent(); // The pseudo instruction is gone now.
12146 return BB;
12147 }
12148
12149 case ARM::BCCi64:
12150 case ARM::BCCZi64: {
12151 // If there is an unconditional branch to the other successor, remove it.
12152 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12153
12154 // Compare both parts that make up the double comparison separately for
12155 // equality.
12156 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12157
12158 Register LHS1 = MI.getOperand(1).getReg();
12159 Register LHS2 = MI.getOperand(2).getReg();
12160 if (RHSisZero) {
12161 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12162 .addReg(LHS1)
12163 .addImm(0)
12165 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12166 .addReg(LHS2).addImm(0)
12167 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12168 } else {
12169 Register RHS1 = MI.getOperand(3).getReg();
12170 Register RHS2 = MI.getOperand(4).getReg();
12171 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12172 .addReg(LHS1)
12173 .addReg(RHS1)
12175 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12176 .addReg(LHS2).addReg(RHS2)
12177 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12178 }
12179
12180 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12181 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12182 if (MI.getOperand(0).getImm() == ARMCC::NE)
12183 std::swap(destMBB, exitMBB);
12184
12185 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12186 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12187 if (isThumb2)
12188 BuildMI(BB, dl, TII->get(ARM::t2B))
12189 .addMBB(exitMBB)
12191 else
12192 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12193
12194 MI.eraseFromParent(); // The pseudo instruction is gone now.
12195 return BB;
12196 }
12197
12198 case ARM::Int_eh_sjlj_setjmp:
12199 case ARM::Int_eh_sjlj_setjmp_nofp:
12200 case ARM::tInt_eh_sjlj_setjmp:
12201 case ARM::t2Int_eh_sjlj_setjmp:
12202 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12203 return BB;
12204
12205 case ARM::Int_eh_sjlj_setup_dispatch:
12206 EmitSjLjDispatchBlock(MI, BB);
12207 return BB;
12208 case ARM::COPY_STRUCT_BYVAL_I32:
12209 ++NumLoopByVals;
12210 return EmitStructByval(MI, BB);
12211 case ARM::WIN__CHKSTK:
12212 return EmitLowered__chkstk(MI, BB);
12213 case ARM::WIN__DBZCHK:
12214 return EmitLowered__dbzchk(MI, BB);
12215 }
12216}
12217
12218/// Attaches vregs to MEMCPY that it will use as scratch registers
12219/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12220/// instead of as a custom inserter because we need the use list from the SDNode.
12221static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12222 MachineInstr &MI, const SDNode *Node) {
12223 bool isThumb1 = Subtarget->isThumb1Only();
12224
12225 MachineFunction *MF = MI.getParent()->getParent();
12227 MachineInstrBuilder MIB(*MF, MI);
12228
12229 // If the new dst/src is unused mark it as dead.
12230 if (!Node->hasAnyUseOfValue(0)) {
12231 MI.getOperand(0).setIsDead(true);
12232 }
12233 if (!Node->hasAnyUseOfValue(1)) {
12234 MI.getOperand(1).setIsDead(true);
12235 }
12236
12237 // The MEMCPY both defines and kills the scratch registers.
12238 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12239 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12240 : &ARM::GPRRegClass);
12242 }
12243}
12244
12246 SDNode *Node) const {
12247 if (MI.getOpcode() == ARM::MEMCPY) {
12248 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12249 return;
12250 }
12251
12252 const MCInstrDesc *MCID = &MI.getDesc();
12253 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12254 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12255 // operand is still set to noreg. If needed, set the optional operand's
12256 // register to CPSR, and remove the redundant implicit def.
12257 //
12258 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12259
12260 // Rename pseudo opcodes.
12261 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12262 unsigned ccOutIdx;
12263 if (NewOpc) {
12264 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12265 MCID = &TII->get(NewOpc);
12266
12267 assert(MCID->getNumOperands() ==
12268 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12269 && "converted opcode should be the same except for cc_out"
12270 " (and, on Thumb1, pred)");
12271
12272 MI.setDesc(*MCID);
12273
12274 // Add the optional cc_out operand
12275 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12276
12277 // On Thumb1, move all input operands to the end, then add the predicate
12278 if (Subtarget->isThumb1Only()) {
12279 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12280 MI.addOperand(MI.getOperand(1));
12281 MI.removeOperand(1);
12282 }
12283
12284 // Restore the ties
12285 for (unsigned i = MI.getNumOperands(); i--;) {
12286 const MachineOperand& op = MI.getOperand(i);
12287 if (op.isReg() && op.isUse()) {
12288 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12289 if (DefIdx != -1)
12290 MI.tieOperands(DefIdx, i);
12291 }
12292 }
12293
12295 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12296 ccOutIdx = 1;
12297 } else
12298 ccOutIdx = MCID->getNumOperands() - 1;
12299 } else
12300 ccOutIdx = MCID->getNumOperands() - 1;
12301
12302 // Any ARM instruction that sets the 's' bit should specify an optional
12303 // "cc_out" operand in the last operand position.
12304 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12305 assert(!NewOpc && "Optional cc_out operand required");
12306 return;
12307 }
12308 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12309 // since we already have an optional CPSR def.
12310 bool definesCPSR = false;
12311 bool deadCPSR = false;
12312 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12313 ++i) {
12314 const MachineOperand &MO = MI.getOperand(i);
12315 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12316 definesCPSR = true;
12317 if (MO.isDead())
12318 deadCPSR = true;
12319 MI.removeOperand(i);
12320 break;
12321 }
12322 }
12323 if (!definesCPSR) {
12324 assert(!NewOpc && "Optional cc_out operand required");
12325 return;
12326 }
12327 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12328 if (deadCPSR) {
12329 assert(!MI.getOperand(ccOutIdx).getReg() &&
12330 "expect uninitialized optional cc_out operand");
12331 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12332 if (!Subtarget->isThumb1Only())
12333 return;
12334 }
12335
12336 // If this instruction was defined with an optional CPSR def and its dag node
12337 // had a live implicit CPSR def, then activate the optional CPSR def.
12338 MachineOperand &MO = MI.getOperand(ccOutIdx);
12339 MO.setReg(ARM::CPSR);
12340 MO.setIsDef(true);
12341}
12342
12343//===----------------------------------------------------------------------===//
12344// ARM Optimization Hooks
12345//===----------------------------------------------------------------------===//
12346
12347// Helper function that checks if N is a null or all ones constant.
12348static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12350}
12351
12352// Return true if N is conditionally 0 or all ones.
12353// Detects these expressions where cc is an i1 value:
12354//
12355// (select cc 0, y) [AllOnes=0]
12356// (select cc y, 0) [AllOnes=0]
12357// (zext cc) [AllOnes=0]
12358// (sext cc) [AllOnes=0/1]
12359// (select cc -1, y) [AllOnes=1]
12360// (select cc y, -1) [AllOnes=1]
12361//
12362// Invert is set when N is the null/all ones constant when CC is false.
12363// OtherOp is set to the alternative value of N.
12365 SDValue &CC, bool &Invert,
12366 SDValue &OtherOp,
12367 SelectionDAG &DAG) {
12368 switch (N->getOpcode()) {
12369 default: return false;
12370 case ISD::SELECT: {
12371 CC = N->getOperand(0);
12372 SDValue N1 = N->getOperand(1);
12373 SDValue N2 = N->getOperand(2);
12374 if (isZeroOrAllOnes(N1, AllOnes)) {
12375 Invert = false;
12376 OtherOp = N2;
12377 return true;
12378 }
12379 if (isZeroOrAllOnes(N2, AllOnes)) {
12380 Invert = true;
12381 OtherOp = N1;
12382 return true;
12383 }
12384 return false;
12385 }
12386 case ISD::ZERO_EXTEND:
12387 // (zext cc) can never be the all ones value.
12388 if (AllOnes)
12389 return false;
12390 [[fallthrough]];
12391 case ISD::SIGN_EXTEND: {
12392 SDLoc dl(N);
12393 EVT VT = N->getValueType(0);
12394 CC = N->getOperand(0);
12395 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12396 return false;
12397 Invert = !AllOnes;
12398 if (AllOnes)
12399 // When looking for an AllOnes constant, N is an sext, and the 'other'
12400 // value is 0.
12401 OtherOp = DAG.getConstant(0, dl, VT);
12402 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12403 // When looking for a 0 constant, N can be zext or sext.
12404 OtherOp = DAG.getConstant(1, dl, VT);
12405 else
12406 OtherOp = DAG.getAllOnesConstant(dl, VT);
12407 return true;
12408 }
12409 }
12410}
12411
12412// Combine a constant select operand into its use:
12413//
12414// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12415// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12416// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12417// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12418// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12419//
12420// The transform is rejected if the select doesn't have a constant operand that
12421// is null, or all ones when AllOnes is set.
12422//
12423// Also recognize sext/zext from i1:
12424//
12425// (add (zext cc), x) -> (select cc (add x, 1), x)
12426// (add (sext cc), x) -> (select cc (add x, -1), x)
12427//
12428// These transformations eventually create predicated instructions.
12429//
12430// @param N The node to transform.
12431// @param Slct The N operand that is a select.
12432// @param OtherOp The other N operand (x above).
12433// @param DCI Context.
12434// @param AllOnes Require the select constant to be all ones instead of null.
12435// @returns The new node, or SDValue() on failure.
12436static
12439 bool AllOnes = false) {
12440 SelectionDAG &DAG = DCI.DAG;
12441 EVT VT = N->getValueType(0);
12442 SDValue NonConstantVal;
12443 SDValue CCOp;
12444 bool SwapSelectOps;
12445 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12446 NonConstantVal, DAG))
12447 return SDValue();
12448
12449 // Slct is now know to be the desired identity constant when CC is true.
12450 SDValue TrueVal = OtherOp;
12451 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12452 OtherOp, NonConstantVal);
12453 // Unless SwapSelectOps says CC should be false.
12454 if (SwapSelectOps)
12455 std::swap(TrueVal, FalseVal);
12456
12457 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12458 CCOp, TrueVal, FalseVal);
12459}
12460
12461// Attempt combineSelectAndUse on each operand of a commutative operator N.
12462static
12465 SDValue N0 = N->getOperand(0);
12466 SDValue N1 = N->getOperand(1);
12467 if (N0.getNode()->hasOneUse())
12468 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12469 return Result;
12470 if (N1.getNode()->hasOneUse())
12471 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12472 return Result;
12473 return SDValue();
12474}
12475
12477 // VUZP shuffle node.
12478 if (N->getOpcode() == ARMISD::VUZP)
12479 return true;
12480
12481 // "VUZP" on i32 is an alias for VTRN.
12482 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12483 return true;
12484
12485 return false;
12486}
12487
12490 const ARMSubtarget *Subtarget) {
12491 // Look for ADD(VUZP.0, VUZP.1).
12492 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12493 N0 == N1)
12494 return SDValue();
12495
12496 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12497 if (!N->getValueType(0).is64BitVector())
12498 return SDValue();
12499
12500 // Generate vpadd.
12501 SelectionDAG &DAG = DCI.DAG;
12502 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12503 SDLoc dl(N);
12504 SDNode *Unzip = N0.getNode();
12505 EVT VT = N->getValueType(0);
12506
12508 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12509 TLI.getPointerTy(DAG.getDataLayout())));
12510 Ops.push_back(Unzip->getOperand(0));
12511 Ops.push_back(Unzip->getOperand(1));
12512
12513 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12514}
12515
12518 const ARMSubtarget *Subtarget) {
12519 // Check for two extended operands.
12520 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12521 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12522 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12523 N1.getOpcode() == ISD::ZERO_EXTEND))
12524 return SDValue();
12525
12526 SDValue N00 = N0.getOperand(0);
12527 SDValue N10 = N1.getOperand(0);
12528
12529 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12530 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12531 N00 == N10)
12532 return SDValue();
12533
12534 // We only recognize Q register paddl here; this can't be reached until
12535 // after type legalization.
12536 if (!N00.getValueType().is64BitVector() ||
12538 return SDValue();
12539
12540 // Generate vpaddl.
12541 SelectionDAG &DAG = DCI.DAG;
12542 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12543 SDLoc dl(N);
12544 EVT VT = N->getValueType(0);
12545
12547 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12548 unsigned Opcode;
12549 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12550 Opcode = Intrinsic::arm_neon_vpaddls;
12551 else
12552 Opcode = Intrinsic::arm_neon_vpaddlu;
12553 Ops.push_back(DAG.getConstant(Opcode, dl,
12554 TLI.getPointerTy(DAG.getDataLayout())));
12555 EVT ElemTy = N00.getValueType().getVectorElementType();
12556 unsigned NumElts = VT.getVectorNumElements();
12557 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12558 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12559 N00.getOperand(0), N00.getOperand(1));
12560 Ops.push_back(Concat);
12561
12562 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12563}
12564
12565// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12566// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12567// much easier to match.
12568static SDValue
12571 const ARMSubtarget *Subtarget) {
12572 // Only perform optimization if after legalize, and if NEON is available. We
12573 // also expected both operands to be BUILD_VECTORs.
12574 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12575 || N0.getOpcode() != ISD::BUILD_VECTOR
12576 || N1.getOpcode() != ISD::BUILD_VECTOR)
12577 return SDValue();
12578
12579 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12580 EVT VT = N->getValueType(0);
12581 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12582 return SDValue();
12583
12584 // Check that the vector operands are of the right form.
12585 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12586 // operands, where N is the size of the formed vector.
12587 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12588 // index such that we have a pair wise add pattern.
12589
12590 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12592 return SDValue();
12593 SDValue Vec = N0->getOperand(0)->getOperand(0);
12594 SDNode *V = Vec.getNode();
12595 unsigned nextIndex = 0;
12596
12597 // For each operands to the ADD which are BUILD_VECTORs,
12598 // check to see if each of their operands are an EXTRACT_VECTOR with
12599 // the same vector and appropriate index.
12600 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12603
12604 SDValue ExtVec0 = N0->getOperand(i);
12605 SDValue ExtVec1 = N1->getOperand(i);
12606
12607 // First operand is the vector, verify its the same.
12608 if (V != ExtVec0->getOperand(0).getNode() ||
12609 V != ExtVec1->getOperand(0).getNode())
12610 return SDValue();
12611
12612 // Second is the constant, verify its correct.
12615
12616 // For the constant, we want to see all the even or all the odd.
12617 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12618 || C1->getZExtValue() != nextIndex+1)
12619 return SDValue();
12620
12621 // Increment index.
12622 nextIndex+=2;
12623 } else
12624 return SDValue();
12625 }
12626
12627 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12628 // we're using the entire input vector, otherwise there's a size/legality
12629 // mismatch somewhere.
12630 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12632 return SDValue();
12633
12634 // Create VPADDL node.
12635 SelectionDAG &DAG = DCI.DAG;
12636 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12637
12638 SDLoc dl(N);
12639
12640 // Build operand list.
12642 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12643 TLI.getPointerTy(DAG.getDataLayout())));
12644
12645 // Input is the vector.
12646 Ops.push_back(Vec);
12647
12648 // Get widened type and narrowed type.
12649 MVT widenType;
12650 unsigned numElem = VT.getVectorNumElements();
12651
12652 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12653 switch (inputLaneType.getSimpleVT().SimpleTy) {
12654 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12655 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12656 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12657 default:
12658 llvm_unreachable("Invalid vector element type for padd optimization.");
12659 }
12660
12661 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12662 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12663 return DAG.getNode(ExtOp, dl, VT, tmp);
12664}
12665
12667 if (V->getOpcode() == ISD::UMUL_LOHI ||
12668 V->getOpcode() == ISD::SMUL_LOHI)
12669 return V;
12670 return SDValue();
12671}
12672
// Fold (addc (mul i16, i16), lo) / (adde (sra mul, 31), hi) into one of the
// 64-bit accumulating 16x16 multiplies SMLALBB/SMLALBT/SMLALTB/SMLALTT.
// The opcode is chosen by which half (Bottom or Top) of each 32-bit
// multiplicand carries the 16-bit value. Requires the DSP extension.
// On success both ADD nodes are RAUW'd with the new node's two results and
// the original AddcNode value is returned to stop further combining.
// NOTE(review): the DCI parameter's declaration line is not visible in this
// view; DCI.DAG is used below — confirm against the full source.
12673static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12675 const ARMSubtarget *Subtarget) {
12676 if (!Subtarget->hasBaseDSP())
12677 return SDValue();
12678
12679 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12680 // accumulates the product into a 64-bit value. The 16-bit values will
12681 // be sign extended somehow or SRA'd into 32-bit values
12682 // (addc (adde (mul 16bit, 16bit), lo), hi)
// ADDC is commutative: accept the MUL in either operand position.
12683 SDValue Mul = AddcNode->getOperand(0);
12684 SDValue Lo = AddcNode->getOperand(1);
12685 if (Mul.getOpcode() != ISD::MUL) {
12686 Lo = AddcNode->getOperand(0);
12687 Mul = AddcNode->getOperand(1);
12688 if (Mul.getOpcode() != ISD::MUL)
12689 return SDValue();
12690 }
12691
// The high half of the accumulate must add the product's sign bits,
// i.e. (sra Mul, 31); again accept either operand order of the ADDE.
12692 SDValue SRA = AddeNode->getOperand(0);
12693 SDValue Hi = AddeNode->getOperand(1);
12694 if (SRA.getOpcode() != ISD::SRA) {
12695 SRA = AddeNode->getOperand(1);
12696 Hi = AddeNode->getOperand(0);
12697 if (SRA.getOpcode() != ISD::SRA)
12698 return SDValue();
12699 }
// Only an exact shift amount of 31 reproduces the sign-extension pattern.
12700 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12701 if (Const->getZExtValue() != 31)
12702 return SDValue();
12703 } else
12704 return SDValue();
12705
// The SRA must shift the very same MUL that feeds the low add.
12706 if (SRA.getOperand(0) != Mul)
12707 return SDValue();
12708
12709 SelectionDAG &DAG = DCI.DAG;
12710 SDLoc dl(AddcNode);
12711 unsigned Opcode = 0;
12712 SDValue Op0;
12713 SDValue Op1;
12714
// Select the B/T variant: isS16 = operand is a sign-extended 16-bit value
// (bottom half); isSRA16 = the top half is extracted via (sra x, 16).
12715 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12716 Opcode = ARMISD::SMLALBB;
12717 Op0 = Mul.getOperand(0);
12718 Op1 = Mul.getOperand(1);
12719 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12720 Opcode = ARMISD::SMLALBT;
12721 Op0 = Mul.getOperand(0);
12722 Op1 = Mul.getOperand(1).getOperand(0);
12723 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12724 Opcode = ARMISD::SMLALTB;
12725 Op0 = Mul.getOperand(0).getOperand(0);
12726 Op1 = Mul.getOperand(1);
12727 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12728 Opcode = ARMISD::SMLALTT;
12729 Op0 = Mul->getOperand(0).getOperand(0);
12730 Op1 = Mul->getOperand(1).getOperand(0);
12731 }
12732
12733 if (!Op0 || !Op1)
12734 return SDValue();
12735
// New node produces two i32 results: result 0 = low word, result 1 = high.
12736 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12737 Op0, Op1, Lo, Hi);
12738 // Replace the ADDs' nodes uses by the MLA node's values.
12739 SDValue HiMLALResult(SMLAL.getNode(), 1);
12740 SDValue LoMLALResult(SMLAL.getNode(), 0);
12741
12742 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12743 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12744
12745 // Return original node to notify the driver to stop replacing.
12746 SDValue resNode(AddcNode, 0);
12747 return resNode;
12748}
12749
// AddCombineTo64bitMLAL: combine an ADDC/ADDE (or SUBC/SUBE) pair fed by a
// U/SMUL_LOHI into a single ARMISD::SMLAL/UMLAL, or — for the "rounded
// multiply" special case — ARMISD::SMMLAR/SMMLSR.
// NOTE(review): the function's signature line is not visible in this view;
// the first parameter is referred to below as AddeSubeNode and DCI supplies
// the DAG — confirm against the full source.
12752 const ARMSubtarget *Subtarget) {
12753 // Look for multiply add opportunities.
12754 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
12755 // each add nodes consumes a value from ISD::UMUL_LOHI and there is
12756 // a glue link from the first add to the second add.
12757 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12758 // a S/UMLAL instruction.
12759 // UMUL_LOHI
12760 // / :lo \ :hi
12761 // V \ [no multiline comment]
12762 // loAdd -> ADDC |
12763 // \ :carry /
12764 // V V
12765 // ADDE <- hiAdd
12766 //
12767 // In the special case where only the higher part of a signed result is used
12768 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12769 // a constant with the exact value of 0x80000000, we recognize we are dealing
12770 // with a "rounded multiply and add" (or subtract) and transform it into
12771 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12772
12773 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12774 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12775 "Expect an ADDE or SUBE");
12776
12777 assert(AddeSubeNode->getNumOperands() == 3 &&
12778 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12779 "ADDE node has the wrong inputs");
12780
12781 // Check that we are chained to the right ADDC or SUBC node.
// ADDE must consume an ADDC's carry, SUBE a SUBC's borrow — no mixing.
12782 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12783 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12784 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12785 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12786 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12787 return SDValue();
12788
12789 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12790 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12791
12792 // Check if the two operands are from the same mul_lohi node.
12793 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12794 return SDValue();
12795
12796 assert(AddcSubcNode->getNumValues() == 2 &&
12797 AddcSubcNode->getValueType(0) == MVT::i32 &&
12798 "Expect ADDC with two result values. First: i32");
12799
12800 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12801 // maybe a SMLAL which multiplies two 16-bit values.
12802 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12803 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12804 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12805 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12806 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12807 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12808
12809 // Check for the triangle shape.
12810 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12811 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12812
12813 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12814 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12815 return SDValue();
12816
12817 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12818 bool IsLeftOperandMUL = false;
12819 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12820 if (MULOp == SDValue())
12821 MULOp = findMUL_LOHI(AddeSubeOp1);
12822 else
12823 IsLeftOperandMUL = true;
12824 if (MULOp == SDValue())
12825 return SDValue();
12826
12827 // Figure out the right opcode.
// Signed mul_lohi -> SMLAL, unsigned -> UMLAL.
12828 unsigned Opc = MULOp->getOpcode();
12829 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12830
12831 // Figure out the high and low input values to the MLAL node.
12832 SDValue *HiAddSub = nullptr;
12833 SDValue *LoMul = nullptr;
12834 SDValue *LowAddSub = nullptr;
12835
12836 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12837 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12838 return SDValue();
12839
// The non-MUL operand of the ADDE/SUBE is the high-part addend.
12840 if (IsLeftOperandMUL)
12841 HiAddSub = &AddeSubeOp1;
12842 else
12843 HiAddSub = &AddeSubeOp0;
12844
12845 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12846 // whose low result is fed to the ADDC/SUBC we are checking.
12847
12848 if (AddcSubcOp0 == MULOp.getValue(0)) {
12849 LoMul = &AddcSubcOp0;
12850 LowAddSub = &AddcSubcOp1;
12851 }
12852 if (AddcSubcOp1 == MULOp.getValue(0)) {
12853 LoMul = &AddcSubcOp1;
12854 LowAddSub = &AddcSubcOp0;
12855 }
12856
12857 if (!LoMul)
12858 return SDValue();
12859
12860 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12861 // the replacement below will create a cycle.
12862 if (AddcSubcNode == HiAddSub->getNode() ||
12863 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12864 return SDValue();
12865
12866 // Create the merged node.
12867 SelectionDAG &DAG = DCI.DAG;
12868
12869 // Start building operand list.
// Ops accumulates the MLAL operands: multiplicands first, then the addends.
12871 Ops.push_back(LoMul->getOperand(0));
12872 Ops.push_back(LoMul->getOperand(1));
12873
12874 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12875 // the case, we must be doing signed multiplication and only use the higher
12876 // part of the result of the MLAL, furthermore the LowAddSub must be a constant
12877 // addition or subtraction with the value of 0x800000.
12878 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12879 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12880 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12881 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12882 0x80000000) {
12883 Ops.push_back(*HiAddSub);
12884 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12885 FinalOpc = ARMISD::SMMLSR;
12886 } else {
12887 FinalOpc = ARMISD::SMMLAR;
12888 }
12889 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12890 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12891
12892 return SDValue(AddeSubeNode, 0);
12893 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12894 // SMMLS is generated during instruction selection and the rest of this
12895 // function can not handle the case where AddcSubcNode is a SUBC.
12896 return SDValue();
12897
12898 // Finish building the operand list for {U/S}MLAL
12899 Ops.push_back(*LowAddSub);
12900 Ops.push_back(*HiAddSub);
12901
// The MLAL node yields {lo, hi} as two i32 results.
12902 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12903 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12904
12905 // Replace the ADDs' nodes uses by the MLA node's values.
12906 SDValue HiMLALResult(MLALNode.getNode(), 1);
12907 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12908
12909 SDValue LoMLALResult(MLALNode.getNode(), 0);
12910 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12911
12912 // Return original node to notify the driver to stop replacing.
12913 return SDValue(AddeSubeNode, 0);
12914}
12915
// AddCombineTo64bitUMAAL: before trying the generic MLAL combine, look for an
// ADDC/ADDE pair that re-accumulates onto an existing ARMISD::UMLAL (whose
// own Hi addend is zero); that shape maps onto a single UMAAL instruction.
// Falls back to AddCombineTo64bitMLAL when the UMAAL pattern is absent or
// the subtarget lacks v6 + DSP.
// NOTE(review): the signature line is not visible in this view; the first
// parameter is referred to below as AddeNode — confirm against full source.
12918 const ARMSubtarget *Subtarget) {
12919 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12920 // While trying to combine for the other MLAL nodes, first search for the
12921 // chance to use UMAAL. Check if Addc uses a node which has already
12922 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12923 // as the addend, and it's handled in PerformUMLALCombine.
12924
12925 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12926 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12927
12928 // Check that we have a glued ADDC node.
12929 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12930 if (AddcNode->getOpcode() != ARMISD::ADDC)
12931 return SDValue();
12932
12933 // Find the converted UMAAL or quit if it doesn't exist.
// The ADDC is commutative: the UMLAL may sit in either operand slot.
12934 SDNode *UmlalNode = nullptr;
12935 SDValue AddHi;
12936 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12937 UmlalNode = AddcNode->getOperand(0).getNode();
12938 AddHi = AddcNode->getOperand(1);
12939 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12940 UmlalNode = AddcNode->getOperand(1).getNode();
12941 AddHi = AddcNode->getOperand(0);
12942 } else {
12943 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12944 }
12945
12946 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
12947 // the ADDC as well as Zero.
// UMLAL operand 3 is its Hi accumulator; it must be zero for UMAAL.
12948 if (!isNullConstant(UmlalNode->getOperand(3)))
12949 return SDValue();
12950
12951 if ((isNullConstant(AddeNode->getOperand(0)) &&
12952 AddeNode->getOperand(1).getNode() == UmlalNode) ||
12953 (AddeNode->getOperand(0).getNode() == UmlalNode &&
12954 isNullConstant(AddeNode->getOperand(1)))) {
12955 SelectionDAG &DAG = DCI.DAG;
// UMAAL operands: multiplicands, the UMLAL's Lo addend, and the new Hi.
12956 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
12957 UmlalNode->getOperand(2), AddHi };
12958 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
12959 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12960
12961 // Replace the ADDs' nodes uses by the UMAAL node's values.
12962 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
12963 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
12964
12965 // Return original node to notify the driver to stop replacing.
12966 return SDValue(AddeNode, 0);
12967 }
12968 return SDValue();
12969}
12970
// PerformUMLALCombine: fold a UMLAL whose two accumulator inputs come from a
// zero-addend ADDC/ADDE pair into a single ARMISD::UMAAL node (the mirror of
// the pattern handled in AddCombineTo64bitUMAAL). Requires v6 + DSP.
// NOTE(review): the signature line is not visible in this view; the node
// being combined is referred to below as N — confirm against full source.
12972 const ARMSubtarget *Subtarget) {
12973 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12974 return SDValue();
12975
12976 // Check that we have a pair of ADDC and ADDE as operands.
12977 // Both addends of the ADDE must be zero.
// N's operands 2 and 3 are the Lo and Hi accumulator inputs of the UMLAL.
12978 SDNode* AddcNode = N->getOperand(2).getNode();
12979 SDNode* AddeNode = N->getOperand(3).getNode();
12980 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
12981 (AddeNode->getOpcode() == ARMISD::ADDE) &&
12982 isNullConstant(AddeNode->getOperand(0)) &&
12983 isNullConstant(AddeNode->getOperand(1)) &&
12984 (AddeNode->getOperand(2).getNode() == AddcNode))
// UMAAL takes the two multiplicands plus the ADDC's two addends.
12985 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
12986 DAG.getVTList(MVT::i32, MVT::i32),
12987 {N->getOperand(0), N->getOperand(1),
12988 AddcNode->getOperand(0), AddcNode->getOperand(1)});
12989 else
12990 return SDValue();
12991}
12992
// PerformAddcSubcCombine: two independent folds on ARMISD::ADDC/SUBC.
// 1) (SUBC (ADDE 0, 0, C), 1) -> C : the SUBC merely re-materializes the
//    incoming carry, so forward C directly to the flag users.
// 2) Thumb1 only: flip ADDC<->SUBC when the RHS is a negative immediate,
//    using the negated (positive) constant instead.
// NOTE(review): the signature line is not visible in this view; the node is
// referred to below as N — confirm against full source.
12996 SelectionDAG &DAG(DCI.DAG);
12997
12998 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
12999 // (SUBC (ADDE 0, 0, C), 1) -> C
13000 SDValue LHS = N->getOperand(0);
13001 SDValue RHS = N->getOperand(1);
13002 if (LHS->getOpcode() == ARMISD::ADDE &&
13003 isNullConstant(LHS->getOperand(0)) &&
13004 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
// Keep N's value result, but route flag users to the original carry C.
13005 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13006 }
13007 }
13008
13009 if (Subtarget->isThumb1Only()) {
13010 SDValue RHS = N->getOperand(1);
// C below is the constant RHS (its dyn_cast match is on a line not shown
// in this view — confirm against full source).
13012 int32_t imm = C->getSExtValue();
// Exclude INT_MIN: -imm would overflow and cannot be encoded.
13013 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13014 SDLoc DL(N);
13015 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13016 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13017 : ARMISD::ADDC;
13018 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13019 }
13020 }
13021 }
13022
13023 return SDValue();
13024}
13025
// PerformAddeSubeCombine: on Thumb1, flip ADDE<->SUBE when the RHS is a
// negative immediate; otherwise (non-Thumb1) try the 64-bit MLAL combine
// when operand 1 is an SMUL_LOHI.
// NOTE(review): the signature line is not visible in this view; the node is
// referred to below as N — confirm against full source.
13029 if (Subtarget->isThumb1Only()) {
13030 SelectionDAG &DAG = DCI.DAG;
13031 SDValue RHS = N->getOperand(1);
// C below is the constant RHS (its dyn_cast match is on a line not shown
// in this view — confirm against full source).
13033 int64_t imm = C->getSExtValue();
13034 if (imm < 0) {
13035 SDLoc DL(N);
13036
13037 // The with-carry-in form matches bitwise not instead of the negation.
13038 // Effectively, the inverse interpretation of the carry flag already
13039 // accounts for part of the negation.
13040 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13041
13042 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13043 : ARMISD::ADDE;
// Carry-in (operand 2) is forwarded unchanged to the flipped node.
13044 return DAG.getNode(Opcode, DL, N->getVTList(),
13045 N->getOperand(0), RHS, N->getOperand(2));
13046 }
13047 }
13048 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13049 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13050 }
13051 return SDValue();
13052}
13053
// PerformSELECTCombine: turn select(setcc(x, vecreduce_min/max(v)), ...)
// into the MVE across-vector reduction nodes VMINV/VMAXV (signed/unsigned),
// which fold the scalar compare-and-select into a single instruction.
// MVE integer only; the scalar is widened to i32 for legalisation and
// truncated back afterwards.
// NOTE(review): the signature line is not visible in this view; the node is
// referred to below as N — confirm against full source.
13057 if (!Subtarget->hasMVEIntegerOps())
13058 return SDValue();
13059
13060 SDLoc dl(N);
13061 SDValue SetCC;
13062 SDValue LHS;
13063 SDValue RHS;
13064 ISD::CondCode CC;
13065 SDValue TrueVal;
13066 SDValue FalseVal;
13067
// Accept both the SELECT-of-SETCC and SELECT_CC forms of the pattern.
13068 if (N->getOpcode() == ISD::SELECT &&
13069 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13070 SetCC = N->getOperand(0);
13071 LHS = SetCC->getOperand(0);
13072 RHS = SetCC->getOperand(1);
13073 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13074 TrueVal = N->getOperand(1);
13075 FalseVal = N->getOperand(2);
13076 } else if (N->getOpcode() == ISD::SELECT_CC) {
13077 LHS = N->getOperand(0);
13078 RHS = N->getOperand(1);
13079 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13080 TrueVal = N->getOperand(2);
13081 FalseVal = N->getOperand(3);
13082 } else {
13083 return SDValue();
13084 }
13085
// Match the reduction kind against the condition code; swapping True/False
// canonicalises the inverted comparison direction to the same pattern.
13086 unsigned int Opcode = 0;
13087 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13088 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13089 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13090 Opcode = ARMISD::VMINVu;
13091 if (CC == ISD::SETUGT)
13092 std::swap(TrueVal, FalseVal);
13093 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13094 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13095 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13096 Opcode = ARMISD::VMINVs;
13097 if (CC == ISD::SETGT)
13098 std::swap(TrueVal, FalseVal);
13099 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13100 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13101 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13102 Opcode = ARMISD::VMAXVu;
13103 if (CC == ISD::SETULT)
13104 std::swap(TrueVal, FalseVal);
13105 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13106 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13107 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13108 Opcode = ARMISD::VMAXVs;
13109 if (CC == ISD::SETLT)
13110 std::swap(TrueVal, FalseVal);
13111 } else
13112 return SDValue();
13113
13114 // Normalise to the right hand side being the vector reduction
// (The VECREDUCE_* case labels of this switch are on lines not shown in
// this view — the fall-through swaps when TrueVal is the reduction.)
13115 switch (TrueVal->getOpcode()) {
13120 std::swap(LHS, RHS);
13121 std::swap(TrueVal, FalseVal);
13122 break;
13123 }
13124
13125 EVT VectorType = FalseVal->getOperand(0).getValueType();
13126
// Only the 128-bit MVE vector types are supported.
13127 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13128 VectorType != MVT::v4i32)
13129 return SDValue();
13130
13131 EVT VectorScalarType = VectorType.getVectorElementType();
13132
13133 // The values being selected must also be the ones being compared
13134 if (TrueVal != LHS || FalseVal != RHS)
13135 return SDValue();
13136
13137 EVT LeftType = LHS->getValueType(0);
13138 EVT RightType = RHS->getValueType(0);
13139
13140 // The types must match the reduced type too
13141 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13142 return SDValue();
13143
13144 // Legalise the scalar to an i32
13145 if (VectorScalarType != MVT::i32)
13146 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13147
13148 // Generate the reduction as an i32 for legalisation purposes
13149 auto Reduction =
13150 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13151
13152 // The result isn't actually an i32 so truncate it back to its original type
13153 if (VectorScalarType != MVT::i32)
13154 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13155
13156 return Reduction;
13157}
13158
13159// A special combine for the vqdmulh family of instructions. This is one of the
13160// potential set of patterns that could patch this instruction. The base pattern
13161// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13162// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13163// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13164// the max is unnecessary.
// Matches smin(sra(mul(sext x, sext y), 2^k-1 shift), clamp) — or the
// vselect/setcc spelling of smin for i64 — and rewrites it as ARMISD::VQDMULH,
// splitting or extending to 128-bit legal vectors as needed.
// NOTE(review): the signature line is not visible in this view; the node is
// referred to below as N and the DAG as DAG — confirm against full source.
13166 EVT VT = N->getValueType(0);
13167 SDValue Shft;
13168 ConstantSDNode *Clamp;
13169
13170 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13171 return SDValue();
13172
13173 if (N->getOpcode() == ISD::SMIN) {
13174 Shft = N->getOperand(0);
13175 Clamp = isConstOrConstSplat(N->getOperand(1));
13176 } else if (N->getOpcode() == ISD::VSELECT) {
13177 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13178 SDValue Cmp = N->getOperand(0);
13179 if (Cmp.getOpcode() != ISD::SETCC ||
13180 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13181 Cmp.getOperand(0) != N->getOperand(1) ||
13182 Cmp.getOperand(1) != N->getOperand(2))
13183 return SDValue();
13184 Shft = N->getOperand(1);
13185 Clamp = isConstOrConstSplat(N->getOperand(2));
13186 } else
13187 return SDValue();
13188
13189 if (!Clamp)
13190 return SDValue();
13191
// The clamp constant (INT8/16/32_MAX) determines element type and the
// expected shift amount of the saturating doubling-multiply-high.
13192 MVT ScalarType;
13193 int ShftAmt = 0;
13194 switch (Clamp->getSExtValue()) {
13195 case (1 << 7) - 1:
13196 ScalarType = MVT::i8;
13197 ShftAmt = 7;
13198 break;
13199 case (1 << 15) - 1:
13200 ScalarType = MVT::i16;
13201 ShftAmt = 15;
13202 break;
13203 case (1ULL << 31) - 1:
13204 ScalarType = MVT::i32;
13205 ShftAmt = 31;
13206 break;
13207 default:
13208 return SDValue();
13209 }
13210
13211 if (Shft.getOpcode() != ISD::SRA)
13212 return SDValue();
// N1 is the SRA's constant shift amount (its match is on a line not shown
// in this view — confirm against full source).
13214 if (!N1 || N1->getSExtValue() != ShftAmt)
13215 return SDValue();
13216
13217 SDValue Mul = Shft.getOperand(0);
13218 if (Mul.getOpcode() != ISD::MUL)
13219 return SDValue();
13220
13221 SDValue Ext0 = Mul.getOperand(0);
13222 SDValue Ext1 = Mul.getOperand(1);
13223 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13224 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13225 return SDValue();
13226 EVT VecVT = Ext0.getOperand(0).getValueType();
13227 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13228 return SDValue();
// Both sources must share the pre-extension type matching the clamp, and
// the wide type must actually have headroom for the doubling multiply.
13229 if (Ext1.getOperand(0).getValueType() != VecVT ||
13230 VecVT.getScalarType() != ScalarType ||
13231 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13232 return SDValue();
13233
13234 SDLoc DL(Mul);
13235 unsigned LegalLanes = 128 / (ShftAmt + 1);
13236 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13237 // For types smaller than legal vectors extend to be legal and only use needed
13238 // lanes.
13239 if (VecVT.getSizeInBits() < 128) {
// ExtVecVT's construction line is partially dropped from this view.
13240 EVT ExtVecVT =
13242 VecVT.getVectorNumElements());
13243 SDValue Inp0 =
13244 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13245 SDValue Inp1 =
13246 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13247 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13248 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13249 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13250 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13251 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13252 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13253 }
13254
13255 // For larger types, split into legal sized chunks.
13256 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13257 unsigned NumParts = VecVT.getSizeInBits() / 128;
// Parts collects one VQDMULH per 128-bit chunk (its declaration is on a
// line not shown in this view).
13259 for (unsigned I = 0; I < NumParts; ++I) {
13260 SDValue Inp0 =
13261 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13262 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13263 SDValue Inp1 =
13264 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13265 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13266 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13267 Parts.push_back(VQDMULH);
13268 }
13269 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13270 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13271}
13272
// PerformVSELECTCombine (MVE only): first try the VQDMULH pattern, then
// rewrite vselect(xor(cond, 1), lhs, rhs) -> vselect(cond, rhs, lhs) so a
// VPNOT+VPSEL pair becomes a single VPSEL.
// NOTE(review): the signature line is not visible in this view; the node is
// referred to below as N — confirm against full source.
13276 if (!Subtarget->hasMVEIntegerOps())
13277 return SDValue();
13278
13279 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13280 return V;
13281
13282 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13283 //
13284 // We need to re-implement this optimization here as the implementation in the
13285 // Target-Independent DAGCombiner does not handle the kind of constant we make
13286 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13287 // good reason, allowing truncation there would break other targets).
13288 //
13289 // Currently, this is only done for MVE, as it's the only target that benefits
13290 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13291 if (N->getOperand(0).getOpcode() != ISD::XOR)
13292 return SDValue();
13293 SDValue XOR = N->getOperand(0);
13294
13295 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13296 // It is important to check with truncation allowed as the BUILD_VECTORs we
13297 // generate in those situations will truncate their operands.
13298 ConstantSDNode *Const =
13299 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13300 /*AllowTruncation*/ true);
13301 if (!Const || !Const->isOne())
13302 return SDValue();
13303
13304 // Rewrite into vselect(cond, rhs, lhs).
13305 SDValue Cond = XOR->getOperand(0);
13306 SDValue LHS = N->getOperand(1);
13307 SDValue RHS = N->getOperand(2);
13308 EVT Type = N->getValueType(0);
13309 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13310}
13311
13312// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
// MVE only. A "compare an index vector against a splatted bound" produces a
// leading-lanes-true predicate, which is exactly what the VCTP intrinsic
// computes; the intrinsic number is picked by element count.
// NOTE(review): the signature line is not visible in this view; the node is
// referred to below as N — confirm against full source.
13316 SDValue Op0 = N->getOperand(0);
13317 SDValue Op1 = N->getOperand(1);
13318 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13319 EVT VT = N->getValueType(0);
13320
// (The second half of this subtarget/type guard is on a line not shown in
// this view — confirm against full source.)
13321 if (!Subtarget->hasMVEIntegerOps() ||
13323 return SDValue();
13324
// Canonicalise uge with swapped operands down to the ult form.
13325 if (CC == ISD::SETUGE) {
13326 std::swap(Op0, Op1);
13327 CC = ISD::SETULT;
13328 }
13329
13330 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13332 return SDValue();
13333
13334 // Check first operand is BuildVector of 0,1,2,...
13335 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13336 if (!Op0.getOperand(I).isUndef() &&
13338 Op0.getConstantOperandVal(I) == I))
13339 return SDValue();
13340 }
13341
13342 // The second is a Splat of Op1S
13343 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13344 if (!Op1S)
13345 return SDValue();
13346
13347 unsigned Opc;
13348 switch (VT.getVectorNumElements()) {
13349 case 2:
13350 Opc = Intrinsic::arm_mve_vctp64;
13351 break;
13352 case 4:
13353 Opc = Intrinsic::arm_mve_vctp32;
13354 break;
13355 case 8:
13356 Opc = Intrinsic::arm_mve_vctp16;
13357 break;
13358 case 16:
13359 Opc = Intrinsic::arm_mve_vctp8;
13360 break;
13361 default:
13362 return SDValue();
13363 }
13364
13365 SDLoc DL(N);
13366 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13367 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13368 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13369}
13370
13371/// PerformADDECombine - Target-specific dag combine transform from
13372/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13373/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
// NOTE(review): the signature line is not visible in this view; the node is
// referred to below as N — confirm against full source.
13377 // Only ARM and Thumb2 support UMLAL/SMLAL.
13378 if (Subtarget->isThumb1Only())
13379 return PerformAddeSubeCombine(N, DCI, Subtarget);
13380
13381 // Only perform the checks after legalize when the pattern is available.
13382 if (DCI.isBeforeLegalize()) return SDValue();
13383
13384 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13385}
13386
13387/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13388/// operands N0 and N1. This is a helper for PerformADDCombine that is
13389/// called with the default operands, and if that fails, with commuted
13390/// operands.
// NOTE(review): the signature lines are not visible in this view; N, N0, N1
// and DCI are referenced below — confirm against full source.
13394 // Attempt to create vpadd for this add.
13395 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13396 return Result;
13397
13398 // Attempt to create vpaddl for this add.
13399 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13400 return Result;
13401 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13402 Subtarget))
13403 return Result;
13404
13405 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
// Only when the select has a single use, so the fold cannot duplicate it.
13406 if (N0.getNode()->hasOneUse())
13407 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13408 return Result;
13409 return SDValue();
13410}
13411
// Reassociates i32 adds involving vector reductions (VECREDUCE_ADD /
// VADDV / VMLAV) so that reductions pair directly with a scalar addend
// (enabling the accumulating VADDVA forms) and so that reductions of loads
// are emitted in ascending address order (friendlier to prefetchers).
// NOTE(review): the function's signature line is not visible in this view
// (the body reads N and DAG); presumably this is TryDistrubutionADDVecReduce
// — confirm against the full source.
13413 EVT VT = N->getValueType(0);
13414 SDValue N0 = N->getOperand(0);
13415 SDValue N1 = N->getOperand(1);
13416 SDLoc dl(N);
13417
// Predicate: is this one of the across-vector reduction nodes we handle?
13418 auto IsVecReduce = [](SDValue Op) {
13419 switch (Op.getOpcode()) {
13420 case ISD::VECREDUCE_ADD:
13421 case ARMISD::VADDVs:
13422 case ARMISD::VADDVu:
13423 case ARMISD::VMLAVs:
13424 case ARMISD::VMLAVu:
13425 return true;
13426 }
13427 return false;
13428 };
13429
13430 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13431 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13432 // add(add(X, vecreduce(Y)), vecreduce(Z))
13433 // to make better use of vaddva style instructions.
13434 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13435 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13436 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13437 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13438 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13439 }
13440 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13441 // add(add(add(A, C), reduce(B)), reduce(D))
13442 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13443 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
// Locate which operand of each inner add is the reduction.
13444 unsigned N0RedOp = 0;
13445 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13446 N0RedOp = 1;
13447 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13448 return SDValue();
13449 }
13450
13451 unsigned N1RedOp = 0;
13452 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13453 N1RedOp = 1;
13454 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13455 return SDValue();
13456
13457 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13458 N1.getOperand(1 - N1RedOp));
13459 SDValue Add1 =
13460 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13461 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13462 }
13463 return SDValue();
13464 };
// Try both operand orders, since ADD is commutative.
13465 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13466 return R;
13467 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13468 return R;
13469
13470 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13471 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13472 // by ascending load offsets. This can help cores prefetch if the order of
13473 // loads is more predictable.
13474 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13475 // Check if two reductions are known to load data where one is before/after
13476 // another. Return negative if N0 loads data before N1, positive if N1 is
13477 // before N0 and 0 otherwise if nothing is known.
13478 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13479 // Look through to the first operand of a MUL, for the VMLA case.
13480 // Currently only looks at the first operand, in the hope they are equal.
13481 if (N0.getOpcode() == ISD::MUL)
13482 N0 = N0.getOperand(0);
13483 if (N1.getOpcode() == ISD::MUL)
13484 N1 = N1.getOperand(0);
13485
13486 // Return true if the two operands are loads to the same object and the
13487 // offset of the first is known to be less than the offset of the second.
13488 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13489 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13490 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13491 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13492 Load1->isIndexed())
13493 return 0;
13494
// Decompose both addresses into base + offset to compare positions.
13495 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13496 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13497
13498 if (!BaseLocDecomp0.getBase() ||
13499 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13500 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13501 return 0;
13502 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13503 return -1;
13504 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13505 return 1;
13506 return 0;
13507 };
13508
13509 SDValue X;
13510 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13511 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13512 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13513 N0.getOperand(1).getOperand(0));
13514 if (IsBefore < 0) {
13515 X = N0.getOperand(0);
13516 N0 = N0.getOperand(1);
13517 } else if (IsBefore > 0) {
13518 X = N0.getOperand(1);
13519 N0 = N0.getOperand(0);
13520 } else
13521 return SDValue();
13522 } else if (IsVecReduce(N0.getOperand(0))) {
13523 X = N0.getOperand(1);
13524 N0 = N0.getOperand(0);
13525 } else if (IsVecReduce(N0.getOperand(1))) {
13526 X = N0.getOperand(0);
13527 N0 = N0.getOperand(1);
13528 } else
13529 return SDValue();
13530 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13531 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13532 // Note this is backward to how you would expect. We create
13533 // add(reduce(load + 16), reduce(load + 0)) so that the
13534 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13535 // the X as VADDV(load + 0)
13536 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13537 } else
13538 return SDValue();
13539
13540 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13541 return SDValue();
13542
13543 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13544 return SDValue();
13545
13546 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13547 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13548 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13549 };
13550 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13551 return R;
13552 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13553 return R;
13554 return SDValue();
13555}
13556
// Fold an i64 add of an MVE long reduction (VADDLV / VMLALV, possibly
// predicated) into the accumulating form (VADDLVA / VMLALVA), so the scalar
// addend becomes the reduction's accumulator input.
// NOTE(review): the opening signature line of this function is not visible in
// this chunk — confirm name/parameters against the full source.
                                   const ARMSubtarget *Subtarget) {
  // These accumulating reductions only exist with MVE integer support.
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  // NOTE(review): the guarding `if` for this early return (a preceding
  // distribution combine) is not visible in this chunk.
    return R;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc dl(N);

  // Only i64 adds can come from a BUILD_PAIR of a long reduction's two
  // i32 halves.
  if (VT != MVT::i64)
    return SDValue();

  // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
  // will look like:
  //   t1: i32,i32 = ARMISD::VADDLVs x
  //   t2: i64 = build_pair t1, t1:1
  //   t3: i64 = add t2, y
  // Otherwise we try to push the add up above VADDLVAx, to potentially allow
  // the add to be simplified separately.
  // We also need to check for sext / zext and commutative adds.
  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
                           SDValue NB) {
    if (NB->getOpcode() != ISD::BUILD_PAIR)
      return SDValue();
    SDValue VecRed = NB->getOperand(0);
    // Both halves of the BUILD_PAIR must be the two results of the same
    // reduction node (low half = result 0, high half = result 1).
    if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
        VecRed.getResNo() != 0 ||
        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
      return SDValue();

    if (VecRed->getOpcode() == OpcodeA) {
      // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
      // Re-pair the existing accumulator and add NA into it first.
      SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                VecRed.getOperand(0), VecRed.getOperand(1));
      NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
    }

    // Split the i64 addend into the lo/hi i32 accumulator operands.
    // NOTE(review): the declaration of `Ops` is not visible in this chunk.
    std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);

    // Copy the reduction's remaining operands, skipping the old accumulator
    // pair when the node was already an accumulating variant.
    unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
    for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
      Ops.push_back(VecRed->getOperand(I));
    SDValue Red =
        DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                       SDValue(Red.getNode(), 1));
  };

  // Try every reduction flavour (signed/unsigned, predicated "p" forms,
  // VADDLV and VMLALV) with both operand orders of the commutative add.
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
    return M;
  return SDValue();
}
13644
bool
// Target hook: decide whether DAGCombiner may commute a shift with the
// binop feeding it, i.e. (shift (binop X, C1), C2) -> (binop (shift X, C2),
// C1 shifted). Returns true to allow the transform.
// NOTE(review): the line naming this member function is not visible in this
// chunk; the body matches ARMTargetLowering::isDesirableToCommuteWithShift.
                                                CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  // Only commute when the shifted value has a single use; otherwise the
  // binop would have to be duplicated.
  SDValue ShiftLHS = N->getOperand(0);
  if (!ShiftLHS->hasOneUse())
    return false;

  // Likewise for the value under a sign-extend.
  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
      !ShiftLHS.getOperand(0)->hasOneUse())
    return false;

  // Before type legalization the transform is always fine.
  if (Level == BeforeLegalizeTypes)
    return true;

  // Non-SHL shifts are not affected by the concerns below.
  if (N->getOpcode() != ISD::SHL)
    return true;

  if (Subtarget->isThumb1Only()) {
    // Avoid making expensive immediates by commuting shifts. (This logic
    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
    // for free.)
    if (N->getOpcode() != ISD::SHL)
      return true;
    SDValue N1 = N->getOperand(0);
    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
      return true;
    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
      // Small unsigned immediates (and small negative ADD immediates, which
      // become SUBs) are cheap in Thumb1 — keep them unshifted.
      if (Const->getAPIntValue().ult(256))
        return false;
      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
          Const->getAPIntValue().sgt(-256))
        return false;
    }
    return true;
  }

  // Turn off commute-with-shift transform after legalization, so it doesn't
  // conflict with PerformSHLSimplify. (We could try to detect when
  // PerformSHLSimplify would trigger more precisely, but it isn't
  // really necessary.)
  return false;
}
13692
// Target hook: allow commuting xor-of-shift, (xor (shift X, C2), C1), only
// when the xor constant C1 is exactly the mask of bits the shift can write —
// i.e. the NOT really is a "hidden" shifted mask.
// NOTE(review): the line naming this member function is not visible in this
// chunk; the body matches ARMTargetLowering::isDesirableToCommuteXorWithShift.
                                            const SDNode *N) const {
  assert(N->getOpcode() == ISD::XOR &&
         (N->getOperand(0).getOpcode() == ISD::SHL ||
          N->getOperand(0).getOpcode() == ISD::SRL) &&
         "Expected XOR(SHIFT) pattern");

  // Only commute if the entire NOT mask is a hidden shifted mask.
  auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
  if (XorC && ShiftC) {
    unsigned MaskIdx, MaskLen;
    if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
      unsigned ShiftAmt = ShiftC->getZExtValue();
      unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
      // SHL writes the top BitWidth-ShiftAmt bits starting at ShiftAmt;
      // SRL writes the bottom BitWidth-ShiftAmt bits starting at 0.
      if (N->getOperand(0).getOpcode() == ISD::SHL)
        return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
      return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
    }
  }

  return false;
}
13716
// Target hook: decide whether (srl (shl X, C), C) / (shl (srl X, C), C)
// should be folded into an AND with a mask constant.
// NOTE(review): the line naming this member function is not visible in this
// chunk; the body matches ARMTargetLowering::shouldFoldConstantShiftPairToMask.
                                          const SDNode *N) const {
  assert(((N->getOpcode() == ISD::SHL &&
           N->getOperand(0).getOpcode() == ISD::SRL) ||
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");

  // Outside Thumb1, mask immediates are cheap — prefer the AND form.
  if (!Subtarget->isThumb1Only())
    return true;

  // Wider-than-32-bit values are split anyway; allow the fold there too.
  EVT VT = N->getValueType(0);
  if (VT.getScalarSizeInBits() > 32)
    return true;

  // On Thumb1 a 32-bit mask constant may be expensive; keep the shift pair.
  return false;
}
13734
// Target hook: allow folding (vselect C, (binop X, Y), X) style patterns
// with an identity constant, but only for MVE-legal types and VSELECT.
// NOTE(review): the line naming this member function is not visible in this
// chunk; the body matches
// ARMTargetLowering::shouldFoldSelectWithIdentityConstant.
    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
    SDValue Y) const {
  return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
         SelectOpcode == ISD::VSELECT;
}
13741
// NOTE(review): the signature line of this target hook is not visible in this
// chunk — confirm its name against the full source. The visible body answers
// a per-type query: without NEON it accepts everything except >32-bit scalars
// on Thumb1; with NEON it accepts only scalar integer types.
  if (!Subtarget->hasNEON()) {
    if (Subtarget->isThumb1Only())
      return VT.getScalarSizeInBits() <= 32;
    return true;
  }
  // With NEON available, restrict to scalar integers.
  return VT.isScalarInteger();
}
13750
// Target hook: whether a saturating FP->int conversion for the given FP type
// is profitable/legal, gated on the relevant FP subtarget feature.
// NOTE(review): the signature line is not visible in this chunk — the body is
// consistent with ARMTargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
// confirm against the full source.
                                            EVT VT) const {
  if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
    return false;

  switch (FPVT.getSimpleVT().SimpleTy) {
  case MVT::f16:
    return Subtarget->hasVFP2Base();
  case MVT::f32:
    return Subtarget->hasVFP2Base();
  case MVT::f64:
    return Subtarget->hasFP64();
  case MVT::v4f32:
  case MVT::v8f16:
    // Vector forms need the MVE floating-point extension.
    return Subtarget->hasMVEFloatOps();
  default:
    return false;
  }
}
13770
// Unfold (binop (shl X, C2), C1<<C2) back to (shl (binop X, C1), C2) when
// that avoids materializing a large immediate and every user can absorb the
// shift into its own operand.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                   const ARMSubtarget *ST) {
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
  // Other code patterns that can also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will be needed. So, unfold back to the original pattern if:
  // - c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform a shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto *U : N->users()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  // Only these binops can absorb the unfolded form.
  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  // C1ShlC2 is the already-shifted constant (c1 << c2); C2 is the shift amt.
  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();
  unsigned C2Width = C2Int.getBitWidth();
  // An out-of-range shift amount would make the lshr below undefined.
  if (C2Int.uge(C2Width))
    return SDValue();
  uint64_t C2Value = C2Int.getZExtValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}
13875
13876
13877/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13878///
// Top-level ISD::ADD combine: tries SHL simplification, MVE reduction
// folding, then the operand-specific ADD combine in both operand orders.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Only works one way, because it needs an immediate operand.
  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  // Fold adds of MVE long reductions into their accumulating forms.
  if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
    return Result;

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}
13899
13900// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
13901// providing -X is as cheap as X (currently, just a constant).
// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC), as documented
// above: valid when negating X is as cheap as X itself.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
  // Only i32, and only an actual negation (sub from zero).
  if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
    return SDValue();
  SDValue CSINC = N->getOperand(1);
  if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
    return SDValue();

  // NOTE(review): the declaration of `X` (a constant-operand check on the
  // CSINC input) is not visible in this chunk.
  if (!X)
    return SDValue();

  // csinv(-X, Y, CC): negate the true-value, keep the rest of the CSINC.
  return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
                     DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
                                 CSINC.getOperand(0)),
                     CSINC.getOperand(1), CSINC.getOperand(2),
                     CSINC.getOperand(3));
}
13919
// True if Op is an integer negation, i.e. (sub 0, X).
// NOTE(review): the signature line is not visible in this chunk.
  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
}
13923
13924// Try to fold
13925//
13926// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
13927//
13928// The folding helps cmov to be matched with csneg without generating
13929// redundant neg instruction.
// Implements the fold documented above: (neg (cmov X, Y)) ->
// (cmov (neg X), (neg Y)), to help CMOV match csneg.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
  // Only fire on an actual (sub 0, ...) of this node.
  if (!isNegatedInteger(SDValue(N, 0)))
    return SDValue();

  SDValue CMov = N->getOperand(1);
  if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
    return SDValue();

  SDValue N0 = CMov.getOperand(0);
  SDValue N1 = CMov.getOperand(1);

  // If neither of them are negations, it's not worth the folding as it
  // introduces two additional negations while reducing one negation.
  if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
    return SDValue();

  SDLoc DL(N);
  EVT VT = CMov.getValueType();

  // Negate both arms; the condition/flag operands are carried over as-is.
  SDValue N0N = DAG.getNegative(N0, DL, VT);
  SDValue N1N = DAG.getNegative(N1, DL, VT);
  return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
                     CMov.getOperand(3));
}
13954
13955/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
13956///
// Target-specific ISD::SUB combines: select folding, CSINC/CMOV negation
// folds, and an MVE-only VMOV/VDUP rewrite.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
  if (N1.getNode()->hasOneUse())
    if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
      return Result;

  // (sub 0, (csinc ...)) -> (csinv ...)
  if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
    return R;

  // (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
  if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
    return Val;

  // The remaining fold is MVE-vector only.
  if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
    return SDValue();

  // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
  // so that we can readily pattern match more mve instructions which can use
  // a scalar operand.
  SDValue VDup = N->getOperand(1);
  if (VDup->getOpcode() != ARMISD::VDUP)
    return SDValue();

  SDValue VMov = N->getOperand(0);
  // Look through a bitcast to find the zero VMOVIMM.
  if (VMov->getOpcode() == ISD::BITCAST)
    VMov = VMov->getOperand(0);

  if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
    return SDValue();

  SDLoc dl(N);
  // Negate the scalar, then duplicate the result.
  SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
                                   DCI.DAG.getConstant(0, dl, MVT::i32),
                                   VDup->getOperand(0));
  return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
}
13997
13998/// PerformVMULCombine
13999/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14000/// special multiplier accumulator forwarding.
14001/// vmul d3, d0, d2
14002/// vmla d3, d1, d2
14003/// is faster than
14004/// vadd d3, d0, d1
14005/// vmul d3, d3, d2
14006// However, for (A + B) * (A + B),
14007// vadd d2, d0, d1
14008// vmul d3, d0, d2
14009// vmla d3, d1, d2
14010// is slower than
14011// vadd d2, d0, d1
14012// vmul d3, d2, d2
// Implements the distribution described in the comment above: rewrite
// (A +/- B) * C into (A * C) +/- (B * C) to exploit VMLx forwarding.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                  const ARMSubtarget *Subtarget) {
  // Only profitable when the core forwards multiply-accumulate results.
  if (!Subtarget->hasVMLxForwarding())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  unsigned Opcode = N0.getOpcode();
  // Find the add/sub operand; swap so it ends up in N0.
  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
    Opcode = N1.getOpcode();
    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
        Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return SDValue();
    std::swap(N0, N1);
  }

  // (A + B) * (A + B) is better left as a single multiply (see above).
  if (N0 == N1)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  SDValue N00 = N0->getOperand(0);
  SDValue N01 = N0->getOperand(1);
  // Distribute: (N00 op N01) * N1 -> (N00 * N1) op (N01 * N1).
  return DAG.getNode(Opcode, DL, VT,
                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}
14043
// Match a v2i64 multiply of sign/zero-extended v4i32 halves and turn it into
// an MVE VMULLs/VMULLu node.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                      const ARMSubtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Recognise a lane-wise sign-extension from 32 bits; returns the input.
  auto IsSignExt = [&](SDValue Op) {
    if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
      return SDValue();
    EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
    if (VT.getScalarSizeInBits() == 32)
      return Op->getOperand(0);
    return SDValue();
  };
  auto IsZeroExt = [&](SDValue Op) {
    // Zero extends are a little more awkward. At the point we are matching
    // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
    // That might be before or after a bitcast depending on how the and is
    // placed. Because this has to look through bitcasts, it is currently only
    // supported on LE.
    if (!Subtarget->isLittle())
      return SDValue();

    SDValue And = Op;
    if (And->getOpcode() == ISD::BITCAST)
      And = And->getOperand(0);
    if (And->getOpcode() != ISD::AND)
      return SDValue();
    SDValue Mask = And->getOperand(1);
    if (Mask->getOpcode() == ISD::BITCAST)
      Mask = Mask->getOperand(0);

    // The mask must be the constant v4i32 (-1, 0, -1, 0), i.e. it clears the
    // odd (high) 32-bit lanes of each 64-bit element.
    if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
        Mask.getValueType() != MVT::v4i32)
      return SDValue();
    if (isAllOnesConstant(Mask->getOperand(0)) &&
        isNullConstant(Mask->getOperand(1)) &&
        isAllOnesConstant(Mask->getOperand(2)) &&
        isNullConstant(Mask->getOperand(3)))
      return And->getOperand(0);
    return SDValue();
  };

  SDLoc dl(N);
  // Both operands must be extended the same way (both signed or both
  // unsigned) to form a single long multiply.
  if (SDValue Op0 = IsSignExt(N0)) {
    if (SDValue Op1 = IsSignExt(N1)) {
      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
      return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
    }
  }
  if (SDValue Op0 = IsZeroExt(N0)) {
    if (SDValue Op1 = IsZeroExt(N1)) {
      SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
      SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
      return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
    }
  }

  return SDValue();
}
14108
// ISD::MUL combine: MVE VMULL matching, vector distribution, and scalar
// multiply-by-constant strength reduction into shift/add/sub sequences.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                 const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
    return PerformMVEVMULLCombine(N, DAG, Subtarget);

  // Thumb1 has no free shifts, so the rewrites below don't pay off.
  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  int64_t MulAmt = C->getSExtValue();
  // Peel off trailing zeros: mul by (K << S) is mul by K then shl S.
  unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);

  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);
    } else
      return SDValue();
  }

  // Re-apply the peeled-off power-of-two factor.
  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}
14194
// Thumb1 helper: rewrite (and (shl/srl X, C2), C1) as a pair of shifts (or a
// cheaper AND + shift) to avoid materializing an expensive mask constant.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                const ARMSubtarget *Subtarget) {
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!N1C)
    return SDValue();

  uint32_t C1 = (uint32_t)N1C->getZExtValue();
  // Don't transform uxtb/uxth.
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  SDNode *N0 = N->getOperand(0).getNode();
  if (!N0->hasOneUse())
    return SDValue();

  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
    return SDValue();

  bool LeftShift = N0->getOpcode() == ISD::SHL;

  // NOTE(review): the declaration of `N01C` (constant shift amount of N0) is
  // not visible in this chunk.
  if (!N01C)
    return SDValue();

  uint32_t C2 = (uint32_t)N01C->getZExtValue();
  if (!C2 || C2 >= 32)
    return SDValue();

  // Clear irrelevant bits in the mask.
  if (LeftShift)
    C1 &= (-1U << C2);
  else
    C1 &= (-1U >> C2);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  // transform to a pair of shifts, to save materializing c1.

  // First pattern: right shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (!LeftShift && isMask_32(C1)) {
    uint32_t C3 = llvm::countl_zero(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // First pattern, reversed: left shift, then mask off trailing bits.
  if (LeftShift && isMask_32(~C1)) {
    uint32_t C3 = llvm::countr_zero(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern: left shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (LeftShift && isShiftedMask_32(C1)) {
    uint32_t Trailing = llvm::countr_zero(C1);
    uint32_t C3 = llvm::countl_zero(C1);
    if (Trailing == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern, reversed: right shift, then mask off trailing bits.
  // FIXME: Handle other patterns of known/demanded bits.
  if (!LeftShift && isShiftedMask_32(C1)) {
    uint32_t Leading = llvm::countl_zero(C1);
    uint32_t C3 = llvm::countr_zero(C1);
    if (Leading == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
  // if "c1 >> c2" is a cheaper immediate than "c1"
  if (LeftShift &&
      HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {

    SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
                              DAG.getConstant(C1 >> C2, DL, MVT::i32));
    return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
                       DAG.getConstant(C2, DL, MVT::i32));
  }

  return SDValue();
}
14306
// ISD::AND combine: immediate-form VBIC for splat masks, select folding,
// SHL simplification, and the Thumb1 shift/mask rewrite.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                 const ARMSubtarget *Subtarget) {
  // Attempt to use immediate-form VBIC
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;

  // Skip illegal types and MVE predicate types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
      VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
    return SDValue();

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
        SplatBitSize == 64) {
      EVT VbicVT;
      // VBIC clears the bits set in its immediate, so encode ~SplatBits.
      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VbicVT, VT, OtherModImm);
      if (Val.getNode()) {
        SDValue Input =
            DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
        return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
      }
    }
  }

  if (!Subtarget->isThumb1Only()) {
    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
    if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
      return Result;

    if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
      return Result;
  }

  if (Subtarget->isThumb1Only())
    if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
      return Result;

  return SDValue();
}
14355
14356// Try combining OR nodes to SMULWB, SMULWT.
// Try combining OR nodes to SMULWB / SMULWT: match
// (or (srl (smul_lohi a, b), 16), (shl (smul_lohi a, b), 16)) where one
// multiply operand is (a sign-extension of) a 16-bit value.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                         const ARMSubtarget *Subtarget) {
  // SMULW[B|T] needs ARMv6+ DSP (or Thumb2 with DSP).
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  // OR is commutative — try the other operand order.
  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  // SRL must consume the low result and SHL the high result.
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
  // For SMULWB the 16-bit value will be sign extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    // Top-half multiply: use the value before the arithmetic shift.
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  return SDValue(OR, 0);
}
14416
// Try to turn an OR of masked values into an ARMISD::BFI (bit-field insert);
// the three matched shapes are documented in the case comments below.
// NOTE(review): the opening signature line is not visible in this chunk —
// confirm name/parameters against the full source.
                                     const ARMSubtarget *Subtarget) {
  // BFI is only available on V6T2+
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // 1) or (and A, mask), val => ARMbfi A, val, mask
  //    iff (val & mask) == val
  //
  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
  //          && mask == ~mask2
  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
  //          && ~mask == mask2
  //  (i.e., copy a bitfield value into another bitfield of the same width)

  if (VT != MVT::i32)
    return SDValue();

  SDValue N00 = N0.getOperand(0);

  // The value and the mask need to be constants so we can verify this is
  // actually a bitfield set. If the mask is 0xffff, we can do better
  // via a movt instruction, so don't use BFI in that case.
  SDValue MaskOp = N0.getOperand(1);
  // NOTE(review): the declaration of `MaskC` (constant cast of MaskOp) is
  // not visible in this chunk.
  if (!MaskC)
    return SDValue();
  unsigned Mask = MaskC->getZExtValue();
  if (Mask == 0xffff)
    return SDValue();
  SDValue Res;
  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
  // NOTE(review): the declaration of `N1C` (constant cast of N1) is not
  // visible in this chunk.
  if (N1C) {
    unsigned Val = N1C->getZExtValue();
    // The inserted value must lie entirely outside the preserved mask.
    if ((Val & ~Mask) != Val)
      return SDValue();

    if (ARM::isBitFieldInvertedMask(Mask)) {
      // Shift the value down to the insert position's LSB.
      Val >>= llvm::countr_zero(~Mask);

      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                        DAG.getConstant(Val, DL, MVT::i32),
                        DAG.getConstant(Mask, DL, MVT::i32));

      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  } else if (N1.getOpcode() == ISD::AND) {
    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
    // NOTE(review): the declaration of `N11C` (constant cast of N1's mask
    // operand) is not visible in this chunk.
    if (!N11C)
      return SDValue();
    unsigned Mask2 = N11C->getZExtValue();

    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
    // as is to match.
    if (ARM::isBitFieldInvertedMask(Mask) &&
        (Mask == ~Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask == 0xffff || Mask == 0xffff0000))
        return SDValue();
      // 2a
      unsigned amt = llvm::countr_zero(Mask2);
      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                        DAG.getConstant(amt, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                        DAG.getConstant(Mask, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
               (~Mask == Mask2)) {
      // The pack halfword instruction works better for masks that fit it,
      // so use that when it's available.
      if (Subtarget->hasDSP() &&
          (Mask2 == 0xffff || Mask2 == 0xffff0000))
        return SDValue();
      // 2b
      unsigned lsb = llvm::countr_zero(Mask);
      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                        DAG.getConstant(lsb, DL, MVT::i32));
      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                        DAG.getConstant(Mask2, DL, MVT::i32));
      DCI.CombineTo(N, Res, false);
      // Return value from the original node to inform the combiner that N is
      // now dead.
      return SDValue(N, 0);
    }
  }

  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
      // NOTE(review): the final clause of this condition is not visible in
      // this chunk.
    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
    // where lsb(mask) == #shamt and masked bits of B are known zero.
    SDValue ShAmt = N00.getOperand(1);
    unsigned ShAmtC = ShAmt->getAsZExtVal();
    unsigned LSB = llvm::countr_zero(Mask);
    if (ShAmtC != LSB)
      return SDValue();

    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                      DAG.getConstant(~Mask, DL, MVT::i32));

    DCI.CombineTo(N, Res, false);
    // Return value from the original node to inform the combiner that N is
    // now dead.
    return SDValue(N, 0);
  }

  return SDValue();
}
14542
14543static bool isValidMVECond(unsigned CC, bool IsFloat) {
14544 switch (CC) {
14545 case ARMCC::EQ:
14546 case ARMCC::NE:
14547 case ARMCC::LE:
14548 case ARMCC::GT:
14549 case ARMCC::GE:
14550 case ARMCC::LT:
14551 return true;
14552 case ARMCC::HS:
14553 case ARMCC::HI:
14554 return !IsFloat;
14555 default:
14556 return false;
14557 };
14558}
14559
// Extract the condition-code operand from a VCMP (operand 2) or VCMPZ
// (operand 1) node; any other opcode is a programming error.
// NOTE(review): the signature line (original 14560) was lost in extraction —
// confirm against upstream ARMISelLowering.cpp.
14561 if (N->getOpcode() == ARMISD::VCMP)
14562 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14563 else if (N->getOpcode() == ARMISD::VCMPZ)
14564 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14565 else
14566 llvm_unreachable("Not a VCMP/VCMPZ!");
14567 }
14568
// NOTE(review): the signature and the line computing CC (originals
// 14569-14570) were lost in extraction. Judging by the callers, CC is
// presumably the opposite of the node's condition; this then checks the
// opposite condition is still MVE-encodable for the operand's element type
// — confirm against upstream ARMISelLowering.cpp.
14571 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14572 }
14573
// NOTE(review): the first signature line (original 14574,
// PerformORCombine_i1) was lost in extraction; code below kept byte-identical.
14575 const ARMSubtarget *Subtarget) {
14576 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14577 // together with predicates
14578 EVT VT = N->getValueType(0);
14579 SDLoc DL(N);
14580 SDValue N0 = N->getOperand(0);
14581 SDValue N1 = N->getOperand(1);
14582
// "Freely invertable" = a VCMP/VCMPZ whose opposite condition is still
// MVE-encodable, so the logical NOT folds into the compare itself.
14583 auto IsFreelyInvertable = [&](SDValue V) {
14584 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14585 return CanInvertMVEVCMP(V);
14586 return false;
14587 };
14588
14589 // At least one operand must be freely invertable.
14590 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14591 return SDValue();
14592
// De Morgan: or(A, B) == not(and(not A, not B)).
14593 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14594 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14595 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14596 return DAG.getLogicalNOT(DL, And, VT);
14597 }
14598
14599 // Try to form a NEON shift-{right, left}-and-insert (VSRI/VSLI) from:
14600 // (or (and X, splat (i32 C1)), (srl Y, splat (i32 C2))) -> VSRI X, Y, #C2
14601 // (or (and X, splat (i32 C1)), (shl Y, splat (i32 C2))) -> VSLI X, Y, #C2
14602 // where C1 is a mask that preserves the bits not written by the shift/insert,
14603 // i.e. `C1 == (1 << C2) - 1`.
// NOTE(review): `C1 == (1 << C2) - 1` only describes the VSLI case; the code
// below requires the HIGH C2 bits (getHighBitsSet) for the VSRI case.
// The first signature line (original 14604) was lost in extraction.
14605 SDValue ShiftOp, EVT VT,
14606 SDLoc dl) {
14607 // Match (and X, Mask)
14608 if (AndOp.getOpcode() != ISD::AND)
14609 return SDValue();
14610
14611 SDValue X = AndOp.getOperand(0);
14612 SDValue Mask = AndOp.getOperand(1);
14613
// Mask must be a constant splat (undefs/implicit truncation allowed).
14614 ConstantSDNode *MaskC = isConstOrConstSplat(Mask, false, true);
14615 if (!MaskC)
14616 return SDValue();
14617 APInt MaskBits =
14618 MaskC->getAPIntValue().trunc(Mask.getScalarValueSizeInBits());
14619
14620 // Match shift (srl/shl Y, CntVec)
14621 int64_t Cnt = 0;
14622 bool IsShiftRight = false;
14623 SDValue Y;
14624
14625 if (ShiftOp.getOpcode() == ARMISD::VSHRuIMM) {
14626 IsShiftRight = true;
14627 Y = ShiftOp.getOperand(0);
14628 Cnt = ShiftOp.getConstantOperandVal(1);
14629 } else if (ShiftOp.getOpcode() == ARMISD::VSHLIMM) {
14630 Y = ShiftOp.getOperand(0);
14631 Cnt = ShiftOp.getConstantOperandVal(1);
14632 } else {
14633 return SDValue();
14634 }
14635
// The AND must keep exactly the lanes' bits the insert does not write:
// the top Cnt bits for a right shift, the low Cnt bits for a left shift.
14636 unsigned ElemBits = VT.getScalarSizeInBits();
14637 APInt RequiredMask = IsShiftRight
14638 ? APInt::getHighBitsSet(ElemBits, (unsigned)Cnt)
14639 : APInt::getLowBitsSet(ElemBits, (unsigned)Cnt);
14640 if (MaskBits != RequiredMask)
14641 return SDValue();
14642
14643 unsigned Opc = IsShiftRight ? ARMISD::VSRIIMM : ARMISD::VSLIIMM;
14644 return DAG.getNode(Opc, dl, VT, X, Y, DAG.getConstant(Cnt, dl, MVT::i32));
14645 }
14646
14647 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
// NOTE(review): the signature line (original 14648), the tail of the VBSP
// guard (14711), and the BVN0/BVN1 definitions (14724-14725) were lost in
// extraction; remaining code kept byte-identical.
14649 const ARMSubtarget *Subtarget) {
14650 // Attempt to use immediate-form VORR
14651 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14652 SDLoc dl(N);
14653 EVT VT = N->getValueType(0);
14654 SelectionDAG &DAG = DCI.DAG;
14655
14656 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14657 return SDValue();
14658
// MVE predicate vectors get the invert-to-AND treatment instead.
14659 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14660 VT == MVT::v8i1 || VT == MVT::v16i1))
14661 return PerformORCombine_i1(N, DAG, Subtarget);
14662
14663 APInt SplatBits, SplatUndef;
14664 unsigned SplatBitSize;
14665 bool HasAnyUndefs;
14666 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14667 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14668 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14669 SplatBitSize == 64) {
14670 EVT VorrVT;
14671 SDValue Val =
14672 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14673 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm)
14674 if (Val.getNode()) {
14675 SDValue Input =
14676 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14677 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14678 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14679 }
14680 }
14681 }
14682
14683 if (!Subtarget->isThumb1Only()) {
14684 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14685 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14686 return Result;
14687 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14688 return Result;
14689 }
14690
14691 SDValue N0 = N->getOperand(0);
14692 SDValue N1 = N->getOperand(1);
14693
14694 // (or (and X, C1), (srl Y, C2)) -> VSRI X, Y, #C2
14695 // (or (and X, C1), (shl Y, C2)) -> VSLI X, Y, #C2
14696 if (VT.isVector() &&
14697 ((Subtarget->hasNEON() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) ||
14698 (Subtarget->hasMVEIntegerOps() &&
14699 (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32)))) {
// Try both operand orders, since OR is commutative.
14700 if (SDValue ShiftInsert =
14701 PerformORCombineToShiftInsert(DAG, N0, N1, VT, dl))
14702 return ShiftInsert;
14703
14704 if (SDValue ShiftInsert =
14705 PerformORCombineToShiftInsert(DAG, N1, N0, VT, dl))
14706 return ShiftInsert;
14707 }
14708
14709 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14710 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14712
14713 // The code below optimizes (or (and X, Y), Z).
14714 // The AND operand needs to have a single user to make these optimizations
14715 // profitable.
14716 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14717 return SDValue();
14718
14719 APInt SplatUndef;
14720 unsigned SplatBitSize;
14721 bool HasAnyUndefs;
14722
14723 APInt SplatBits0, SplatBits1;
14726 // Ensure that the second operand of both ands are constants
14727 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14728 HasAnyUndefs) && !HasAnyUndefs) {
14729 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14730 HasAnyUndefs) && !HasAnyUndefs) {
14731 // Ensure that the bit width of the constants are the same and that
14732 // the splat arguments are logical inverses as per the pattern we
14733 // are trying to simplify.
14734 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14735 SplatBits0 == ~SplatBits1) {
14736 // Canonicalize the vector type to make instruction selection
14737 // simpler.
14738 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14739 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14740 N0->getOperand(1),
14741 N0->getOperand(0),
14742 N1->getOperand(0));
14743 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14744 }
14745 }
14746 }
14747 }
14748
14749 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14750 // reasonable.
14751 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14752 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14753 return Res;
14754 }
14755
14756 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14757 return Result;
14758
14759 return SDValue();
14760 }
14761
// PerformXORCombine - target-specific combines for ISD::XOR.
// NOTE(review): the first signature line (original 14763) and two interior
// lines (14789: computing the inverted condition CC; 14791: declaring the
// operand SmallVector Ops) were lost in extraction.
14764 const ARMSubtarget *Subtarget) {
14765 EVT VT = N->getValueType(0);
14766 SelectionDAG &DAG = DCI.DAG;
14767
14768 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14769 return SDValue();
14770
14771 if (!Subtarget->isThumb1Only()) {
14772 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14773 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14774 return Result;
14775
14776 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14777 return Result;
14778 }
14779
14780 if (Subtarget->hasMVEIntegerOps()) {
14781 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14782 SDValue N0 = N->getOperand(0);
14783 SDValue N1 = N->getOperand(1);
14784 const TargetLowering *TLI = Subtarget->getTargetLowering();
14785 if (TLI->isConstTrueVal(N1) &&
14786 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14787 if (CanInvertMVEVCMP(N0)) {
14788 SDLoc DL(N0);
14790
// Rebuild the compare: same operands, condition replaced by CC.
14792 Ops.push_back(N0->getOperand(0));
14793 if (N0->getOpcode() == ARMISD::VCMP)
14794 Ops.push_back(N0->getOperand(1));
14795 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14796 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14797 }
14798 }
14799 }
14800
14801 return SDValue();
14802 }
14803
14804// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14805// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14806// their position in "to" (Rd).
14807static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14808 assert(N->getOpcode() == ARMISD::BFI);
14809
14810 SDValue From = N->getOperand(1);
14811 ToMask = ~N->getConstantOperandAPInt(2);
14812 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14813
14814 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14815 // #C in the base of the SHR.
14816 if (From->getOpcode() == ISD::SRL &&
14817 isa<ConstantSDNode>(From->getOperand(1))) {
14818 APInt Shift = From->getConstantOperandAPInt(1);
14819 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14820 FromMask <<= Shift.getLimitedValue(31);
14821 From = From->getOperand(0);
14822 }
14823
14824 return From;
14825}
14826
14827// If A and B contain one contiguous set of bits, does A | B == A . B?
14828//
14829// Neither A nor B must be zero.
14830static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14831 unsigned LastActiveBitInA = A.countr_zero();
14832 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14833 return LastActiveBitInA - 1 == FirstActiveBitInB;
14834}
14835
// FindBFIToCombineWith - look through N's "to" operand for another BFI that
// reads the same source and writes a non-overlapping, properly abutting bit
// range, and return it. NOTE(review): the signature line (original 14836)
// was lost in extraction.
14837 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14838 APInt ToMask, FromMask;
14839 SDValue From = ParseBFI(N, ToMask, FromMask);
14840 SDValue To = N->getOperand(0);
14841
14842 SDValue V = To;
14843 if (V.getOpcode() != ARMISD::BFI)
14844 return SDValue();
14845
// Both BFIs must insert from the same base value.
14846 APInt NewToMask, NewFromMask;
14847 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14848 if (NewFrom != From)
14849 return SDValue();
14850
14851 // Do the written bits conflict with any we've seen so far?
14852 if ((NewToMask & ToMask).getBoolValue())
14853 // Conflicting bits.
14854 return SDValue();
14855
14856 // Are the new bits contiguous when combined with the old bits?
14857 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14858 BitsProperlyConcatenate(FromMask, NewFromMask))
14859 return V;
14860 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14861 BitsProperlyConcatenate(NewFromMask, FromMask))
14862 return V;
14863
14864 return SDValue();
14865 }
14866
// PerformBFICombine - combines for ARMISD::BFI nodes.
// NOTE(review): the signature line (original 14867) and line 14874 (defining
// N11C, presumably dyn_cast<ConstantSDNode>(N1.getOperand(1))) were lost in
// extraction; code below kept byte-identical.
14868 SDValue N0 = N->getOperand(0);
14869 SDValue N1 = N->getOperand(1);
14870
14871 if (N1.getOpcode() == ISD::AND) {
14872 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14873 // the bits being cleared by the AND are not demanded by the BFI.
14875 if (!N11C)
14876 return SDValue();
// Operand 2 is the inverted insertion mask; recover its LSB and width.
14877 unsigned InvMask = N->getConstantOperandVal(2);
14878 unsigned LSB = llvm::countr_zero(~InvMask);
14879 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14880 assert(Width <
14881 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14882 "undefined behavior");
14883 unsigned Mask = (1u << Width) - 1;
14884 unsigned Mask2 = N11C->getZExtValue();
// The AND is redundant if it only clears bits the BFI never reads.
14885 if ((Mask & (~Mask2)) == 0)
14886 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14887 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14888 return SDValue();
14889 }
14890
14891 // Look for another BFI to combine with.
14892 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14893 // We've found a BFI.
14894 APInt ToMask1, FromMask1;
14895 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14896
14897 APInt ToMask2, FromMask2;
14898 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14899 assert(From1 == From2);
14900 (void)From2;
14901
14902 // Create a new BFI, combining the two together.
14903 APInt NewFromMask = FromMask1 | FromMask2;
14904 APInt NewToMask = ToMask1 | ToMask2;
14905
14906 EVT VT = N->getValueType(0);
14907 SDLoc dl(N);
14908
// If the merged source bits don't start at bit 0, pre-shift the source so
// the single replacement BFI reads from bit 0.
14909 if (NewFromMask[0] == 0)
14910 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14911 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14912 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14913 DAG.getConstant(~NewToMask, dl, VT));
14914 }
14915
14916 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14917 // that lower bit insertions are performed first, providing that M1 and M2
14918 // do no overlap. This can allow multiple BFI instructions to be combined
14919 // together by the other folds above.
14920 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14921 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14922 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14923
14924 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14925 ToMask1.countl_zero() < ToMask2.countl_zero())
14926 return SDValue();
14927
14928 EVT VT = N->getValueType(0);
14929 SDLoc dl(N);
14930 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14931 N->getOperand(1), N->getOperand(2));
14932 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14933 N0.getOperand(2));
14934 }
14935
14936 return SDValue();
14937 }
14938
14939 // Check that N is CMPZ(CSINC(0, 0, CC, X)),
14940 // or CMPZ(CMOV(1, 0, CC, X))
14941 // return X if valid.
// NOTE(review): the signature line (original 14942) and the lines assigning
// the matched condition to the out-parameter (14958, 14963, 14968-14969,
// with the last case presumably inverting the CMOV condition) were lost in
// extraction — confirm against upstream ARMISelLowering.cpp.
14943 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
14944 return SDValue();
14945 SDValue CSInc = Cmp->getOperand(0);
14946
14947 // Ignore any `And 1` nodes that may not yet have been removed. We are
14948 // looking for a value that produces 1/0, so these have no effect on the
14949 // code.
14950 while (CSInc.getOpcode() == ISD::AND &&
14951 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
14952 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
14953 CSInc = CSInc.getOperand(0);
14954
14955 if (CSInc.getOpcode() == ARMISD::CSINC &&
14956 isNullConstant(CSInc.getOperand(0)) &&
14957 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14959 return CSInc.getOperand(3);
14960 }
14961 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
14962 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14964 return CSInc.getOperand(3);
14965 }
14966 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
14967 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
14970 return CSInc.getOperand(3);
14971 }
14972 return SDValue();
14973 }
14974
// NOTE(review): the signature line (original 14975) and line 14981 (the
// declaration of Cond, an ARMCC::CondCodes out-slot for IsCMPZCSINC) were
// lost in extraction.
14976 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
14977 // t92: flags = ARMISD::CMPZ t74, 0
14978 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
14979 // t96: flags = ARMISD::CMPZ t93, 0
14980 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
14982 if (SDValue C = IsCMPZCSINC(N, Cond))
14983 if (Cond == ARMCC::EQ)
14984 return C;
14985 return SDValue();
14986 }
14987
// NOTE(review): the signature line (original 14988), line 14993 (declaring
// Cond) and line 15003 (the condition argument for the NE case, presumably
// DAG.getConstant(ARMCC::getOppositeCondition(Cond), ...) plus C) were lost
// in extraction.
14989 // Fold away an unneccessary CMPZ/CSINC
14990 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
14991 // if C1==EQ -> CSXYZ A, B, C2, D
14992 // if C1==NE -> CSXYZ A, B, NOT(C2), D
14994 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
14995 if (N->getConstantOperandVal(2) == ARMCC::EQ)
14996 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
14997 N->getOperand(1),
14998 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
14999 if (N->getConstantOperandVal(2) == ARMCC::NE)
15000 return DAG.getNode(
15001 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15002 N->getOperand(1),
15004 }
15005 return SDValue();
15006 }
15007
15008 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15009 /// ARMISD::VMOVRRD.
// NOTE(review): the signature lines (originals 15010-15011) were lost in
// extraction; code below kept byte-identical.
15012 const ARMSubtarget *Subtarget) {
15013 // vmovrrd(vmovdrr x, y) -> x,y
15014 SDValue InDouble = N->getOperand(0);
15015 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15016 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15017
15018 // vmovrrd(load f64) -> (load i32), (load i32)
15019 SDNode *InNode = InDouble.getNode();
15020 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15021 InNode->getValueType(0) == MVT::f64 &&
15022 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15023 !cast<LoadSDNode>(InNode)->isVolatile()) {
15024 // TODO: Should this be done for non-FrameIndex operands?
15025 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15026
15027 SelectionDAG &DAG = DCI.DAG;
15028 SDLoc DL(LD);
15029 SDValue BasePtr = LD->getBasePtr();
// Split the f64 load into two i32 loads at offsets 0 and 4.
15030 SDValue NewLD1 =
15031 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15032 LD->getAlign(), LD->getMemOperand()->getFlags());
15033
15034 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15035 DAG.getConstant(4, DL, MVT::i32));
15036
15037 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15038 LD->getPointerInfo().getWithOffset(4),
15039 commonAlignment(LD->getAlign(), 4),
15040 LD->getMemOperand()->getFlags());
15041
15042 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
// Big-endian: the high/low register halves come from the opposite words.
15043 if (DCI.DAG.getDataLayout().isBigEndian())
15044 std::swap (NewLD1, NewLD2);
15045 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15046 return Result;
15047 }
15048
15049 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15050 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15051 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15052 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15053 SDValue BV = InDouble.getOperand(0);
15054 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15055 // change lane order under big endian.
15056 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15057 while (
15058 (BV.getOpcode() == ISD::BITCAST ||
15059 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15060 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15061 BVSwap = BV.getOpcode() == ISD::BITCAST;
15062 BV = BV.getOperand(0);
15063 }
15064 if (BV.getValueType() != MVT::v4i32)
15065 return SDValue();
15066
15067 // Handle buildvectors, pulling out the correct lane depending on
15068 // endianness.
15069 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15070 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15071 SDValue Op0 = BV.getOperand(Offset);
15072 SDValue Op1 = BV.getOperand(Offset + 1);
15073 if (!Subtarget->isLittle() && BVSwap)
15074 std::swap(Op0, Op1);
15075
15076 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15077 }
15078
15079 // A chain of insert_vectors, grabbing the correct value of the chain of
15080 // inserts.
15081 SDValue Op0, Op1;
15082 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15083 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15084 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15085 Op0 = BV.getOperand(1);
15086 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15087 Op1 = BV.getOperand(1);
15088 }
15089 BV = BV.getOperand(0);
15090 }
15091 if (!Subtarget->isLittle() && BVSwap)
15092 std::swap(Op0, Op1);
15093 if (Op0 && Op1)
15094 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15095 }
15096
15097 return SDValue();
15098 }
15099
15100 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15101 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
// NOTE(review): the signature line (original 15102) was lost in extraction.
15103 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15104 SDValue Op0 = N->getOperand(0);
15105 SDValue Op1 = N->getOperand(1);
// Look through bitcasts on both halves before matching.
15106 if (Op0.getOpcode() == ISD::BITCAST)
15107 Op0 = Op0.getOperand(0);
15108 if (Op1.getOpcode() == ISD::BITCAST)
15109 Op1 = Op1.getOperand(0);
// Both operands must be the two results of the same VMOVRRD, in order.
15110 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15111 Op0.getNode() == Op1.getNode() &&
15112 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15113 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15114 N->getValueType(0), Op0.getOperand(0));
15115 return SDValue();
15116 }
15117
// PerformVMOVhrCombine - combines for ARMISD::VMOVhr (i32 -> f16 move).
// NOTE(review): the signature lines (originals 15118-15119) and line 15143
// (the getNode call creating NewCopy, presumably ISD::CopyFromReg at
// SDLoc(Copy)) were lost in extraction.
15120 SDValue Op0 = N->getOperand(0);
15121
15122 // VMOVhr (VMOVrh (X)) -> X
15123 if (Op0->getOpcode() == ARMISD::VMOVrh)
15124 return Op0->getOperand(0);
15125
15126 // FullFP16: half values are passed in S-registers, and we don't
15127 // need any of the bitcast and moves:
15128 //
15129 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15130 // t5: i32 = bitcast t2
15131 // t18: f16 = ARMISD::VMOVhr t5
15132 // =>
15133 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15134 if (Op0->getOpcode() == ISD::BITCAST) {
15135 SDValue Copy = Op0->getOperand(0);
15136 if (Copy.getValueType() == MVT::f32 &&
15137 Copy->getOpcode() == ISD::CopyFromReg) {
// A CopyFromReg may carry an optional glue operand/result; preserve it.
15138 bool HasGlue = Copy->getNumOperands() == 3;
15139 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15140 HasGlue ? Copy->getOperand(2) : SDValue()};
15141 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15142 SDValue NewCopy =
15144 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15145 ArrayRef(Ops, HasGlue ? 3 : 2));
15146
15147 // Update Users, Chains, and Potential Glue.
15148 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15149 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15150 if (HasGlue)
15151 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15152 NewCopy.getValue(2));
15153
15154 return NewCopy;
15155 }
15156 }
15157
15158 // fold (VMOVhr (load x)) -> (load (f16*)x)
15159 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15160 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15161 LN0->getMemoryVT() == MVT::i16) {
15162 SDValue Load =
15163 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15164 LN0->getBasePtr(), LN0->getMemOperand());
15165 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15166 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15167 return Load;
15168 }
15169 }
15170
15171 // Only the bottom 16 bits of the source register are used.
15172 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15173 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15174 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15175 return SDValue(N, 0);
15176
15177 return SDValue();
15178 }
15179
// PerformVMOVrhCombine - combines for ARMISD::VMOVrh (f16 -> i32 move).
// NOTE(review): the signature line (original 15180), line 15185 (presumably
// dyn_cast<ConstantFPSDNode> of N0 into C) and line 15204 (presumably an
// isa<ConstantSDNode> check on the extract index) were lost in extraction.
15181 SDValue N0 = N->getOperand(0);
15182 EVT VT = N->getValueType(0);
15183
15184 // fold (VMOVrh (fpconst x)) -> const x
15186 APFloat V = C->getValueAPF();
15187 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15188 }
15189
15190 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15191 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15192 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15193
15194 SDValue Load =
15195 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15196 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15197 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15198 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15199 return Load;
15200 }
15201
15202 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15203 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15205 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15206 N0->getOperand(1));
15207
15208 return SDValue();
15209 }
15210
15211 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15212 /// are normal, non-volatile loads. If so, it is profitable to bitcast an
15213 /// i64 vector to have f64 elements, since the value can then be loaded
15214 /// directly into a VFP register.
// NOTE(review): the signature line (original 15215) was lost in extraction.
15216 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15217 for (unsigned i = 0; i < NumElts; ++i) {
15218 SDNode *Elt = N->getOperand(i).getNode();
// One qualifying element is enough to make the bitcast profitable.
15219 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15220 return true;
15221 }
15222 return false;
15223 }
15224
15225 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15226 /// ISD::BUILD_VECTOR.
// NOTE(review): the signature lines (originals 15227-15228) and line 15245
// (presumably declaring the SmallVector Ops used below) were lost in
// extraction.
15229 const ARMSubtarget *Subtarget) {
15230 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15231 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15232 // into a pair of GPRs, which is fine when the value is used as a scalar,
15233 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15234 SelectionDAG &DAG = DCI.DAG;
15235 if (N->getNumOperands() == 2)
15236 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15237 return RV;
15238
15239 // Load i64 elements as f64 values so that type legalization does not split
15240 // them up into i32 values.
15241 EVT VT = N->getValueType(0);
15242 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15243 return SDValue();
15244 SDLoc dl(N);
15246 unsigned NumElts = VT.getVectorNumElements();
15247 for (unsigned i = 0; i < NumElts; ++i) {
15248 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15249 Ops.push_back(V);
15250 // Make the DAGCombiner fold the bitcast.
15251 DCI.AddToWorklist(V.getNode());
15252 }
15253 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15254 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15255 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15256 }
15257
15258 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15259 static SDValue
// NOTE(review): the line naming this function (original 15260,
// PerformARMBUILD_VECTORCombine) was lost in extraction.
15261 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15262 // At that time, we may have inserted bitcasts from integer to float.
15263 // If these bitcasts have survived DAGCombine, change the lowering of this
15264 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15265 // force to use floating point types.
15266
15267 // Make sure we can change the type of the vector.
15268 // This is possible iff:
15269 // 1. The vector is only used in a bitcast to a integer type. I.e.,
15270 // 1.1. Vector is used only once.
15271 // 1.2. Use is a bit convert to an integer type.
15272 // 2. The size of its operands are 32-bits (64-bits are not legal).
15273 EVT VT = N->getValueType(0);
15274 EVT EltVT = VT.getVectorElementType();
15275
15276 // Check 1.1. and 2.
15277 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15278 return SDValue();
15279
15280 // By construction, the input type must be float.
15281 assert(EltVT == MVT::f32 && "Unexpected type!");
15282
15283 // Check 1.2.
15284 SDNode *Use = *N->user_begin();
15285 if (Use->getOpcode() != ISD::BITCAST ||
15286 Use->getValueType(0).isFloatingPoint())
15287 return SDValue();
15288
15289 // Check profitability.
15290 // Model is, if more than half of the relevant operands are bitcast from
15291 // i32, turn the build_vector into a sequence of insert_vector_elt.
15292 // Relevant operands are everything that is not statically
15293 // (i.e., at compile time) bitcasted.
15294 unsigned NumOfBitCastedElts = 0;
15295 unsigned NumElts = VT.getVectorNumElements();
15296 unsigned NumOfRelevantElts = NumElts;
15297 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15298 SDValue Elt = N->getOperand(Idx);
15299 if (Elt->getOpcode() == ISD::BITCAST) {
15300 // Assume only bit cast to i32 will go away.
15301 if (Elt->getOperand(0).getValueType() == MVT::i32)
15302 ++NumOfBitCastedElts;
15303 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15304 // Constants are statically casted, thus do not count them as
15305 // relevant operands.
15306 --NumOfRelevantElts;
15307 }
15308
15309 // Check if more than half of the elements require a non-free bitcast.
15310 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15311 return SDValue();
15312
15313 SelectionDAG &DAG = DCI.DAG;
15314 // Create the new vector type.
15315 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15316 // Check if the type is legal.
15317 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15318 if (!TLI.isTypeLegal(VecVT))
15319 return SDValue();
15320
15321 // Combine:
15322 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15323 // => BITCAST INSERT_VECTOR_ELT
15324 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15325 // (BITCAST EN), N.
15326 SDValue Vec = DAG.getUNDEF(VecVT);
15327 SDLoc dl(N);
15328 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15329 SDValue V = N->getOperand(Idx);
// Undef lanes need no insert at all.
15330 if (V.isUndef())
15331 continue;
15332 if (V.getOpcode() == ISD::BITCAST &&
15333 V->getOperand(0).getValueType() == MVT::i32)
15334 // Fold obvious case.
15335 V = V.getOperand(0);
15336 else {
15337 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15338 // Make the DAGCombiner fold the bitcasts.
15339 DCI.AddToWorklist(V.getNode());
15340 }
15341 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15342 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15343 }
15344 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15345 // Make the DAGCombiner fold the bitcasts.
15346 DCI.AddToWorklist(Vec.getNode());
15347 return Vec;
15348 }
15349
// Combines for ARMISD::PREDICATE_CAST (i32 <-> MVE predicate vector).
15350 static SDValue
// NOTE(review): the line naming this function (original 15351,
// PerformPREDICATE_CASTCombine) was lost in extraction.
15352 EVT VT = N->getValueType(0);
15353 SDValue Op = N->getOperand(0);
15354 SDLoc dl(N);
15355
15356 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15357 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15358 // If the valuetypes are the same, we can remove the cast entirely.
15359 if (Op->getOperand(0).getValueType() == VT)
15360 return Op->getOperand(0);
15361 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15362 }
15363
15364 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15365 // more VPNOT which might get folded as else predicates.
15366 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15367 SDValue X =
15368 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
// 65535 = all 16 predicate lanes set, i.e. the all-ones predicate.
15369 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15370 DCI.DAG.getConstant(65535, dl, MVT::i32));
15371 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15372 }
15373
15374 // Only the bottom 16 bits of the source register are used.
15375 if (Op.getValueType() == MVT::i32) {
15376 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15377 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15378 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15379 return SDValue(N, 0);
15380 }
15381 return SDValue();
15382 }
15383
// Combines for ARMISD::VECTOR_REG_CAST.
// NOTE(review): the first signature line (original 15384) was lost in
// extraction.
15385 const ARMSubtarget *ST) {
15386 EVT VT = N->getValueType(0);
15387 SDValue Op = N->getOperand(0);
15388 SDLoc dl(N);
15389
15390 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15391 if (ST->isLittle())
15392 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15393
15394 // VT VECTOR_REG_CAST (VT Op) -> Op
15395 if (Op.getValueType() == VT)
15396 return Op;
15397 // VECTOR_REG_CAST undef -> undef
15398 if (Op.isUndef())
15399 return DAG.getUNDEF(VT);
15400
15401 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15402 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15403 // If the valuetypes are the same, we can remove the cast entirely.
15404 if (Op->getOperand(0).getValueType() == VT)
15405 return Op->getOperand(0);
15406 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15407 }
15408
15409 return SDValue();
15410 }
15411
// Combines for ARMISD::VCMP (MVE vector compare).
// NOTE(review): the first signature line (original 15412) was lost in
// extraction.
15413 const ARMSubtarget *Subtarget) {
15414 if (!Subtarget->hasMVEIntegerOps())
15415 return SDValue();
15416
15417 EVT VT = N->getValueType(0);
15418 SDValue Op0 = N->getOperand(0);
15419 SDValue Op1 = N->getOperand(1);
15420 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15421 SDLoc dl(N);
15422
15423 // vcmp X, 0, cc -> vcmpz X, cc
15424 if (isZeroVector(Op1))
15425 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15426
// The swapped condition must itself be MVE-encodable before commuting.
15427 unsigned SwappedCond = getSwappedCondition(Cond);
15428 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15429 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15430 if (isZeroVector(Op0))
15431 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15432 DAG.getConstant(SwappedCond, dl, MVT::i32));
15433 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15434 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15435 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15436 DAG.getConstant(SwappedCond, dl, MVT::i32));
15437 }
15438
15439 return SDValue();
15440 }
15441
15442/// PerformInsertEltCombine - Target-specific dag combine xforms for
15443/// ISD::INSERT_VECTOR_ELT.
// NOTE(review): the signature lines were lost in extraction; presumably
// PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
// -- confirm against upstream.
15446  // Bitcast an i64 load inserted into a vector to f64.
15447  // Otherwise, the i64 value will be legalized to a pair of i32 values.
15448  EVT VT = N->getValueType(0);
15449  SDNode *Elt = N->getOperand(1).getNode();
// Only applies when inserting a plain, non-volatile i64 load.
15450  if (VT.getVectorElementType() != MVT::i64 ||
15451      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15452    return SDValue();
15453
15454  SelectionDAG &DAG = DCI.DAG;
15455  SDLoc dl(N);
// Build the equivalent f64 vector type (same element count) and redo the
// insert in the float domain, bitcasting back at the end.
15456  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15458  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15459  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15460  // Make the DAGCombiner fold the bitcasts.
15461  DCI.AddToWorklist(Vec.getNode());
15462  DCI.AddToWorklist(V.getNode());
15463  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15464                               Vec, V, N->getOperand(2));
15465  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15466}
15467
15468// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15469// directly or bitcast to an integer if the original is a float vector.
15470// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15471// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15472static SDValue
// NOTE(review): the name line was lost in extraction; presumably
// PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
// -- confirm against upstream.
15474  EVT VT = N->getValueType(0);
15475  SDLoc dl(N);
15476
// Only worth doing after legalization, for i32 results, and when f64 is a
// legal type (so the v2f64 VECTOR_REG_CAST below is valid).
15477  if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15478      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15479    return SDValue();
15480
15481  SDValue Ext = SDValue(N, 0);
// Look through a bitcast-from-f32 so float-vector extracts are handled too.
15482  if (Ext.getOpcode() == ISD::BITCAST &&
15483      Ext.getOperand(0).getValueType() == MVT::f32)
15484    Ext = Ext.getOperand(0);
15485  if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15487      Ext.getConstantOperandVal(1) % 2 != 0)
15488    return SDValue();
// Don't interfere with int-to-fp conversions that want the scalar directly.
15489  if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15490                           Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15491    return SDValue();
15492
15493  SDValue Op0 = Ext.getOperand(0);
15494  EVT VecVT = Op0.getValueType();
15495  unsigned ResNo = Op0.getResNo();
15496  unsigned Lane = Ext.getConstantOperandVal(1);
15497  if (VecVT.getVectorNumElements() != 4)
15498    return SDValue();
15499
15500  // Find another extract, of Lane + 1
15501  auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15502    return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15503           isa<ConstantSDNode>(V->getOperand(1)) &&
15504           V->getConstantOperandVal(1) == Lane + 1 &&
15505           V->getOperand(0).getResNo() == ResNo;
15506  });
15507  if (OtherIt == Op0->users().end())
15508    return SDValue();
15509
15510  // For float extracts, we need to be converting to a i32 for both vector
15511  // lanes.
15512  SDValue OtherExt(*OtherIt, 0);
15513  if (OtherExt.getValueType() != MVT::i32) {
15514    if (!OtherExt->hasOneUse() ||
15515        OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15516        OtherExt->user_begin()->getValueType(0) != MVT::i32)
15517      return SDValue();
15518    OtherExt = SDValue(*OtherExt->user_begin(), 0);
15519  }
15520
15521  // Convert the type to a f64 and extract with a VMOVRRD.
15522  SDValue F64 = DCI.DAG.getNode(
15523      ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15524      DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15525      DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15526  SDValue VMOVRRD =
15527      DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15528
// Result 0 replaces this extract; result 1 replaces the Lane+1 extract.
15529  DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15530  return VMOVRRD;
15531}
15532
// Target-specific combines for ISD::EXTRACT_VECTOR_ELT: fold extracts of
// VDUP, ARMISD::BUILD_VECTOR, bitcast chains from VMOVDRR, paired extracts
// into VMOVRRD, and extracts through MVETRUNC.
// NOTE(review): the opening lines of this signature were lost in extraction;
// presumably PerformExtractEltCombine(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) -- confirm.
15535                                           const ARMSubtarget *ST) {
15536  SDValue Op0 = N->getOperand(0);
15537  EVT VT = N->getValueType(0);
15538  SDLoc dl(N);
15539
15540  // extract (vdup x) -> x
15541  if (Op0->getOpcode() == ARMISD::VDUP) {
15542    SDValue X = Op0->getOperand(0);
// Use the dedicated h-register / s-register move nodes when the scalar
// type differs from the dup'd source type.
15543    if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15544      return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15545    if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15546      return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15547    if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15548      return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15549
// Otherwise look through bitcasts for a value of the right type.
15550    while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15551      X = X->getOperand(0);
15552    if (X.getValueType() == VT)
15553      return X;
15554  }
15555
15556  // extract ARM_BUILD_VECTOR -> x
15557  if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15558      isa<ConstantSDNode>(N->getOperand(1)) &&
15559      N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15560    return Op0.getOperand(N->getConstantOperandVal(1));
15561  }
15562
15563  // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15564  if (Op0.getValueType() == MVT::v4i32 &&
15565      isa<ConstantSDNode>(N->getOperand(1)) &&
15566      Op0.getOpcode() == ISD::BITCAST &&
15568      Op0.getOperand(0).getValueType() == MVT::v2f64) {
15569    SDValue BV = Op0.getOperand(0);
15570    unsigned Offset = N->getConstantOperandVal(1);
15571    SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
// Endianness determines which half of the VMOVDRR pair each i32 lane maps to.
15572    if (MOV.getOpcode() == ARMISD::VMOVDRR)
15573      return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15574  }
15575
15576  // extract x, n; extract x, n+1 -> VMOVRRD x
15577  if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15578    return R;
15579
15580  // extract (MVETrunc(x)) -> extract x
// NOTE(review): the index-splitting expressions on lines 15584/15586 were
// lost in extraction -- they presumably select the source vector and the
// lane within it from Idx; confirm against upstream.
15581  if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15582    unsigned Idx = N->getConstantOperandVal(1);
15583    unsigned Vec =
15585    unsigned SubIdx =
15587    return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15588                           DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15589  }
15590
15591  return SDValue();
15592}
15593
// Fold sign_extend_inreg of an unsigned lane-get into a signed lane-get.
// NOTE(review): the signature line was lost in extraction; presumably
// PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) -- confirm.
15595  SDValue Op = N->getOperand(0);
15596  EVT VT = N->getValueType(0);
15597
15598  // sext_inreg(VGETLANEu) -> VGETLANEs
// Only valid when the in-reg extension width matches the source vector's
// scalar element type, so the signed lane-get produces the same bits.
15599  if (Op.getOpcode() == ARMISD::VGETLANEu &&
15600      cast<VTSDNode>(N->getOperand(1))->getVT() ==
15601          Op.getOperand(0).getValueType().getScalarType())
15602    return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15603                       Op.getOperand(1));
15604
15605  return SDValue();
15606}
15607
// Fold a half-width, aligned INSERT_SUBVECTOR into a CONCAT_VECTORS of the
// subvector and the untouched half of the original vector.
15608static SDValue
// NOTE(review): the name line was lost in extraction; presumably
// PerformInsertSubvectorCombine(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI) -- confirm against upstream.
15610  SDValue Vec = N->getOperand(0);
15611  SDValue SubVec = N->getOperand(1);
15612  uint64_t IdxVal = N->getConstantOperandVal(2);
15613  EVT VecVT = Vec.getValueType();
15614  EVT SubVT = SubVec.getValueType();
15615
15616  // Only do this for legal fixed vector types.
// NOTE(review): the third condition of this check (line 15619, presumably
// the legality of SubVT) was lost in extraction -- confirm against upstream.
15617  if (!VecVT.isFixedLengthVector() ||
15618      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15620    return SDValue();
15621
15622  // Ignore widening patterns.
15623  if (IdxVal == 0 && Vec.isUndef())
15624    return SDValue();
15625
15626  // Subvector must be half the width and an "aligned" insertion.
15627  unsigned NumSubElts = SubVT.getVectorNumElements();
15628  if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15629      (IdxVal != 0 && IdxVal != NumSubElts))
15630    return SDValue();
15631
15632  // Fold insert_subvector -> concat_vectors
15633  // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15634  // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15635  SDLoc DL(N);
15636  SDValue Lo, Hi;
15637  if (IdxVal == 0) {
15638    Lo = SubVec;
15639    Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15640                         DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15641  } else {
15642    Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15643                         DCI.DAG.getVectorIdxConstant(0, DL));
15644    Hi = SubVec;
15645  }
15646  return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15647}
15648
15649// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
// NOTE(review): the name line was lost in extraction; presumably
// PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
// -- confirm against upstream.
15651                                             SelectionDAG &DAG) {
15652  SDValue Trunc = N->getOperand(0);
15653  EVT VT = Trunc.getValueType();
15654  if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15655    return SDValue();
15656
15657  SDLoc DL(Trunc);
// Two mask shapes are recognised; the "top" variant swaps the operand order
// fed into VMOVN.
15658  if (isVMOVNTruncMask(N->getMask(), VT, false))
15659    return DAG.getNode(
15660        ARMISD::VMOVN, DL, VT,
15661        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15662        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15663        DAG.getConstant(1, DL, MVT::i32));
15664  else if (isVMOVNTruncMask(N->getMask(), VT, true))
15665    return DAG.getNode(
15666        ARMISD::VMOVN, DL, VT,
15667        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15668        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15669        DAG.getConstant(1, DL, MVT::i32));
15670  return SDValue();
15671}
15672
15673/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15674/// ISD::VECTOR_SHUFFLE.
// NOTE(review): the signature lines (and the early call whose result is
// returned on line 15677, presumably PerformShuffleVMOVNCombine) were lost
// in extraction -- confirm against upstream.
15677    return R;
15678
15679  // The LLVM shufflevector instruction does not require the shuffle mask
15680  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15681  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
15682  // operands do not match the mask length, they are extended by concatenating
15683  // them with undef vectors.  That is probably the right thing for other
15684  // targets, but for NEON it is better to concatenate two double-register
15685  // size vector operands into a single quad-register size vector.  Do that
15686  // transformation here:
15687  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
15688  //   shuffle(concat(v1, v2), undef)
15689  SDValue Op0 = N->getOperand(0);
15690  SDValue Op1 = N->getOperand(1);
15691  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15692      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15693      Op0.getNumOperands() != 2 ||
15694      Op1.getNumOperands() != 2)
15695    return SDValue();
15696  SDValue Concat0Op1 = Op0.getOperand(1);
15697  SDValue Concat1Op1 = Op1.getOperand(1);
15698  if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15699    return SDValue();
15700  // Skip the transformation if any of the types are illegal.
15701  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15702  EVT VT = N->getValueType(0);
15703  if (!TLI.isTypeLegal(VT) ||
15704      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15705      !TLI.isTypeLegal(Concat1Op1.getValueType()))
15706    return SDValue();
15707
15708  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15709                                  Op0.getOperand(0), Op1.getOperand(0));
15710  // Translate the shuffle mask.
15711  SmallVector<int, 16> NewMask;
15712  unsigned NumElts = VT.getVectorNumElements();
15713  unsigned HalfElts = NumElts/2;
// NOTE(review): line 15714 (presumably the cast of N to ShuffleVectorSDNode
// *SVN) was lost in extraction -- confirm against upstream.
// Mask elements referencing the second operand are remapped into the upper
// half of the new single concat; anything else becomes undef (-1).
15715  for (unsigned n = 0; n < NumElts; ++n) {
15716    int MaskElt = SVN->getMaskElt(n);
15717    int NewElt = -1;
15718    if (MaskElt < (int)HalfElts)
15719      NewElt = MaskElt;
15720    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15721      NewElt = HalfElts + MaskElt - NumElts;
15722    NewMask.push_back(NewElt);
15723  }
15724  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15725                              DAG.getUNDEF(VT), NewMask);
15726}
15727
15728/// Load/store instruction that can be merged with a base address
15729/// update
// NOTE(review): the struct header and first members were lost in extraction;
// presumably: struct BaseUpdateTarget { SDNode *N; bool isIntrinsic;
// bool isStore; ... } -- confirm against upstream.
// Operand index of the address operand within the target node.
15734  unsigned AddrOpIdx;
15735};
15736
// NOTE(review): the struct header (presumably "struct BaseUpdateUser {")
// and the member declarations after the first two doc comments (presumably
// "SDNode *N;" and "SDValue Inc;") were lost in extraction -- confirm.
15738  /// Instruction that updates a pointer
15740  /// Pointer increment operand
15742  /// Pointer increment value if it is a constant, or 0 otherwise
15743  unsigned ConstInc;
15744};
15745
// Returns true when folding the pointer-update node `User` into the
// load/store `N` cannot create a cycle in the DAG.
// NOTE(review): the signature (presumably "static bool
// isValidBaseUpdate(SDNode *N, SDNode *User) {") and the local declarations
// before the push_backs (presumably a Visited SmallPtrSet and a Worklist
// SmallVector) were lost in extraction -- confirm against upstream.
15747  // Check that the add is independent of the load/store.
15748  // Otherwise, folding it would create a cycle. Search through Addr
15749  // as well, since the User may not be a direct user of Addr and
15750  // only share a base pointer.
15753  Worklist.push_back(N);
15754  Worklist.push_back(User);
// Bound the predecessor search so pathological DAGs don't blow up combine
// time; a hit in either direction means a potential cycle, so reject.
15755  const unsigned MaxSteps = 1024;
15756  if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15757      SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15758    return false;
15759  return true;
15760}
15761
// Attempt to fold one base-address update (User) into the NEON load/store
// (Target), producing the corresponding _UPD node.  Returns true and rewires
// all uses via DCI on success.
// NOTE(review): the opening line of this signature was lost in extraction;
// presumably "static bool TryCombineBaseUpdate(struct BaseUpdateTarget
// &Target, ..." -- confirm against upstream.
15763                                 struct BaseUpdateUser &User,
15764                                 bool SimpleConstIncOnly,
15766  SelectionDAG &DAG = DCI.DAG;
15767  SDNode *N = Target.N;
15768  MemSDNode *MemN = cast<MemSDNode>(N);
15769  SDLoc dl(N);
15770
15771  // Find the new opcode for the updating load/store.
15772  bool isLoadOp = true;
15773  bool isLaneOp = false;
15774  // Workaround for vst1x and vld1x intrinsics which do not have alignment
15775  // as an operand.
15776  bool hasAlignment = true;
15777  unsigned NewOpc = 0;
15778  unsigned NumVecs = 0;
15779  if (Target.isIntrinsic) {
15780    unsigned IntNo = N->getConstantOperandVal(1);
15781    switch (IntNo) {
15782    default:
15783      llvm_unreachable("unexpected intrinsic for Neon base update");
15784    case Intrinsic::arm_neon_vld1:
15785      NewOpc = ARMISD::VLD1_UPD;
15786      NumVecs = 1;
15787      break;
15788    case Intrinsic::arm_neon_vld2:
15789      NewOpc = ARMISD::VLD2_UPD;
15790      NumVecs = 2;
15791      break;
15792    case Intrinsic::arm_neon_vld3:
15793      NewOpc = ARMISD::VLD3_UPD;
15794      NumVecs = 3;
15795      break;
15796    case Intrinsic::arm_neon_vld4:
15797      NewOpc = ARMISD::VLD4_UPD;
15798      NumVecs = 4;
15799      break;
15800    case Intrinsic::arm_neon_vld1x2:
15801      NewOpc = ARMISD::VLD1x2_UPD;
15802      NumVecs = 2;
15803      hasAlignment = false;
15804      break;
15805    case Intrinsic::arm_neon_vld1x3:
15806      NewOpc = ARMISD::VLD1x3_UPD;
15807      NumVecs = 3;
15808      hasAlignment = false;
15809      break;
15810    case Intrinsic::arm_neon_vld1x4:
15811      NewOpc = ARMISD::VLD1x4_UPD;
15812      NumVecs = 4;
15813      hasAlignment = false;
15814      break;
15815    case Intrinsic::arm_neon_vld2dup:
15816      NewOpc = ARMISD::VLD2DUP_UPD;
15817      NumVecs = 2;
15818      break;
15819    case Intrinsic::arm_neon_vld3dup:
15820      NewOpc = ARMISD::VLD3DUP_UPD;
15821      NumVecs = 3;
15822      break;
15823    case Intrinsic::arm_neon_vld4dup:
15824      NewOpc = ARMISD::VLD4DUP_UPD;
15825      NumVecs = 4;
15826      break;
15827    case Intrinsic::arm_neon_vld2lane:
15828      NewOpc = ARMISD::VLD2LN_UPD;
15829      NumVecs = 2;
15830      isLaneOp = true;
15831      break;
15832    case Intrinsic::arm_neon_vld3lane:
15833      NewOpc = ARMISD::VLD3LN_UPD;
15834      NumVecs = 3;
15835      isLaneOp = true;
15836      break;
15837    case Intrinsic::arm_neon_vld4lane:
15838      NewOpc = ARMISD::VLD4LN_UPD;
15839      NumVecs = 4;
15840      isLaneOp = true;
15841      break;
15842    case Intrinsic::arm_neon_vst1:
15843      NewOpc = ARMISD::VST1_UPD;
15844      NumVecs = 1;
15845      isLoadOp = false;
15846      break;
15847    case Intrinsic::arm_neon_vst2:
15848      NewOpc = ARMISD::VST2_UPD;
15849      NumVecs = 2;
15850      isLoadOp = false;
15851      break;
15852    case Intrinsic::arm_neon_vst3:
15853      NewOpc = ARMISD::VST3_UPD;
15854      NumVecs = 3;
15855      isLoadOp = false;
15856      break;
15857    case Intrinsic::arm_neon_vst4:
15858      NewOpc = ARMISD::VST4_UPD;
15859      NumVecs = 4;
15860      isLoadOp = false;
15861      break;
15862    case Intrinsic::arm_neon_vst2lane:
15863      NewOpc = ARMISD::VST2LN_UPD;
15864      NumVecs = 2;
15865      isLoadOp = false;
15866      isLaneOp = true;
15867      break;
15868    case Intrinsic::arm_neon_vst3lane:
15869      NewOpc = ARMISD::VST3LN_UPD;
15870      NumVecs = 3;
15871      isLoadOp = false;
15872      isLaneOp = true;
15873      break;
15874    case Intrinsic::arm_neon_vst4lane:
15875      NewOpc = ARMISD::VST4LN_UPD;
15876      NumVecs = 4;
15877      isLoadOp = false;
15878      isLaneOp = true;
15879      break;
15880    case Intrinsic::arm_neon_vst1x2:
15881      NewOpc = ARMISD::VST1x2_UPD;
15882      NumVecs = 2;
15883      isLoadOp = false;
15884      hasAlignment = false;
15885      break;
15886    case Intrinsic::arm_neon_vst1x3:
15887      NewOpc = ARMISD::VST1x3_UPD;
15888      NumVecs = 3;
15889      isLoadOp = false;
15890      hasAlignment = false;
15891      break;
15892    case Intrinsic::arm_neon_vst1x4:
15893      NewOpc = ARMISD::VST1x4_UPD;
15894      NumVecs = 4;
15895      isLoadOp = false;
15896      hasAlignment = false;
15897      break;
15898    }
15899  } else {
// Non-intrinsic targets: VLDnDUP nodes, plus plain vector loads/stores.
15900    isLaneOp = true;
15901    switch (N->getOpcode()) {
15902    default:
15903      llvm_unreachable("unexpected opcode for Neon base update");
15904    case ARMISD::VLD1DUP:
15905      NewOpc = ARMISD::VLD1DUP_UPD;
15906      NumVecs = 1;
15907      break;
15908    case ARMISD::VLD2DUP:
15909      NewOpc = ARMISD::VLD2DUP_UPD;
15910      NumVecs = 2;
15911      break;
15912    case ARMISD::VLD3DUP:
15913      NewOpc = ARMISD::VLD3DUP_UPD;
15914      NumVecs = 3;
15915      break;
15916    case ARMISD::VLD4DUP:
15917      NewOpc = ARMISD::VLD4DUP_UPD;
15918      NumVecs = 4;
15919      break;
15920    case ISD::LOAD:
15921      NewOpc = ARMISD::VLD1_UPD;
15922      NumVecs = 1;
15923      isLaneOp = false;
15924      break;
15925    case ISD::STORE:
15926      NewOpc = ARMISD::VST1_UPD;
15927      NumVecs = 1;
15928      isLaneOp = false;
15929      isLoadOp = false;
15930      break;
15931    }
15932  }
15933
15934  // Find the size of memory referenced by the load/store.
15935  EVT VecTy;
15936  if (isLoadOp) {
15937    VecTy = N->getValueType(0);
15938  } else if (Target.isIntrinsic) {
15939    VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15940  } else {
15941    assert(Target.isStore &&
15942           "Node has to be a load, a store, or an intrinsic!");
15943    VecTy = N->getOperand(1).getValueType();
15944  }
15945
15946  bool isVLDDUPOp =
15947      NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15948      NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15949
15950  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
// Lane and dup operations only touch one element per vector.
15951  if (isLaneOp || isVLDDUPOp)
15952    NumBytes /= VecTy.getVectorNumElements();
15953
15954  if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15955    // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15956    // separate instructions that make it harder to use a non-constant update.
15957    return false;
15958  }
15959
15960  if (SimpleConstIncOnly && User.ConstInc != NumBytes)
15961    return false;
15962
15963  if (!isValidBaseUpdate(N, User.N))
15964    return false;
15965
15966  // OK, we found an ADD we can fold into the base update.
15967  // Now, create a _UPD node, taking care of not breaking alignment.
15968
15969  EVT AlignedVecTy = VecTy;
15970  Align Alignment = MemN->getAlign();
15971
15972  // If this is a less-than-standard-aligned load/store, change the type to
15973  // match the standard alignment.
15974  // The alignment is overlooked when selecting _UPD variants; and it's
15975  // easier to introduce bitcasts here than fix that.
15976  // There are 3 ways to get to this base-update combine:
15977  // - intrinsics: they are assumed to be properly aligned (to the standard
15978  //   alignment of the memory type), so we don't need to do anything.
15979  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
15980  //   intrinsics, so, likewise, there's nothing to do.
15981  // - generic load/store instructions: the alignment is specified as an
15982  //   explicit operand, rather than implicitly as the standard alignment
15983  //   of the memory type (like the intrisics).  We need to change the
15984  //   memory type to match the explicit alignment.  That way, we don't
15985  //   generate non-standard-aligned ARMISD::VLDx nodes.
15986  if (isa<LSBaseSDNode>(N)) {
15987    if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
15988      MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
15989      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
15990      assert(!isLaneOp && "Unexpected generic load/store lane.");
15991      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
15992      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
15993    }
15994    // Don't set an explicit alignment on regular load/stores that we want
15995    // to transform to VLD/VST 1_UPD nodes.
15996    // This matches the behavior of regular load/stores, which only get an
15997    // explicit alignment if the MMO alignment is larger than the standard
15998    // alignment of the memory type.
15999    // Intrinsics, however, always get an explicit alignment, set to the
16000    // alignment of the MMO.
16001    Alignment = Align(1);
16002  }
16003
16004  // Create the new updating load/store node.
16005  // First, create an SDVTList for the new updating node's results.
16006  EVT Tys[6];
16007  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16008  unsigned n;
16009  for (n = 0; n < NumResultVecs; ++n)
16010    Tys[n] = AlignedVecTy;
16011  Tys[n++] = MVT::i32;
16012  Tys[n] = MVT::Other;
16013  SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16014
16015  // Then, gather the new node's operands.
// NOTE(review): the operand-vector declaration on line 16016 (presumably a
// SmallVector<SDValue, 8> Ops) was lost in extraction -- confirm.
16017  Ops.push_back(N->getOperand(0)); // incoming chain
16018  Ops.push_back(N->getOperand(Target.AddrOpIdx));
16019  Ops.push_back(User.Inc);
16020
16021  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16022    // Try to match the intrinsic's signature
16023    Ops.push_back(StN->getValue());
16024  } else {
16025    // Loads (and of course intrinsics) match the intrinsics' signature,
16026    // so just add all but the alignment operand.
16027    unsigned LastOperand =
16028        hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16029    for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16030      Ops.push_back(N->getOperand(i));
16031  }
16032
16033  // For all node types, the alignment operand is always the last one.
16034  Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16035
16036  // If this is a non-standard-aligned STORE, the penultimate operand is the
16037  // stored value.  Bitcast it to the aligned type.
16038  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16039    SDValue &StVal = Ops[Ops.size() - 2];
16040    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16041  }
16042
16043  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16044  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16045                                         MemN->getMemOperand());
16046
16047  // Update the uses.
16048  SmallVector<SDValue, 5> NewResults;
16049  for (unsigned i = 0; i < NumResultVecs; ++i)
16050    NewResults.push_back(SDValue(UpdN.getNode(), i));
16051
16052  // If this is an non-standard-aligned LOAD, the first result is the loaded
16053  // value.  Bitcast it to the expected result type.
16054  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16055    SDValue &LdVal = NewResults[0];
16056    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16057  }
16058
16059  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16060  DCI.CombineTo(N, NewResults);
// The pointer-update node itself is replaced by the new write-back result.
16061  DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16062
16063  return true;
16064}
16065
16066// If (opcode ptr inc) is and ADD-like instruction, return the
16067// increment value. Otherwise return 0.
16068static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16069                                         SDValue Inc, const SelectionDAG &DAG) {
// NOTE(review): line 16070 (presumably
// "ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);") was lost in
// extraction -- confirm against upstream.
16071  if (!CInc)
16072    return 0;
16073
16074  switch (Opcode) {
16075  case ARMISD::VLD1_UPD:
16076  case ISD::ADD:
16077    return CInc->getZExtValue();
16078  case ISD::OR: {
16079    if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16080      // (OR ptr inc) is the same as (ADD ptr inc)
16081      return CInc->getZExtValue();
16082    }
16083    return 0;
16084  }
16085  default:
16086    return 0;
16087  }
16088}
16089
// Decompose an ADD/OR/VLD1_UPD node into its base pointer and constant
// increment, returning them through the out-parameters.
// NOTE(review): the signature line was lost in extraction; presumably
// "static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr,
// SDValue *CInc) {" -- confirm against upstream.
16091  switch (N->getOpcode()) {
16092  case ISD::ADD:
16093  case ISD::OR: {
16094    if (isa<ConstantSDNode>(N->getOperand(1))) {
16095      *Ptr = N->getOperand(0);
16096      *CInc = N->getOperand(1);
16097      return true;
16098    }
16099    return false;
16100  }
16101  case ARMISD::VLD1_UPD: {
// VLD1_UPD carries its pointer/increment at operands 1 and 2.
16102    if (isa<ConstantSDNode>(N->getOperand(2))) {
16103      *Ptr = N->getOperand(1);
16104      *CInc = N->getOperand(2);
16105      return true;
16106    }
16107    return false;
16108  }
16109  default:
16110    return false;
16111  }
16112}
16113
16114/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16115/// NEON load/store intrinsics, and generic vector load/stores, to merge
16116/// base address updates.
16117/// For generic load/stores, the memory type is assumed to be a vector.
16118/// The caller is assumed to have checked legality.
// NOTE(review): the signature lines were lost in extraction; presumably
// "static SDValue CombineBaseUpdate(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI) {" -- confirm against upstream.
16121  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16122                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16123  const bool isStore = N->getOpcode() == ISD::STORE;
16124  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16125  BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16126
16127  // Limit the number of possible base-updates we look at to prevent degenerate
16128  // cases.
16129  unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16130
16131  SDValue Addr = N->getOperand(AddrOpIdx);
16132
// NOTE(review): line 16133 (presumably a
// "SmallVector<BaseUpdateUser, 8> BaseUpdates;" declaration) was lost in
// extraction -- confirm against upstream.
16134
16135  // Search for a use of the address operand that is an increment.
16136  for (SDUse &Use : Addr->uses()) {
16137    SDNode *User = Use.getUser();
16138    if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16139      continue;
16140
16141    SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16142    unsigned ConstInc =
16143        getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16144
// Record constant increments, and non-constant ones only for plain ADDs.
16145    if (ConstInc || User->getOpcode() == ISD::ADD) {
16146      BaseUpdates.push_back({User, Inc, ConstInc});
16147      if (BaseUpdates.size() >= MaxBaseUpdates)
16148        break;
16149    }
16150  }
16151
16152  // If the address is a constant pointer increment itself, find
16153  // another constant increment that has the same base operand
16154  SDValue Base;
16155  SDValue CInc;
16156  if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16157    unsigned Offset =
16158        getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16159    for (SDUse &Use : Base->uses()) {
16160
16161      SDNode *User = Use.getUser();
16162      if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16163          User->getNumOperands() != 2)
16164        continue;
16165
16166      SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16167      unsigned UserOffset =
16168          getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16169
16170      if (!UserOffset || UserOffset <= Offset)
16171        continue;
16172
// The effective increment relative to Addr is the difference of offsets.
16173      unsigned NewConstInc = UserOffset - Offset;
16174      SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16175      BaseUpdates.push_back({User, NewInc, NewConstInc});
16176      if (BaseUpdates.size() >= MaxBaseUpdates)
16177        break;
16178    }
16179  }
16180
16181  // Try to fold the load/store with an update that matches memory
16182  // access size. This should work well for sequential loads.
16183  unsigned NumValidUpd = BaseUpdates.size();
16184  for (unsigned I = 0; I < NumValidUpd; I++) {
16185    BaseUpdateUser &User = BaseUpdates[I];
16186    if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16187      return SDValue();
16188  }
16189
16190  // Try to fold with other users. Non-constant updates are considered
16191  // first, and constant updates are sorted to not break a sequence of
16192  // strided accesses (if there is any).
16193  llvm::stable_sort(BaseUpdates,
16194                    [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16195                      return LHS.ConstInc < RHS.ConstInc;
16196                    });
16197  for (BaseUpdateUser &User : BaseUpdates) {
16198    if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16199      return SDValue();
16200  }
16201  return SDValue();
16202}
16203
// Wrapper that gates the base-update combine on legalization phase.
// NOTE(review): the signature lines were lost in extraction; presumably
// "static SDValue PerformVLDCombine(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI) {" -- confirm against upstream.
16206  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16207    return SDValue();
16208
16209  return CombineBaseUpdate(N, DCI);
16210}
16211
// Merge a base-address ADD into an MVE vld2/vld4/vst2/vst4 intrinsic,
// producing the post-incrementing _UPD form.
// NOTE(review): the signature lines were lost in extraction; presumably
// "static SDValue PerformMVEVLDCombine(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI) {" -- confirm against upstream.
16214  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16215    return SDValue();
16216
16217  SelectionDAG &DAG = DCI.DAG;
16218  SDValue Addr = N->getOperand(2);
16219  MemSDNode *MemN = cast<MemSDNode>(N);
16220  SDLoc dl(N);
16221
16222  // For the stores, where there are multiple intrinsics we only actually want
16223  // to post-inc the last of the them.
16224  unsigned IntNo = N->getConstantOperandVal(1);
16225  if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16226    return SDValue();
16227  if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16228    return SDValue();
16229
16230  // Search for a use of the address operand that is an increment.
16231  for (SDUse &Use : Addr->uses()) {
16232    SDNode *User = Use.getUser();
16233    if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16234      continue;
16235
16236    // Check that the add is independent of the load/store.  Otherwise, folding
16237    // it would create a cycle. We can avoid searching through Addr as it's a
16238    // predecessor to both.
// NOTE(review): lines 16239-16240 (presumably the Visited SmallPtrSet and
// Worklist SmallVector declarations) were lost in extraction -- confirm.
16241    Visited.insert(Addr.getNode());
16242    Worklist.push_back(N);
16243    Worklist.push_back(User);
16244    const unsigned MaxSteps = 1024;
16245    if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16246        SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16247      continue;
16248
16249    // Find the new opcode for the updating load/store.
16250    bool isLoadOp = true;
16251    unsigned NewOpc = 0;
16252    unsigned NumVecs = 0;
16253    switch (IntNo) {
16254    default:
16255      llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16256    case Intrinsic::arm_mve_vld2q:
16257      NewOpc = ARMISD::VLD2_UPD;
16258      NumVecs = 2;
16259      break;
16260    case Intrinsic::arm_mve_vld4q:
16261      NewOpc = ARMISD::VLD4_UPD;
16262      NumVecs = 4;
16263      break;
16264    case Intrinsic::arm_mve_vst2q:
16265      NewOpc = ARMISD::VST2_UPD;
16266      NumVecs = 2;
16267      isLoadOp = false;
16268      break;
16269    case Intrinsic::arm_mve_vst4q:
16270      NewOpc = ARMISD::VST4_UPD;
16271      NumVecs = 4;
16272      isLoadOp = false;
16273      break;
16274    }
16275
16276    // Find the size of memory referenced by the load/store.
16277    EVT VecTy;
16278    if (isLoadOp) {
16279      VecTy = N->getValueType(0);
16280    } else {
16281      VecTy = N->getOperand(3).getValueType();
16282    }
16283
16284    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16285
16286    // If the increment is a constant, it must match the memory ref size.
16287    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
// NOTE(review): line 16288 (presumably
// "ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());")
// was lost in extraction -- confirm against upstream.
16289    if (!CInc || CInc->getZExtValue() != NumBytes)
16290      continue;
16291
16292    // Create the new updating load/store node.
16293    // First, create an SDVTList for the new updating node's results.
16294    EVT Tys[6];
16295    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16296    unsigned n;
16297    for (n = 0; n < NumResultVecs; ++n)
16298      Tys[n] = VecTy;
16299    Tys[n++] = MVT::i32;
16300    Tys[n] = MVT::Other;
16301    SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16302
16303    // Then, gather the new node's operands.
// NOTE(review): line 16304 (presumably the Ops SmallVector declaration)
// was lost in extraction -- confirm against upstream.
16305    Ops.push_back(N->getOperand(0)); // incoming chain
16306    Ops.push_back(N->getOperand(2)); // ptr
16307    Ops.push_back(Inc);
16308
16309    for (unsigned i = 3; i < N->getNumOperands(); ++i)
16310      Ops.push_back(N->getOperand(i));
16311
16312    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16313                                           MemN->getMemOperand());
16314
16315    // Update the uses.
16316    SmallVector<SDValue, 5> NewResults;
16317    for (unsigned i = 0; i < NumResultVecs; ++i)
16318      NewResults.push_back(SDValue(UpdN.getNode(), i));
16319
16320    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16321    DCI.CombineTo(N, NewResults);
16322    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16323
// Only one update can be folded; stop after the first success.
16324    break;
16325  }
16326
16327  return SDValue();
16328}
16329
16330/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16331/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16332/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
16333/// return true.
// NOTE(review): the signature line was lost in extraction; presumably
// "static bool CombineVLDDUP(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI) {" -- confirm against upstream.
16335  SelectionDAG &DAG = DCI.DAG;
16336  EVT VT = N->getValueType(0);
16337  // vldN-dup instructions only support 64-bit vectors for N > 1.
16338  if (!VT.is64BitVector())
16339    return false;
16340
16341  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16342  SDNode *VLD = N->getOperand(0).getNode();
16343  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16344    return false;
16345  unsigned NumVecs = 0;
16346  unsigned NewOpc = 0;
16347  unsigned IntNo = VLD->getConstantOperandVal(1);
16348  if (IntNo == Intrinsic::arm_neon_vld2lane) {
16349    NumVecs = 2;
16350    NewOpc = ARMISD::VLD2DUP;
16351  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16352    NumVecs = 3;
16353    NewOpc = ARMISD::VLD3DUP;
16354  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16355    NumVecs = 4;
16356    NewOpc = ARMISD::VLD4DUP;
16357  } else {
16358    return false;
16359  }
16360
16361  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16362  // numbers match the load.
16363  unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16364  for (SDUse &Use : VLD->uses()) {
16365    // Ignore uses of the chain result.
16366    if (Use.getResNo() == NumVecs)
16367      continue;
16368    SDNode *User = Use.getUser();
16369    if (User->getOpcode() != ARMISD::VDUPLANE ||
16370        VLDLaneNo != User->getConstantOperandVal(1))
16371      return false;
16372  }
16373
16374  // Create the vldN-dup node.
16375  EVT Tys[5];
16376  unsigned n;
16377  for (n = 0; n < NumVecs; ++n)
16378    Tys[n] = VT;
16379  Tys[n] = MVT::Other;
16380  SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16381  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
// NOTE(review): line 16382 (presumably the cast of VLD to MemIntrinsicSDNode
// *VLDMemInt) was lost in extraction -- confirm against upstream.
16383  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16384                                           Ops, VLDMemInt->getMemoryVT(),
16385                                           VLDMemInt->getMemOperand());
16386
16387  // Update the uses.
16388  for (SDUse &Use : VLD->uses()) {
16389    unsigned ResNo = Use.getResNo();
16390    // Ignore uses of the chain result.
16391    if (ResNo == NumVecs)
16392      continue;
16393    DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16394  }
16395
16396  // Now the vldN-lane intrinsic is dead except for its chain result.
16397  // Update uses of the chain.
16398  std::vector<SDValue> VLDDupResults;
16399  for (unsigned n = 0; n < NumVecs; ++n)
16400    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16401  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16402  DCI.CombineTo(VLD, VLDDupResults);
16403
16404  return true;
16405}
16406
16407/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16408/// ARMISD::VDUPLANE.
/// Three transforms are attempted, in order: (1) on MVE, rewrite the lane dup
/// as extract+VDUP; (2) fold vldN-lane feeders into vldN-dup (CombineVLDDUP);
/// (3) drop a VDUPLANE whose source is already an immediate splat.
 16411 const ARMSubtarget *Subtarget) {
 16412 SDValue Op = N->getOperand(0);
 16413 EVT VT = N->getValueType(0);
 16414
 16415 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
 16416 if (Subtarget->hasMVEIntegerOps()) {
 16417 EVT ExtractVT = VT.getVectorElementType();
 16418 // We need to ensure we are creating a legal type.
 16419 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
 16420 ExtractVT = MVT::i32;
 16421 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
 16422 N->getOperand(0), N->getOperand(1));
 16423 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
 16424 }
 16425
 16426 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
 16427 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
 16428 if (CombineVLDDUP(N, DCI))
 16429 return SDValue(N, 0);
 16430
 16431 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
 16432 // redundant. Ignore bit_converts for now; element sizes are checked below.
 16433 while (Op.getOpcode() == ISD::BITCAST)
 16434 Op = Op.getOperand(0);
 16435 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
 16436 return SDValue();
 16437
 16438 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
 16439 unsigned EltSize = Op.getScalarValueSizeInBits();
 16440 // The canonical VMOV for a zero vector uses a 32-bit element size.
 16441 unsigned Imm = Op.getConstantOperandVal(0);
 16442 unsigned EltBits;
// decodeVMOVModImm returning 0 means the encoded value is 0; treat it as the
// smallest (8-bit) element size so the check below stays conservative.
 16443 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
 16444 EltSize = 8;
 16445 if (EltSize > VT.getScalarSizeInBits())
 16446 return SDValue();
 16447
// The splat already has the right contents in every lane; only a bitcast to
// the VDUPLANE's result type is needed.
 16448 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
 16449}
16450
16451/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
/// On MVE, force the dup source through a GPR (bitcast/VMOVrh); on NEON, fold
/// VDUP(load) into a VLD1DUP replicating load.
 16453 const ARMSubtarget *Subtarget) {
 16454 SDValue Op = N->getOperand(0);
 16455 SDLoc dl(N);
 16456
 16457 if (Subtarget->hasMVEIntegerOps()) {
 16458 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
 16459 // need to come from a GPR.
 16460 if (Op.getValueType() == MVT::f32)
 16461 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
 16462 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
 16463 else if (Op.getValueType() == MVT::f16)
 16464 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
 16465 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
 16466 }
 16467
 16468 if (!Subtarget->hasNEON())
 16469 return SDValue();
 16470
 16471 // Match VDUP(LOAD) -> VLD1DUP.
 16472 // We match this pattern here rather than waiting for isel because the
 16473 // transform is only legal for unindexed loads.
// The load must be the dup's only user and its memory type must equal the
// result's element type, so the replicating load reads exactly one element.
 16474 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
 16475 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
 16476 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
 16477 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
 16478 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
 16479 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
 16480 SDValue VLDDup =
 16482 LD->getMemoryVT(), LD->getMemOperand());
// Re-route the old load's chain users to the new node's chain result so the
// original load becomes dead.
 16483 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
 16484 return VLDDup;
 16485 }
 16486
 16487 return SDValue();
 16488}
16489
// PerformLOADCombine: for a legal NEON vector load, try to fold a following
// base-pointer increment into the load (VLD1_UPD) via CombineBaseUpdate.
 16492 const ARMSubtarget *Subtarget) {
 16493 EVT VT = N->getValueType(0);
 16494
 16495 // If this is a legal vector load, try to combine it into a VLD1_UPD.
 16496 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
 16498 return CombineBaseUpdate(N, DCI);
 16499
 16500 return SDValue();
 16501}
16502
16503// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16504// pack all of the elements in one place. Next, store to memory in fewer
16505// chunks.
// Returns the new chain (a TokenFactor over the emitted stores), or SDValue()
// if the transform does not apply.
 16507 SelectionDAG &DAG) {
 16508 SDValue StVal = St->getValue();
 16509 EVT VT = StVal.getValueType();
 16510 if (!St->isTruncatingStore() || !VT.isVector())
 16511 return SDValue();
 16512 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 16513 EVT StVT = St->getMemoryVT();
 16514 unsigned NumElems = VT.getVectorNumElements();
 16515 assert(StVT != VT && "Cannot truncate to the same type");
 16516 unsigned FromEltSz = VT.getScalarSizeInBits();
 16517 unsigned ToEltSz = StVT.getScalarSizeInBits();
 16518
 16519 // From, To sizes and ElemCount must be pow of two
 16520 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
 16521 return SDValue();
 16522
 16523 // We are going to use the original vector elt for storing.
 16524 // Accumulated smaller vector elements must be a multiple of the store size.
 16525 if (0 != (NumElems * FromEltSz) % ToEltSz)
 16526 return SDValue();
 16527
 16528 unsigned SizeRatio = FromEltSz / ToEltSz;
 16529 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
 16530
 16531 // Create a type on which we perform the shuffle.
 16532 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
 16533 NumElems * SizeRatio);
 16534 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
 16535
 16536 SDLoc DL(St);
 16537 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
// Build a mask that gathers the truncated low parts of each source element
// into the low lanes; on big-endian the "low part" is the last subelement.
 16538 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
 16539 for (unsigned i = 0; i < NumElems; ++i)
 16540 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
 16541 : i * SizeRatio;
 16542
 16543 // Can't shuffle using an illegal type.
 16544 if (!TLI.isTypeLegal(WideVecVT))
 16545 return SDValue();
 16546
 16547 SDValue Shuff = DAG.getVectorShuffle(
 16548 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
 16549 // At this point all of the data is stored at the bottom of the
 16550 // register. We now need to save it to mem.
 16551
 16552 // Find the largest store unit
// Scan all integer MVTs and keep the widest legal one that still fits within
// the packed data size.
 16553 MVT StoreType = MVT::i8;
 16554 for (MVT Tp : MVT::integer_valuetypes()) {
 16555 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
 16556 StoreType = Tp;
 16557 }
 16558 // Didn't find a legal store type.
 16559 if (!TLI.isTypeLegal(StoreType))
 16560 return SDValue();
 16561
 16562 // Bitcast the original vector into a vector of store-size units
 16563 EVT StoreVecVT =
 16564 EVT::getVectorVT(*DAG.getContext(), StoreType,
 16565 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
 16566 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
 16567 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
 16569 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
 16570 TLI.getPointerTy(DAG.getDataLayout()));
 16571 SDValue BasePtr = St->getBasePtr();
 16572
 16573 // Perform one or more big stores into memory.
// E = number of StoreType-sized chunks holding the truncated data; each
// iteration extracts one chunk, stores it, and advances the base pointer.
 16574 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
 16575 for (unsigned I = 0; I < E; I++) {
 16576 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
 16577 ShuffWide, DAG.getIntPtrConstant(I, DL));
 16578 SDValue Ch =
 16579 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
 16580 St->getAlign(), St->getMemOperand()->getFlags());
 16581 BasePtr =
 16582 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
 16583 Chains.push_back(Ch);
 16584 }
 16585 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 16586}
16587
16588// Try taking a single vector store from an fpround (which would otherwise turn
16589// into an expensive buildvector) and splitting it into a series of narrowing
16590// stores.
// Only handles f32->f16 rounds whose element count is a multiple of 4; emits
// one VCVTN + truncating store per 4-element slice and returns a TokenFactor
// over the new store chains.
 16592 SelectionDAG &DAG) {
 16593 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
 16594 return SDValue();
 16595 SDValue Trunc = St->getValue();
 16596 if (Trunc->getOpcode() != ISD::FP_ROUND)
 16597 return SDValue();
 16598 EVT FromVT = Trunc->getOperand(0).getValueType();
 16599 EVT ToVT = Trunc.getValueType();
 16600 if (!ToVT.isVector())
 16601 return SDValue();
 16603 EVT ToEltVT = ToVT.getVectorElementType();
 16604 EVT FromEltVT = FromVT.getVectorElementType();
 16605
 16606 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
 16607 return SDValue();
 16608
 16609 unsigned NumElements = 4;
 16610 if (FromVT.getVectorNumElements() % NumElements != 0)
 16611 return SDValue();
 16612
 16613 // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
 16614 // use the VMOVN over splitting the store. We are looking for patterns of:
 16615 // !rev: 0 N 1 N+1 2 N+2 ...
 16616 // rev: N 0 N+1 1 N+2 2 ...
 16617 // The shuffle may either be a single source (in which case N = NumElts/2) or
 16618 // two inputs extended with concat to the same size (in which case N =
 16619 // NumElts).
 16620 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
 16621 ArrayRef<int> M = SVN->getMask();
 16622 unsigned NumElts = ToVT.getVectorNumElements();
 16623 if (SVN->getOperand(1).isUndef())
 16624 NumElts /= 2;
 16625
 16626 unsigned Off0 = Rev ? NumElts : 0;
 16627 unsigned Off1 = Rev ? 0 : NumElts;
 16628
// Undef mask entries (< 0) match anything; otherwise even lanes must come
// from Off0 and odd lanes from Off1, interleaved pairwise.
 16629 for (unsigned I = 0; I < NumElts; I += 2) {
 16630 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
 16631 return false;
 16632 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
 16633 return false;
 16634 }
 16635
 16636 return true;
 16637 };
 16638
// If the input is a VMOVN-shaped shuffle, prefer the VMOVN lowering and leave
// the store alone.
 16639 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
 16640 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
 16641 return SDValue();
 16642
 16643 LLVMContext &C = *DAG.getContext();
 16644 SDLoc DL(St);
 16645 // Details about the old store
 16646 SDValue Ch = St->getChain();
 16647 SDValue BasePtr = St->getBasePtr();
 16648 Align Alignment = St->getBaseAlign();
 16650 AAMDNodes AAInfo = St->getAAInfo();
 16651
 16652 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
 16653 // and then stored as truncating integer stores.
 16654 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
 16655 EVT NewToVT = EVT::getVectorVT(
 16656 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
 16657
 16659 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
// Byte offset of this slice within the original store.
 16660 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
 16661 SDValue NewPtr =
 16662 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
 16663
 16664 SDValue Extract =
 16665 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
 16666 DAG.getConstant(i * NumElements, DL, MVT::i32));
 16667
// VCVTN narrows the f32 slice into the bottom lanes of a v8f16; reinterpret
// it as v4i32 so it can be emitted as a truncating integer store.
 16668 SDValue FPTrunc =
 16669 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
 16670 Extract, DAG.getConstant(0, DL, MVT::i32));
 16671 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
 16672
 16673 SDValue Store = DAG.getTruncStore(
 16674 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
 16675 NewToVT, Alignment, MMOFlags, AAInfo);
 16676 Stores.push_back(Store);
 16677 }
 16678 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
 16679}
16680
16681// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16682// into an expensive buildvector) and splitting it into a series of narrowing
16683// stores.
// Each MVETRUNC operand becomes one truncating store at its byte offset; the
// result is a TokenFactor over all new store chains.
 16685 SelectionDAG &DAG) {
 16686 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
 16687 return SDValue();
 16688 SDValue Trunc = St->getValue();
 16689 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
 16690 return SDValue();
 16691 EVT FromVT = Trunc->getOperand(0).getValueType();
 16692 EVT ToVT = Trunc.getValueType();
 16693
 16694 LLVMContext &C = *DAG.getContext();
 16695 SDLoc DL(St);
 16696 // Details about the old store
 16697 SDValue Ch = St->getChain();
 16698 SDValue BasePtr = St->getBasePtr();
 16699 Align Alignment = St->getBaseAlign();
 16701 AAMDNodes AAInfo = St->getAAInfo();
 16702
// Memory type for one slice: the narrow element type with one input
// operand's worth of elements.
 16703 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
 16704 FromVT.getVectorNumElements());
 16705
 16707 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
// Byte offset of operand i's slice within the original store.
 16708 unsigned NewOffset =
 16709 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
 16710 SDValue NewPtr =
 16711 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
 16712
 16713 SDValue Extract = Trunc.getOperand(i);
 16714 SDValue Store = DAG.getTruncStore(
 16715 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
 16716 NewToVT, Alignment, MMOFlags, AAInfo);
 16717 Stores.push_back(Store);
 16718 }
 16719 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
 16720}
16721
16722// Given a floating point store from an extracted vector, with an integer
16723// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16724// help reduce fp register pressure, doesn't require the fp extract and allows
16725// use of more integer post-inc stores not available with vstr.
 16727 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
 16728 return SDValue();
 16729 SDValue Extract = St->getValue();
 16730 EVT VT = Extract.getValueType();
 16731 // For now only uses f16. This may be useful for f32 too, but that will
 16732 // be bitcast(extract), not the VGETLANEu we currently check here.
 16733 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
 16734 return SDValue();
 16735
// Only fire if an equivalent integer VGETLANEu node already exists in the DAG
// (same vector and lane); getNodeIfExists never creates a new node.
 16736 SDNode *GetLane =
 16737 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
 16738 {Extract.getOperand(0), Extract.getOperand(1)});
 16739 if (!GetLane)
 16740 return SDValue();
 16741
 16742 LLVMContext &C = *DAG.getContext();
 16743 SDLoc DL(St);
 16744 // Create a new integer store to replace the existing floating point version.
 16745 SDValue Ch = St->getChain();
 16746 SDValue BasePtr = St->getBasePtr();
 16747 Align Alignment = St->getBaseAlign();
 16749 AAMDNodes AAInfo = St->getAAInfo();
// Store the i32 lane value as an integer of the fp type's width (i16 for
// f16) via a truncating store.
 16750 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
 16751 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
 16752 St->getPointerInfo(), NewToVT, Alignment,
 16753 MMOFlags, AAInfo);
 16754
 16755 return Store;
 16756}
16757
16758/// PerformSTORECombine - Target-specific dag combine xforms for
16759/// ISD::STORE.
/// Dispatches to the store-narrowing helpers above, then handles two special
/// cases itself: splitting a VMOVDRR store into two i32 stores, and storing an
/// i64 extracted from a vector through f64 to avoid i64 legalization.
 16762 const ARMSubtarget *Subtarget) {
 16764 if (St->isVolatile())
 16765 return SDValue();
 16766 SDValue StVal = St->getValue();
 16767 EVT VT = StVal.getValueType();
 16768
 16769 if (Subtarget->hasNEON())
 16770 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
 16771 return Store;
 16772
 16773 if (Subtarget->hasMVEFloatOps())
 16774 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
 16775 return NewToken;
 16776
 16777 if (Subtarget->hasMVEIntegerOps()) {
 16778 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
 16779 return NewChain;
 16780 if (SDValue NewToken =
 16782 return NewToken;
 16783 }
 16784
 16785 if (!ISD::isNormalStore(St))
 16786 return SDValue();
 16787
 16788 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
 16789 // ARM stores of arguments in the same cache line.
 16790 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
 16791 StVal.getNode()->hasOneUse()) {
 16792 SelectionDAG &DAG = DCI.DAG;
// On big-endian the two GPR halves are stored in the opposite order.
 16793 bool isBigEndian = DAG.getDataLayout().isBigEndian();
 16794 SDLoc DL(St);
 16795 SDValue BasePtr = St->getBasePtr();
 16796 SDValue NewST1 = DAG.getStore(
 16797 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
 16798 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
 16799 St->getMemOperand()->getFlags());
 16800
// Second half goes 4 bytes further; chain it after the first store.
 16801 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
 16802 DAG.getConstant(4, DL, MVT::i32));
 16803 return DAG.getStore(NewST1.getValue(0), DL,
 16804 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
 16805 OffsetPtr, St->getPointerInfo().getWithOffset(4),
 16806 St->getBaseAlign(), St->getMemOperand()->getFlags());
 16807 }
 16808
 16809 if (StVal.getValueType() == MVT::i64 &&
 16811
 16812 // Bitcast an i64 store extracted from a vector to f64.
 16813 // Otherwise, the i64 value will be legalized to a pair of i32 values.
 16814 SelectionDAG &DAG = DCI.DAG;
 16815 SDLoc dl(StVal);
 16816 SDValue IntVec = StVal.getOperand(0);
 16817 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
 16819 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
 16820 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
 16821 Vec, StVal.getOperand(1));
 16822 dl = SDLoc(N);
 16823 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
 16824 // Make the DAGCombiner fold the bitcasts.
 16825 DCI.AddToWorklist(Vec.getNode());
 16826 DCI.AddToWorklist(ExtElt.getNode());
 16827 DCI.AddToWorklist(V.getNode());
 16828 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
 16829 St->getPointerInfo(), St->getAlign(),
 16830 St->getMemOperand()->getFlags(), St->getAAInfo());
 16831 }
 16832
 16833 // If this is a legal vector store, try to combine it into a VST1_UPD.
 16834 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
 16836 return CombineBaseUpdate(N, DCI);
 16837
 16838 return SDValue();
 16839}
16840
16841/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16842/// can replace combinations of VMUL and VCVT (floating-point to integer)
16843/// when the VMUL has a constant operand that is a power of 2.
16844///
16845/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16846/// vmul.f32 d16, d17, d16
16847/// vcvt.s32.f32 d16, d16
16848/// becomes:
16849/// vcvt.s32.f32 d16, d16, #3
 16851 const ARMSubtarget *Subtarget) {
 16852 if (!Subtarget->hasNEON())
 16853 return SDValue();
 16854
 16855 SDValue Op = N->getOperand(0);
 16856 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
 16857 Op.getOpcode() != ISD::FMUL)
 16858 return SDValue();
 16859
 16860 SDValue ConstVec = Op->getOperand(1);
 16861 if (!isa<BuildVectorSDNode>(ConstVec))
 16862 return SDValue();
 16863
 16864 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
 16865 uint32_t FloatBits = FloatTy.getSizeInBits();
 16866 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
 16867 uint32_t IntBits = IntTy.getSizeInBits();
 16868 unsigned NumLanes = Op.getValueType().getVectorNumElements();
 16869 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
 16870 // These instructions only exist converting from f32 to i32. We can handle
 16871 // smaller integers by generating an extra truncate, but larger ones would
 16872 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
 16873 // these instructions only support v2i32/v4i32 types.
 16874 return SDValue();
 16875 }
 16876
// The multiplier must be a power-of-2 splat; C is its log2, which becomes the
// fixed-point fraction-bits immediate (#C) of the VCVT. The instruction
// supports 1..32 fraction bits, hence the range check.
 16877 BitVector UndefElements;
 16879 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
 16880 if (C == -1 || C == 0 || C > 32)
 16881 return SDValue();
 16882
 16883 SDLoc dl(N);
 16884 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
 16885 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
 16886 Intrinsic::arm_neon_vcvtfp2fxu;
 16887 SDValue FixConv = DAG.getNode(
 16888 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
 16889 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
 16890 DAG.getConstant(C, dl, MVT::i32));
 16891
// Narrower result types are produced by truncating the i32 conversion.
 16892 if (IntBits < FloatBits)
 16893 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
 16894
 16895 return FixConv;
 16896}
16897
// PerformFAddVSelectCombine: under MVE, rewrite
// (fadd x, (vselect c, y, identity)) as (vselect c, (fadd x, y), x) so the
// fadd can later become a predicated vadd/vfma.
 16899 const ARMSubtarget *Subtarget) {
 16900 if (!Subtarget->hasMVEFloatOps())
 16901 return SDValue();
 16902
 16903 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
 16904 // The second form can be more easily turned into a predicated vadd, and
 16905 // possibly combined into a fma to become a predicated vfma.
 16906 SDValue Op0 = N->getOperand(0);
 16907 SDValue Op1 = N->getOperand(1);
 16908 EVT VT = N->getValueType(0);
 16909 SDLoc DL(N);
 16910
 16911 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
 16912 // which these VMOV's represent.
// The magic immediates are the VMOVIMM encodings of a -0.0 splat for the
// given element type (1664 for f32 lanes, 2688 for f16 lanes); 0 encodes a
// +0.0 splat, acceptable only with no-signed-zeros.
 16913 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
 16914 if (Op.getOpcode() != ISD::BITCAST ||
 16915 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
 16916 return false;
 16917 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
 16918 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
 16919 return true;
 16920 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
 16921 return true;
 16922 return false;
 16923 };
 16924
// Canonicalize so the vselect (if any) is Op1; fadd is commutative.
 16925 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
 16926 std::swap(Op0, Op1);
 16927
 16928 if (Op1.getOpcode() != ISD::VSELECT)
 16929 return SDValue();
 16930
 16931 SDNodeFlags FaddFlags = N->getFlags();
 16932 bool NSZ = FaddFlags.hasNoSignedZeros();
 16933 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
 16934 return SDValue();
 16935
 16936 SDValue FAdd =
 16937 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
 16938 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
 16939}
16940
// PerformFADDVCMLACombine: fold an fadd into the accumulator operand of an
// MVE vcmla intrinsic. Requires the reassoc fast-math flag on the fadd.
 16942 SDValue LHS = N->getOperand(0);
 16943 SDValue RHS = N->getOperand(1);
 16944 EVT VT = N->getValueType(0);
 16945 SDLoc DL(N);
 16946
 16947 if (!N->getFlags().hasAllowReassociation())
 16948 return SDValue();
 16949
 16950 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
 16951 auto ReassocComplex = [&](SDValue A, SDValue B) {
 16952 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
 16953 return SDValue();
 16954 unsigned Opc = A.getConstantOperandVal(0);
 16955 if (Opc != Intrinsic::arm_mve_vcmlaq)
 16956 return SDValue();
// Rebuild the vcmla with operand 2 (the accumulator) replaced by
// fadd(accumulator, B); the rotation and multiplicand operands are kept.
 16957 SDValue VCMLA = DAG.getNode(
 16958 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
 16959 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
 16960 A.getOperand(3), A.getOperand(4));
// Preserve the original vcmla's fast-math flags on the new node.
 16961 VCMLA->setFlags(A->getFlags());
 16962 return VCMLA;
 16963 };
// Try both orders; fadd is commutative.
 16964 if (SDValue R = ReassocComplex(LHS, RHS))
 16965 return R;
 16966 if (SDValue R = ReassocComplex(RHS, LHS))
 16967 return R;
 16968
 16969 return SDValue();
 16970}
16971
// PerformFADDCombine: ISD::FADD combine entry point; tries the vselect fold
// then the vcmla reassociation, returning the first that fires.
 16973 const ARMSubtarget *Subtarget) {
 16974 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
 16975 return S;
 16976 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
 16977 return S;
 16978 return SDValue();
 16979}
16980
16981/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
16982/// can replace combinations of VCVT (integer to floating-point) and VMUL
16983/// when the VMUL has a constant operand that is a power of 2.
16984///
16985/// Example (assume d17 = <float 0.125, float 0.125>):
16986/// vcvt.f32.s32 d16, d16
16987/// vmul.f32 d16, d16, d17
16988/// becomes:
16989/// vcvt.f32.s32 d16, d16, #3
 16991 const ARMSubtarget *Subtarget) {
 16992 if (!Subtarget->hasNEON())
 16993 return SDValue();
 16994
 16995 SDValue Op = N->getOperand(0);
 16996 unsigned OpOpcode = Op.getNode()->getOpcode();
 16997 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
 16998 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
 16999 return SDValue();
 17000
 17001 SDValue ConstVec = N->getOperand(1);
 17002 if (!isa<BuildVectorSDNode>(ConstVec))
 17003 return SDValue();
 17004
 17005 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
 17006 uint32_t FloatBits = FloatTy.getSizeInBits();
 17007 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
 17008 uint32_t IntBits = IntTy.getSizeInBits();
 17009 unsigned NumLanes = Op.getValueType().getVectorNumElements();
 17010 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
 17011 // These instructions only exist converting from i32 to f32. We can handle
 17012 // smaller integers by generating an extra extend, but larger ones would
 17013 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
 17014 // these instructions only support v2i32/v4i32 types.
 17015 return SDValue();
 17016 }
 17017
// The multiplier must be a constant splat whose exact reciprocal exists;
// e.g. 0.125 has reciprocal 8.
 17018 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
 17019 APFloat Recip(0.0f);
 17020 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
 17021 return SDValue();
 17022
// The reciprocal must convert exactly to an integer (33 bits so 2^32 fits).
 17023 bool IsExact;
 17024 APSInt IntVal(33);
 17025 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
 17026 APFloat::opOK ||
 17027 !IsExact)
 17028 return SDValue();
 17029
// C = log2 of the reciprocal, used as the VCVT fraction-bits immediate;
// the instruction supports 1..32 fraction bits.
 17030 int32_t C = IntVal.exactLogBase2();
 17031 if (C == -1 || C == 0 || C > 32)
 17032 return SDValue();
 17033
 17034 SDLoc DL(N);
 17035 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
 17036 SDValue ConvInput = Op.getOperand(0);
// Narrower integer inputs are widened to i32 lanes first, matching the
// conversion's signedness.
 17037 if (IntBits < FloatBits)
 17038 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
 17039 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
 17040
 17041 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
 17042 : Intrinsic::arm_neon_vcvtfxu2fp;
 17043 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
 17044 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
 17045 DAG.getConstant(C, DL, MVT::i32));
 17046}
17047
17049 const ARMSubtarget *ST) {
17050 if (!ST->hasMVEIntegerOps())
17051 return SDValue();
17052
17053 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17054 EVT ResVT = N->getValueType(0);
17055 SDValue N0 = N->getOperand(0);
17056 SDLoc dl(N);
17057
17058 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17059 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17060 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17061 N0.getValueType() == MVT::v16i8)) {
17062 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17063 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17064 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17065 }
17066
17067 // We are looking for something that will have illegal types if left alone,
17068 // but that we can convert to a single instruction under MVE. For example
17069 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17070 // or
17071 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17072
17073 // The legal cases are:
17074 // VADDV u/s 8/16/32
17075 // VMLAV u/s 8/16/32
17076 // VADDLV u/s 32
17077 // VMLALV u/s 16/32
17078
17079 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17080 // extend it and use v4i32 instead.
17081 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17082 EVT AVT = A.getValueType();
17083 return any_of(ExtTypes, [&](MVT Ty) {
17084 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17085 AVT.bitsLE(Ty);
17086 });
17087 };
17088 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17089 EVT AVT = A.getValueType();
17090 if (!AVT.is128BitVector())
17091 A = DAG.getNode(
17092 ExtendCode, dl,
17094 *DAG.getContext(),
17096 A);
17097 return A;
17098 };
17099 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17100 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17101 return SDValue();
17102 SDValue A = N0->getOperand(0);
17103 if (ExtTypeMatches(A, ExtTypes))
17104 return ExtendIfNeeded(A, ExtendCode);
17105 return SDValue();
17106 };
17107 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17108 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17109 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17111 return SDValue();
17112 Mask = N0->getOperand(0);
17113 SDValue Ext = N0->getOperand(1);
17114 if (Ext->getOpcode() != ExtendCode)
17115 return SDValue();
17116 SDValue A = Ext->getOperand(0);
17117 if (ExtTypeMatches(A, ExtTypes))
17118 return ExtendIfNeeded(A, ExtendCode);
17119 return SDValue();
17120 };
17121 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17122 SDValue &A, SDValue &B) {
17123 // For a vmla we are trying to match a larger pattern:
17124 // ExtA = sext/zext A
17125 // ExtB = sext/zext B
17126 // Mul = mul ExtA, ExtB
17127 // vecreduce.add Mul
17128 // There might also be en extra extend between the mul and the addreduce, so
17129 // long as the bitwidth is high enough to make them equivalent (for example
17130 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17131 if (ResVT != RetTy)
17132 return false;
17133 SDValue Mul = N0;
17134 if (Mul->getOpcode() == ExtendCode &&
17135 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17136 ResVT.getScalarSizeInBits())
17137 Mul = Mul->getOperand(0);
17138 if (Mul->getOpcode() != ISD::MUL)
17139 return false;
17140 SDValue ExtA = Mul->getOperand(0);
17141 SDValue ExtB = Mul->getOperand(1);
17142 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17143 return false;
17144 A = ExtA->getOperand(0);
17145 B = ExtB->getOperand(0);
17146 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17147 A = ExtendIfNeeded(A, ExtendCode);
17148 B = ExtendIfNeeded(B, ExtendCode);
17149 return true;
17150 }
17151 return false;
17152 };
17153 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17154 SDValue &A, SDValue &B, SDValue &Mask) {
17155 // Same as the pattern above with a select for the zero predicated lanes
17156 // ExtA = sext/zext A
17157 // ExtB = sext/zext B
17158 // Mul = mul ExtA, ExtB
17159 // N0 = select Mask, Mul, 0
17160 // vecreduce.add N0
17161 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17163 return false;
17164 Mask = N0->getOperand(0);
17165 SDValue Mul = N0->getOperand(1);
17166 if (Mul->getOpcode() == ExtendCode &&
17167 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17168 ResVT.getScalarSizeInBits())
17169 Mul = Mul->getOperand(0);
17170 if (Mul->getOpcode() != ISD::MUL)
17171 return false;
17172 SDValue ExtA = Mul->getOperand(0);
17173 SDValue ExtB = Mul->getOperand(1);
17174 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17175 return false;
17176 A = ExtA->getOperand(0);
17177 B = ExtB->getOperand(0);
17178 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17179 A = ExtendIfNeeded(A, ExtendCode);
17180 B = ExtendIfNeeded(B, ExtendCode);
17181 return true;
17182 }
17183 return false;
17184 };
17185 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17186 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17187 // reductions. The operands are extended with MVEEXT, but as they are
17188 // reductions the lane orders do not matter. MVEEXT may be combined with
17189 // loads to produce two extending loads, or else they will be expanded to
17190 // VREV/VMOVL.
17191 EVT VT = Ops[0].getValueType();
17192 if (VT == MVT::v16i8) {
17193 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17194 "Unexpected illegal long reduction opcode");
17195 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17196
17197 SDValue Ext0 =
17198 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17199 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17200 SDValue Ext1 =
17201 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17202 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17203
17204 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17205 Ext0, Ext1);
17206 SDValue MLA1 =
17207 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17208 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17209 Ext0.getValue(1), Ext1.getValue(1));
17210 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17211 }
17212 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17213 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17214 SDValue(Node.getNode(), 1));
17215 };
17216
17217 SDValue A, B;
17218 SDValue Mask;
17219 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17220 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17221 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17222 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17223 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17224 A, B))
17225 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17226 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17227 A, B))
17228 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17229 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17230 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17231 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17232 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17233 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17234 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17235
17236 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17237 Mask))
17238 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17239 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17240 Mask))
17241 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17242 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17243 Mask))
17244 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17245 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17246 Mask))
17247 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17248 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17249 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17250 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17251 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17252 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17253 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17254
17255 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17256 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17257 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17258 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17259 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17260 return Create64bitNode(ARMISD::VADDLVs, {A});
17261 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17262 return Create64bitNode(ARMISD::VADDLVu, {A});
17263 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17264 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17265 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17266 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17267 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17268 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17269
17270 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17271 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17272 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17273 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17274 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17275 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17276 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17277 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17278 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17279 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17280 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17281 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17282 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17283 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17284
17285 // Some complications. We can get a case where the two inputs of the mul are
17286 // the same, then the output sext will have been helpfully converted to a
17287 // zext. Turn it back.
17288 SDValue Op = N0;
17289 if (Op->getOpcode() == ISD::VSELECT)
17290 Op = Op->getOperand(1);
17291 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17292 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17293 SDValue Mul = Op->getOperand(0);
17294 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17295 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17296 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17297 if (Op != N0)
17298 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17299 N0->getOperand(0), Ext, N0->getOperand(2));
17300 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17301 }
17302 }
17303
17304 return SDValue();
17305}
17306
17307// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17308// the lanes are used. Due to the reduction being commutative the shuffle can be
17309// removed.
// NOTE(review): the function signature line (original line 17310) is not
// visible in this extract.
// Find the first vector operand: it is operand 0 when operand 0 is a vector,
// otherwise operand 2 (scalar operands, e.g. accumulators, come first then).
17311 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17312 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
// Only handle one-input shuffles (second shuffle source must be undef).
17313 if (!Shuf || !Shuf->getOperand(1).isUndef())
17314 return SDValue();
17315
17316 // Check all elements are used once in the mask.
17317 ArrayRef<int> Mask = Shuf->getMask();
17318 APInt SetElts(Mask.size(), 0);
17319 for (int E : Mask) {
// Reject undef (-1) or out-of-range mask entries.
17320 if (E < 0 || E >= (int)Mask.size())
17321 return SDValue();
17322 SetElts.setBit(E);
17323 }
// Mask.size() in-range entries covering all Mask.size() lanes means, by
// pigeonhole, each lane is referenced exactly once — a pure permutation.
17324 if (!SetElts.isAllOnes())
17325 return SDValue();
17326
// If the node has a second vector operand (the vmlav case), it must be a
// one-input shuffle using the identical mask so both inputs permute alike.
17327 if (N->getNumOperands() != VecOp + 1) {
17328 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17329 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17330 return SDValue();
17331 }
17332
// Rebuild the node on the shuffles' un-permuted inputs; the reduction sums
// all lanes, so lane order does not matter. Scalar operands pass through.
// NOTE(review): the declaration of Ops (original line 17333, presumably a
// SmallVector<SDValue>) is not visible in this extract.
17334 for (SDValue Op : N->ops()) {
17335 if (Op.getValueType().isVector())
17336 Ops.push_back(Op.getOperand(0));
17337 else
17338 Ops.push_back(Op);
17339 }
17340 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17341}
17342
// DAG combine for ARMISD::VMOVN (insert narrowed lanes into the top or bottom
// half of Qd): fold away undef operands, merge a feeding bottom-lane VQMOVN,
// and shrink the demanded lanes of both inputs.
// NOTE(review): the function signature lines (original 17343-17344) are not
// visible in this extract.
17345 SDValue Op0 = N->getOperand(0);
17346 SDValue Op1 = N->getOperand(1);
// Operand 2 selects top (1) vs bottom (0) lane insertion.
17347 unsigned IsTop = N->getConstantOperandVal(2);
17348
17349 // VMOVNT a undef -> a
17350 // VMOVNB a undef -> a
17351 // VMOVNB undef a -> a
17352 if (Op1->isUndef())
17353 return Op0;
17354 if (Op0->isUndef() && !IsTop)
17355 return Op1;
17356
17357 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17358 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17359 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17360 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17361 Op1->getConstantOperandVal(2) == 0)
17362 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17363 Op0, Op1->getOperand(1), N->getOperand(2));
17364
17365 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17366 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17367 // into the top or bottom lanes.
17368 unsigned NumElts = N->getValueType(0).getVectorNumElements();
// getSplat with the 2-bit pattern 0b01 marks every even (bottom) lane.
17369 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17370 APInt Op0DemandedElts =
17371 IsTop ? Op1DemandedElts
17372 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17373
// If either input simplifies under the reduced demand, the node was updated
// in place; return it to signal the combiner.
17374 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17375 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17376 return SDValue(N, 0);
17377 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17378 return SDValue(N, 0);
17379
17380 return SDValue();
17381}
17382
// DAG combine for ARMISD::VQMOVN: only half of Qd's (operand 0's) lanes are
// demanded — the bottom lanes when inserting into the top (IsTop), otherwise
// the top lanes — so try to simplify Qd under that reduced demand.
// NOTE(review): the function signature lines (original 17383-17384) are not
// visible in this extract.
17385 SDValue Op0 = N->getOperand(0);
17386 unsigned IsTop = N->getConstantOperandVal(2);
17387
17388 unsigned NumElts = N->getValueType(0).getVectorNumElements();
// Repeating 2-bit pattern: 0b01 = even/bottom lanes, 0b10 = odd/top lanes.
17389 APInt Op0DemandedElts =
17390 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17391 : APInt::getHighBitsSet(2, 1));
17392
17393 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17394 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17395 return SDValue(N, 0);
17396 return SDValue();
17397}
17398
// Reassociate VQDMULH(shuffle(a), shuffle(b)) into shuffle(VQDMULH(a, b)) so
// the multiply runs on the un-permuted vectors and the two shuffles collapse
// into one after the result.
// NOTE(review): the function signature lines (original 17399-17400) are not
// visible in this extract.
17401 EVT VT = N->getValueType(0);
17402 SDValue LHS = N->getOperand(0);
17403 SDValue RHS = N->getOperand(1);
17404
17405 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17406 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17407 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
// Both must be one-input shuffles with identical masks, and at least one
// shuffle must die (single use) or both be the same node, so the transform
// does not duplicate work.
17408 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17409 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17410 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17411 SDLoc DL(N);
17412 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17413 LHS.getOperand(0), RHS.getOperand(0))
17414 SDValue UndefV = LHS.getOperand(1);
17415 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17416 }
17417 return SDValue();
17418}
17419
// DAG combine for ARMISD::LSLL/LSRL long shifts (two-result nodes taking two
// value operands plus an i32 shift amount).
// NOTE(review): the function signature line (original 17420) is not visible
// in this extract.
17421 SDLoc DL(N);
17422 SDValue Op0 = N->getOperand(0);
17423 SDValue Op1 = N->getOperand(1);
17424
17425 // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from
17426 // uses of the intrinsics.
17427 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17428 int ShiftAmt = C->getSExtValue();
// Shift by zero is the identity: forward both input values to all users and
// drop the node. Returning an empty SDValue after the RAUW is intentional —
// all uses have already been rewritten.
17429 if (ShiftAmt == 0) {
17430 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17431 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17432 return SDValue();
17433 }
17434
// A negative amount in [-32, -1] becomes a positive shift in the opposite
// direction (LSLL <-> LSRL).
17435 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17436 unsigned NewOpcode =
17437 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17438 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17439 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17440 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17441 return NewShift;
17442 }
17443 }
17444
17445 return SDValue();
17446}
17447
17448/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
// NOTE(review): the first line of the signature (original 17449) is not
// visible in this extract; this is a member of ARMTargetLowering taking
// (SDNode *N, DAGCombinerInfo &DCI) per the visible continuation below.
17450 DAGCombinerInfo &DCI) const {
17451 SelectionDAG &DAG = DCI.DAG;
// Operand 0 of an INTRINSIC_* node is the intrinsic ID constant.
17452 unsigned IntNo = N->getConstantOperandVal(0);
17453 switch (IntNo) {
17454 default:
17455 // Don't do anything for most intrinsics.
17456 break;
17457
17458 // Vector shifts: check for immediate versions and lower them.
17459 // Note: This is done during DAG combining instead of DAG legalizing because
17460 // the build_vectors for 64-bit vector element shift counts are generally
17461 // not legal, and it is hard to see their values after they get legalized to
17462 // loads from a constant pool.
17463 case Intrinsic::arm_neon_vshifts:
17464 case Intrinsic::arm_neon_vshiftu:
17465 case Intrinsic::arm_neon_vrshifts:
17466 case Intrinsic::arm_neon_vrshiftu:
17467 case Intrinsic::arm_neon_vrshiftn:
17468 case Intrinsic::arm_neon_vqshifts:
17469 case Intrinsic::arm_neon_vqshiftu:
17470 case Intrinsic::arm_neon_vqshiftsu:
17471 case Intrinsic::arm_neon_vqshiftns:
17472 case Intrinsic::arm_neon_vqshiftnu:
17473 case Intrinsic::arm_neon_vqshiftnsu:
17474 case Intrinsic::arm_neon_vqrshiftns:
17475 case Intrinsic::arm_neon_vqrshiftnu:
17476 case Intrinsic::arm_neon_vqrshiftnsu: {
17477 EVT VT = N->getOperand(1).getValueType();
17478 int64_t Cnt;
17479 unsigned VShiftOpc = 0;
17480
// First pass: validate the constant shift amount for each intrinsic family
// (left vs right, widening vs narrowing), and for the plain vshift intrinsics
// also choose the opcode here since left/right depends on the amount's sign.
17481 switch (IntNo) {
17482 case Intrinsic::arm_neon_vshifts:
17483 case Intrinsic::arm_neon_vshiftu:
17484 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17485 VShiftOpc = ARMISD::VSHLIMM;
17486 break;
17487 }
17488 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17489 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17490 : ARMISD::VSHRuIMM);
17491 break;
17492 }
17493 return SDValue();
17494
17495 case Intrinsic::arm_neon_vrshifts:
17496 case Intrinsic::arm_neon_vrshiftu:
17497 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17498 break;
17499 return SDValue();
17500
17501 case Intrinsic::arm_neon_vqshifts:
17502 case Intrinsic::arm_neon_vqshiftu:
17503 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17504 break;
17505 return SDValue();
17506
17507 case Intrinsic::arm_neon_vqshiftsu:
17508 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17509 break;
17510 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17511
17512 case Intrinsic::arm_neon_vrshiftn:
17513 case Intrinsic::arm_neon_vqshiftns:
17514 case Intrinsic::arm_neon_vqshiftnu:
17515 case Intrinsic::arm_neon_vqshiftnsu:
17516 case Intrinsic::arm_neon_vqrshiftns:
17517 case Intrinsic::arm_neon_vqrshiftnu:
17518 case Intrinsic::arm_neon_vqrshiftnsu:
17519 // Narrowing shifts require an immediate right shift.
17520 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17521 break;
17522 llvm_unreachable("invalid shift count for narrowing vector shift "
17523 "intrinsic");
17524
17525 default:
17526 llvm_unreachable("unhandled vector shift");
17527 }
17528
// Second pass: map each remaining intrinsic to its ARMISD immediate-shift
// opcode (the plain vshifts/vshiftu already set VShiftOpc above).
17529 switch (IntNo) {
17530 case Intrinsic::arm_neon_vshifts:
17531 case Intrinsic::arm_neon_vshiftu:
17532 // Opcode already set above.
17533 break;
17534 case Intrinsic::arm_neon_vrshifts:
17535 VShiftOpc = ARMISD::VRSHRsIMM;
17536 break;
17537 case Intrinsic::arm_neon_vrshiftu:
17538 VShiftOpc = ARMISD::VRSHRuIMM;
17539 break;
17540 case Intrinsic::arm_neon_vrshiftn:
17541 VShiftOpc = ARMISD::VRSHRNIMM;
17542 break;
17543 case Intrinsic::arm_neon_vqshifts:
17544 VShiftOpc = ARMISD::VQSHLsIMM;
17545 break;
17546 case Intrinsic::arm_neon_vqshiftu:
17547 VShiftOpc = ARMISD::VQSHLuIMM;
17548 break;
17549 case Intrinsic::arm_neon_vqshiftsu:
17550 VShiftOpc = ARMISD::VQSHLsuIMM;
17551 break;
17552 case Intrinsic::arm_neon_vqshiftns:
17553 VShiftOpc = ARMISD::VQSHRNsIMM;
17554 break;
17555 case Intrinsic::arm_neon_vqshiftnu:
17556 VShiftOpc = ARMISD::VQSHRNuIMM;
17557 break;
17558 case Intrinsic::arm_neon_vqshiftnsu:
17559 VShiftOpc = ARMISD::VQSHRNsuIMM;
17560 break;
17561 case Intrinsic::arm_neon_vqrshiftns:
17562 VShiftOpc = ARMISD::VQRSHRNsIMM;
17563 break;
17564 case Intrinsic::arm_neon_vqrshiftnu:
17565 VShiftOpc = ARMISD::VQRSHRNuIMM;
17566 break;
17567 case Intrinsic::arm_neon_vqrshiftnsu:
17568 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17569 break;
17570 }
17571
17572 SDLoc dl(N);
17573 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17574 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17575 }
17576
// Shift-and-insert: operand 3 is the shift amount, left -> VSLI, right -> VSRI.
17577 case Intrinsic::arm_neon_vshiftins: {
17578 EVT VT = N->getOperand(1).getValueType();
17579 int64_t Cnt;
17580 unsigned VShiftOpc = 0;
17581
17582 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17583 VShiftOpc = ARMISD::VSLIIMM;
17584 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17585 VShiftOpc = ARMISD::VSRIIMM;
17586 else {
17587 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17588 }
17589
17590 SDLoc dl(N);
17591 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17592 N->getOperand(1), N->getOperand(2),
17593 DAG.getConstant(Cnt, dl, MVT::i32));
17594 }
17595
17596 case Intrinsic::arm_neon_vqrshifts:
17597 case Intrinsic::arm_neon_vqrshiftu:
17598 // No immediate versions of these to check for.
17599 break;
17600
17601 case Intrinsic::arm_neon_vbsl: {
17602 SDLoc dl(N);
17603 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17604 N->getOperand(2), N->getOperand(3));
17605 }
17606 case Intrinsic::arm_mve_vqdmlah:
17607 case Intrinsic::arm_mve_vqdmlash:
17608 case Intrinsic::arm_mve_vqrdmlah:
17609 case Intrinsic::arm_mve_vqrdmlash:
17610 case Intrinsic::arm_mve_vmla_n_predicated:
17611 case Intrinsic::arm_mve_vmlas_n_predicated:
17612 case Intrinsic::arm_mve_vqdmlah_predicated:
17613 case Intrinsic::arm_mve_vqdmlash_predicated:
17614 case Intrinsic::arm_mve_vqrdmlah_predicated:
17615 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17616 // These intrinsics all take an i32 scalar operand which is narrowed to the
17617 // size of a single lane of the vector type they return. So we don't need
17618 // any bits of that operand above that point, which allows us to eliminate
17619 // uxth/sxth.
17620 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17621 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17622 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17623 return SDValue();
17624 break;
17625 }
17626
17627 case Intrinsic::arm_mve_minv:
17628 case Intrinsic::arm_mve_maxv:
17629 case Intrinsic::arm_mve_minav:
17630 case Intrinsic::arm_mve_maxav:
17631 case Intrinsic::arm_mve_minv_predicated:
17632 case Intrinsic::arm_mve_maxv_predicated:
17633 case Intrinsic::arm_mve_minav_predicated:
17634 case Intrinsic::arm_mve_maxav_predicated: {
17635 // These intrinsics all take an i32 scalar operand which is narrowed to the
17636 // size of a single lane of the vector type they take as the other input.
17637 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17638 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17639 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17640 return SDValue();
17641 break;
17642 }
17643
17644 case Intrinsic::arm_mve_addv: {
17645 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17646 // which allow PerformADDVecReduce to turn it into VADDLV when possible.
17647 bool Unsigned = N->getConstantOperandVal(2);
17648 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17649 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17650 }
17651
17652 case Intrinsic::arm_mve_addlv:
17653 case Intrinsic::arm_mve_addlv_predicated: {
17654 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17655 // which recombines the two outputs into an i64
17656 bool Unsigned = N->getConstantOperandVal(2);
17657 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17658 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17659 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17660
// NOTE(review): the declaration of Ops (original line 17661, presumably a
// SmallVector<SDValue>) is not visible in this extract.
17662 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17663 if (i != 2) // skip the unsigned flag
17664 Ops.push_back(N->getOperand(i));
17665
17666 SDLoc dl(N);
17667 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17668 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17669 val.getValue(1));
17670 }
17671 }
17672
17673 return SDValue();
17674}
17675
17676/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17677/// lowers them. As with the vector shift intrinsics, this is done during DAG
17678/// combining instead of DAG legalizing because the build_vectors for 64-bit
17679/// vector element shift counts are generally not legal, and it is hard to see
17680/// their values after they get legalized to loads from a constant pool.
// NOTE(review): the first signature lines (original 17681-17682) are not
// visible in this extract.
17683 const ARMSubtarget *ST) {
17684 SelectionDAG &DAG = DCI.DAG;
17685 EVT VT = N->getValueType(0);
17686
// Thumb1 scalar special case: rewrite (shl (and x, Mask), Amt) with a
// low-bit-run Mask as (srl (shl x, MaskedBits), MaskedBits - Amt), which
// avoids materializing the AND mask constant.
17687 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17688 N->getOperand(0)->getOpcode() == ISD::AND &&
17689 N->getOperand(0)->hasOneUse()) {
17690 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17691 return SDValue();
17692 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17693 // usually show up because instcombine prefers to canonicalize it to
17694 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17695 // out of GEP lowering in some cases.
17696 SDValue N0 = N->getOperand(0);
17697 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17698 if (!ShiftAmtNode)
17699 return SDValue();
17700 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17701 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17702 if (!AndMaskNode)
17703 return SDValue();
17704 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17705 // Don't transform uxtb/uxth.
17706 if (AndMask == 255 || AndMask == 65535)
17707 return SDValue();
17708 if (isMask_32(AndMask)) {
17709 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17710 if (MaskedBits > ShiftAmt) {
17711 SDLoc DL(N);
17712 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17713 DAG.getConstant(MaskedBits, DL, MVT::i32));
17714 return DAG.getNode(
17715 ISD::SRL, DL, MVT::i32, SHL,
17716 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17717 }
17718 }
17719 }
17720
17721 // Nothing to be done for scalar shifts.
17722 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17723 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17724 return SDValue();
// MVE vector shifts are not combined here; bail out.
17725 if (ST->hasMVEIntegerOps())
17726 return SDValue();
17727
17728 int64_t Cnt;
17729
// NEON vector shifts by an immediate: lower directly to the ARMISD
// immediate-shift nodes.
17730 switch (N->getOpcode()) {
17731 default: llvm_unreachable("unexpected shift opcode");
17732
17733 case ISD::SHL:
17734 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17735 SDLoc dl(N);
17736 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17737 DAG.getConstant(Cnt, dl, MVT::i32));
17738 }
17739 break;
17740
17741 case ISD::SRA:
17742 case ISD::SRL:
17743 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17744 unsigned VShiftOpc =
17745 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17746 SDLoc dl(N);
17747 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17748 DAG.getConstant(Cnt, dl, MVT::i32));
17749 }
17750 }
17751 return SDValue();
17752}
17753
17754// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
17755// split into multiple extending loads, which are simpler to deal with than an
17756// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17757// to convert the type to an f32.
// NOTE(review): the function signature line (original 17758) is not visible
// in this extract.
17759 SDValue N0 = N->getOperand(0);
17760 if (N0.getOpcode() != ISD::LOAD)
17761 return SDValue();
// NOTE(review): the cast of N0 to LoadSDNode (defining LD, original line
// 17762) is not visible in this extract.
// Only simple, non-indexed, non-extending loads with a single user qualify.
17763 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17764 LD->getExtensionType() != ISD::NON_EXTLOAD)
17765 return SDValue();
17766 EVT FromVT = LD->getValueType(0);
17767 EVT ToVT = N->getValueType(0);
17768 if (!ToVT.isVector())
17769 return SDValue();
17771 EVT ToEltVT = ToVT.getVectorElementType();
17772 EVT FromEltVT = FromVT.getVectorElementType();
17773
// Split into 4-element chunks for the supported extends; anything else (or a
// load that is already the chunk size, except for f16) is left alone.
17774 unsigned NumElements = 0;
17775 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17776 NumElements = 4;
17777 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17778 NumElements = 4;
17779 if (NumElements == 0 ||
17780 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17781 FromVT.getVectorNumElements() % NumElements != 0 ||
17782 !isPowerOf2_32(NumElements))
17783 return SDValue();
17784
17785 LLVMContext &C = *DAG.getContext();
17786 SDLoc DL(LD);
17787 // Details about the old load
17788 SDValue Ch = LD->getChain();
17789 SDValue BasePtr = LD->getBasePtr();
17790 Align Alignment = LD->getBaseAlign();
17791 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17792 AAMDNodes AAInfo = LD->getAAInfo();
17793
17794 ISD::LoadExtType NewExtType =
17795 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17796 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
// For f16->f32 the chunks are loaded as integers; FromEltVT/ToEltVT are
// mapped to integer types of the same width here.
17797 EVT NewFromVT = EVT::getVectorVT(
17798 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17799 EVT NewToVT = EVT::getVectorVT(
17800 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17801
// Emit one extending load per NumElements-wide chunk, each offset in bytes
// from the base pointer.
// NOTE(review): the declarations of Loads and Chains (original lines
// 17802-17803, presumably SmallVector<SDValue>) are not visible here.
17804 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17805 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17806 SDValue NewPtr =
17807 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17808
17809 SDValue NewLoad =
17810 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17811 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17812 Alignment, MMOFlags, AAInfo);
17813 Loads.push_back(NewLoad);
17814 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17815 }
17816
17817 // Float truncs need to extended with VCVTB's into their floating point types.
17818 if (FromEltVT == MVT::f16) {
17820
// Reinterpret each integer chunk as v8f16 and widen the bottom lanes to
// v4f32 with VCVTL (bottom form, selected by the 0 operand).
17821 for (unsigned i = 0; i < Loads.size(); i++) {
17822 SDValue LoadBC =
17823 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17824 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17825 DAG.getConstant(0, DL, MVT::i32));
17826 Extends.push_back(FPExt);
17827 }
17828
17829 Loads = Extends;
17830 }
17831
// Stitch the new load chains together and redirect the old load's chain
// users before returning the concatenated result vector.
17832 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17833 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17834 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17835}
17836
17837/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17838/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
// NOTE(review): the function signature line (original 17839) is not visible
// in this extract.
17841 SDValue N0 = N->getOperand(0);
17842
17843 // Check for sign- and zero-extensions of vector extract operations of 8- and
17844 // 16-bit vector elements. NEON and MVE support these directly. They are
17845 // handled during DAG combining because type legalization will promote them
17846 // to 32-bit types and it is messy to recognize the operations after that.
// NOTE(review): the second half of this condition (original line 17848,
// presumably matching N0 against ISD::EXTRACT_VECTOR_ELT) is not visible.
17847 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17849 SDValue Vec = N0.getOperand(0);
17850 SDValue Lane = N0.getOperand(1);
17851 EVT VT = N->getValueType(0);
17852 EVT EltVT = N0.getValueType();
17853 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17854
// VGETLANE only applies when extracting an i8/i16 element to i32 from a
// legal vector type with a constant lane index.
17855 if (VT == MVT::i32 &&
17856 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17857 TLI.isTypeLegal(Vec.getValueType()) &&
17858 isa<ConstantSDNode>(Lane)) {
17859
17860 unsigned Opc = 0;
17861 switch (N->getOpcode()) {
17862 default: llvm_unreachable("unexpected opcode");
17863 case ISD::SIGN_EXTEND:
17864 Opc = ARMISD::VGETLANEs;
17865 break;
17866 case ISD::ZERO_EXTEND:
17867 case ISD::ANY_EXTEND:
17868 Opc = ARMISD::VGETLANEu;
17869 break;
17870 }
17871 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17872 }
17873 }
17874
// On MVE, an extend of a larger-than-legal load can be split into several
// extending loads instead.
17875 if (ST->hasMVEIntegerOps())
17876 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17877 return NewLoad;
17878
17879 return SDValue();
17880}
17881
// DAG combine for fp extends: on MVE float targets, try to split an extend of
// a larger-than-legal load into multiple widening loads.
// NOTE(review): the first signature line (original 17882) is not visible in
// this extract.
17883 const ARMSubtarget *ST) {
17884 if (ST->hasMVEFloatOps())
17885 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17886 return NewLoad;
17887
17888 return SDValue();
17889}
17890
17891// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17892// constant bounds.
// NOTE(review): the first signature line (original 17893) is not visible in
// this extract.
17894 const ARMSubtarget *Subtarget) {
// SSAT/USAT require ARM-mode v6+ or Thumb2.
17895 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17896 !Subtarget->isThumb2())
17897 return SDValue();
17898
17899 EVT VT = Op.getValueType();
17900 SDValue Op0 = Op.getOperand(0);
17901
// Match an i32 min/max pair with constant bounds on both nodes.
// NOTE(review): the last clause of this condition (original line 17905,
// presumably checking Op0's constant operand) is not visible here.
17902 if (VT != MVT::i32 ||
17903 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17904 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17906 return SDValue();
17907
// Canonicalize so Min is the smin and Max the smax, whichever order they
// were matched in.
17908 SDValue Min = Op;
17909 SDValue Max = Op0;
17910 SDValue Input = Op0.getOperand(0);
17911 if (Min.getOpcode() == ISD::SMAX)
17912 std::swap(Min, Max);
17913
17914 APInt MinC = Min.getConstantOperandAPInt(1);
17915 APInt MaxC = Max.getConstantOperandAPInt(1);
17916
// The upper bound must be 2^k - 1 for some k (so MinC+1 is a power of two).
17917 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17918 !(MinC + 1).isPowerOf2())
17919 return SDValue();
17920
// Symmetric range [-2^k, 2^k-1] -> SSAT; [0, 2^k-1] -> USAT. The saturation
// width is the number of trailing ones in MinC.
17921 SDLoc DL(Op);
17922 if (MinC == ~MaxC)
17923 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17924 DAG.getConstant(MinC.countr_one(), DL, VT));
17925 if (MaxC == 0)
17926 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17927 DAG.getConstant(MinC.countr_one(), DL, VT));
17928
17929 return SDValue();
17930}
17931
17932/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17933/// saturates.
// NOTE(review): the function signature line (original 17934) is not visible
// in this extract.
17935 EVT VT = N->getValueType(0);
17936 SDValue N0 = N->getOperand(0);
17937
// Scalar i32 min/max chains are handled by the ssat/usat combine instead.
17938 if (VT == MVT::i32)
17939 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17940
17941 if (!ST->hasMVEIntegerOps())
17942 return SDValue();
17943
17944 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17945 return V;
17946
17947 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17948 return SDValue();
17949
// Match smin/smax pair (in either order) splat-clamping to the signed range
// of the half-width element type: [-2^(h-1), 2^(h-1)-1].
17950 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17951 // Check one is a smin and the other is a smax
17952 if (Min->getOpcode() != ISD::SMIN)
17953 std::swap(Min, Max);
17954 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17955 return false;
17956
17957 APInt SaturateC;
17958 if (VT == MVT::v4i32)
17959 SaturateC = APInt(32, (1 << 15) - 1, true);
17960 else //if (VT == MVT::v8i16)
17961 SaturateC = APInt(16, (1 << 7) - 1, true);
17962
17963 APInt MinC, MaxC;
17964 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17965 MinC != SaturateC)
17966 return false;
17967 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
17968 MaxC != ~SaturateC)
17969 return false;
17970 return true;
17971 };
17972
17973 if (IsSignedSaturate(N, N0.getNode())) {
17974 SDLoc DL(N);
17975 MVT ExtVT, HalfVT;
17976 if (VT == MVT::v4i32) {
17977 HalfVT = MVT::v8i16;
17978 ExtVT = MVT::v4i16;
17979 } else { // if (VT == MVT::v8i16)
17980 HalfVT = MVT::v16i8;
17981 ExtVT = MVT::v8i8;
17982 }
17983
17984 // Create a VQMOVNB with undef top lanes, then signed extended into the top
17985 // half. That extend will hopefully be removed if only the bottom bits are
17986 // demanded (though a truncating store, for example).
17987 SDValue VQMOVN =
17988 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
17989 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
17990 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17991 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
17992 DAG.getValueType(ExtVT));
17993 }
17994
// Unsigned case: a single umin splat-clamping to the unsigned max of the
// half-width element type (0xffff / 0xff).
17995 auto IsUnsignedSaturate = [&](SDNode *Min) {
17996 // For unsigned, we just need to check for <= 0xffff
17997 if (Min->getOpcode() != ISD::UMIN)
17998 return false;
17999
18000 APInt SaturateC;
18001 if (VT == MVT::v4i32)
18002 SaturateC = APInt(32, (1 << 16) - 1, true);
18003 else //if (VT == MVT::v8i16)
18004 SaturateC = APInt(16, (1 << 8) - 1, true);
18005
18006 APInt MinC;
18007 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18008 MinC != SaturateC)
18009 return false;
18010 return true;
18011 };
18012
18013 if (IsUnsignedSaturate(N)) {
18014 SDLoc DL(N);
18015 MVT HalfVT;
18016 unsigned ExtConst;
18017 if (VT == MVT::v4i32) {
18018 HalfVT = MVT::v8i16;
18019 ExtConst = 0x0000FFFF;
18020 } else { //if (VT == MVT::v8i16)
18021 HalfVT = MVT::v16i8;
18022 ExtConst = 0x00FF;
18023 }
18024
18025 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18026 // an AND. That extend will hopefully be removed if only the bottom bits are
18027 // demanded (though a truncating store, for example).
18028 SDValue VQMOVN =
18029 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18030 DAG.getConstant(0, DL, MVT::i32));
18031 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18032 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18033 DAG.getConstant(ExtConst, DL, VT));
18034 }
18035
18036 return SDValue();
18037}
18039
// Returns a pointer to the node's APInt value when it is a power-of-two
// constant, otherwise nullptr.
// NOTE(review): the signature and the dyn_cast<ConstantSDNode> defining C
// (original lines 18040-18041) are not visible in this extract.
18042 if (!C)
18043 return nullptr;
18044 const APInt *CV = &C->getAPIntValue();
18045 return CV->isPowerOf2() ? CV : nullptr;
18046}
18047
// PerformCMOVToBFICombine: fold a CMOV/OR/AND pattern into BFI instructions.
// NOTE(review): the function signature (orig. line 18048) is missing from
// this extraction; it presumably takes the CMOV node and a SelectionDAG& --
// confirm against the upstream source.
18049 // If we have a CMOV, OR and AND combination such as:
18050 // if (x & CN)
18051 // y |= CM;
18052 //
18053 // And:
18054 // * CN is a single bit;
18055 // * All bits covered by CM are known zero in y
18056 //
18057 // Then we can convert this into a sequence of BFI instructions. This will
18058 // always be a win if CM is a single bit, will always be no worse than the
18059 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18060 // three bits (due to the extra IT instruction).
18061
// Operands of the CMOV: false value, true value, condition code, flags.
18062 SDValue Op0 = CMOV->getOperand(0);
18063 SDValue Op1 = CMOV->getOperand(1);
18064 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18065 SDValue CmpZ = CMOV->getOperand(3);
18066
18067 // The compare must be against zero.
18068 if (!isNullConstant(CmpZ->getOperand(1)))
18069 return SDValue();
18070
18071 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
// The value being compared must be an AND with a single-bit constant
// (the "x & CN" test from the pattern above).
18072 SDValue And = CmpZ->getOperand(0);
18073 if (And->getOpcode() != ISD::AND)
18074 return SDValue();
18075 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18076 if (!AndC)
18077 return SDValue();
18078 SDValue X = And->getOperand(0);
18079
18080 if (CC == ARMCC::EQ) {
18081 // We're performing an "equal to zero" compare. Swap the operands so we
18082 // canonicalize on a "not equal to zero" compare.
18083 std::swap(Op0, Op1);
18084 } else {
18085 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18086 }
18087
18088 if (Op1->getOpcode() != ISD::OR)
18089 return SDValue();
18090
// NOTE(review): the line defining OrC (orig. 18091) is missing from this
// extraction; it presumably dyn_casts Op1->getOperand(1) to ConstantSDNode.
18092 if (!OrC)
18093 return SDValue();
18094 SDValue Y = Op1->getOperand(0);
18095
// The OR must be "y |= CM" with the same y the CMOV would otherwise select.
18096 if (Op0 != Y)
18097 return SDValue();
18098
18099 // Now, is it profitable to continue?
18100 APInt OrCI = OrC->getAPIntValue();
// One BFI per set bit in CM; Thumb tolerates one more due to the IT cost of
// the alternative sequence (see the header comment).
18101 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18102 if (OrCI.popcount() > Heuristic)
18103 return SDValue();
18104
18105 // Lastly, can we determine that the bits defined by OrCI
18106 // are zero in Y?
18107 KnownBits Known = DAG.computeKnownBits(Y);
18108 if ((OrCI & Known.Zero) != OrCI)
18109 return SDValue();
18110
18111 // OK, we can do the combine.
18112 SDValue V = Y;
18113 SDLoc dl(X);
18114 EVT VT = X.getValueType();
18115 unsigned BitInX = AndC->logBase2();
18116
18117 if (BitInX != 0) {
18118 // We must shift X first.
18119 X = DAG.getNode(ISD::SRL, dl, VT, X,
18120 DAG.getConstant(BitInX, dl, VT));
18121 }
18122
// Emit one BFI per set bit of CM, inserting bit 0 of (shifted) X into Y at
// that bit position.
18123 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18124 BitInY < NumActiveBits; ++BitInY) {
18125 if (OrCI[BitInY] == 0)
18126 continue;
18127 APInt Mask(VT.getSizeInBits(), 0);
18128 Mask.setBit(BitInY);
18129 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18130 // Confusingly, the operand is an *inverted* mask.
18131 DAG.getConstant(~Mask, dl, VT));
18132 }
18133
18134 return V;
18135}
18136
18137// Given N, the value controlling the conditional branch, search for the loop
18138// intrinsic, returning it, along with how the value is used. We need to handle
18139// patterns such as the following:
18140// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18141// (brcond (setcc (loop.decrement), 0, eq), exit)
18142// (brcond (setcc (loop.decrement), 0, ne), header)
// NOTE(review): the signature line (orig. 18143) is missing from this
// extraction. From the recursive calls below it takes (SDValue N,
// ISD::CondCode &CC, int &Imm, bool &Negate); CC/Imm/Negate are out-params
// describing how the intrinsic's result is consumed -- confirm upstream.
18144 bool &Negate) {
18145 switch (N->getOpcode()) {
18146 default:
18147 break;
// XOR with constant 1 flips the logical sense; record it and recurse.
18148 case ISD::XOR: {
18149 if (!isa<ConstantSDNode>(N.getOperand(1)))
18150 return SDValue();
18151 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18152 return SDValue();
18153 Negate = !Negate;
18154 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18155 }
// SETCC against constant 0 or 1: capture the immediate and condition, then
// keep searching through the compared value.
18156 case ISD::SETCC: {
18157 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18158 if (!Const)
18159 return SDValue();
18160 if (Const->isZero())
18161 Imm = 0;
18162 else if (Const->isOne())
18163 Imm = 1;
18164 else
18165 return SDValue();
18166 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18167 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18168 }
// NOTE(review): the "case ISD::INTRINSIC_W_CHAIN: {" line (orig. 18169) is
// missing from this extraction; the code below matches the hardware-loop
// intrinsics themselves.
18170 unsigned IntOp = N.getConstantOperandVal(1);
18171 if (IntOp != Intrinsic::test_start_loop_iterations &&
18172 IntOp != Intrinsic::loop_decrement_reg)
18173 return SDValue();
18174 return N;
18175 }
18176 }
18177 return SDValue();
18178}
18179
// PerformHWLoopCombine: rewrite BRCOND/BR_CC nodes that test the hardware
// loop intrinsics into the ARM low-overhead-loop nodes WLS/WLSSETUP (loop
// entry) and LOOP_DEC/LE (loop back-edge).
// NOTE(review): the signature lines (orig. 18180-18181) are missing from this
// extraction; it presumably takes (SDNode *N, DAGCombinerInfo &DCI, ...) in
// addition to the visible ST parameter -- confirm upstream.
18182 const ARMSubtarget *ST) {
18183
18184 // The hwloop intrinsics that we're interested are used for control-flow,
18185 // either for entering or exiting the loop:
18186 // - test.start.loop.iterations will test whether its operand is zero. If it
18187 // is zero, the proceeding branch should not enter the loop.
18188 // - loop.decrement.reg also tests whether its operand is zero. If it is
18189 // zero, the proceeding branch should not branch back to the beginning of
18190 // the loop.
18191 // So here, we need to check that how the brcond is using the result of each
18192 // of the intrinsics to ensure that we're branching to the right place at the
18193 // right time.
18194
18195 ISD::CondCode CC;
18196 SDValue Cond;
18197 int Imm = 1;
18198 bool Negate = false;
18199 SDValue Chain = N->getOperand(0);
18200 SDValue Dest;
18201
// Normalize BRCOND and BR_CC into (Cond, CC, Imm, Dest) form.
18202 if (N->getOpcode() == ISD::BRCOND) {
18203 CC = ISD::SETEQ;
18204 Cond = N->getOperand(1);
18205 Dest = N->getOperand(2);
18206 } else {
18207 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18208 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18209 Cond = N->getOperand(2);
18210 Dest = N->getOperand(4);
18211 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18212 if (!Const->isOne() && !Const->isZero())
18213 return SDValue();
18214 Imm = Const->getZExtValue();
18215 } else
18216 return SDValue();
18217 }
18218
18219 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18220 if (!Int)
18221 return SDValue();
18222
18223 if (Negate)
18224 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18225
// Classify the (CC, Imm) pair: does the branch fire when the counter is
// zero, or when it is non-zero?
18226 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18227 return (CC == ISD::SETEQ && Imm == 0) ||
18228 (CC == ISD::SETNE && Imm == 1) ||
18229 (CC == ISD::SETLT && Imm == 1) ||
18230 (CC == ISD::SETULT && Imm == 1);
18231 };
18232
18233 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18234 return (CC == ISD::SETEQ && Imm == 1) ||
18235 (CC == ISD::SETNE && Imm == 0) ||
18236 (CC == ISD::SETGT && Imm == 0) ||
18237 (CC == ISD::SETUGT && Imm == 0) ||
18238 (CC == ISD::SETGE && Imm == 1) ||
18239 (CC == ISD::SETUGE && Imm == 1);
18240 };
18241
18242 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18243 "unsupported condition");
18244
18245 SDLoc dl(Int);
18246 SelectionDAG &DAG = DCI.DAG;
18247 SDValue Elements = Int.getOperand(2);
18248 unsigned IntOp = Int->getConstantOperandVal(1);
// The conditional branch is expected to be followed by exactly one
// unconditional BR whose target is the "other" destination.
18249 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18250 "expected single br user");
18251 SDNode *Br = *N->user_begin();
18252 SDValue OtherTarget = Br->getOperand(1);
18253
18254 // Update the unconditional branch to branch to the given Dest.
18255 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18256 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18257 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18258 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18259 };
18260
18261 if (IntOp == Intrinsic::test_start_loop_iterations) {
18262 SDValue Res;
18263 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18264 // We expect this 'instruction' to branch when the counter is zero.
18265 if (IsTrueIfZero(CC, Imm)) {
18266 SDValue Ops[] = {Chain, Setup, Dest};
18267 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18268 } else {
18269 // The logic is the reverse of what we need for WLS, so find the other
18270 // basic block target: the target of the proceeding br.
18271 UpdateUncondBr(Br, Dest, DAG);
18272
18273 SDValue Ops[] = {Chain, Setup, OtherTarget};
18274 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18275 }
18276 // Update LR count to the new value
18277 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18278 // Update chain
18279 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18280 return Res;
18281 } else {
// loop.decrement.reg: lower to LOOP_DEC + LE (loop-end) pair.
18282 SDValue Size =
18283 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18284 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18285 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18286 DAG.getVTList(MVT::i32, MVT::Other), Args);
18287 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18288
18289 // We expect this instruction to branch when the count is not zero.
18290 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18291
18292 // Update the unconditional branch to target the loop preheader if we've
18293 // found the condition has been reversed.
18294 if (Target == OtherTarget)
18295 UpdateUncondBr(Br, Dest, DAG);
18296
18297 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18298 SDValue(LoopDec.getNode(), 1), Chain);
18299
18300 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18301 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18302 }
18303 return SDValue();
18304}
18305
18306/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
// NOTE(review): the rest of the signature (orig. line 18308, presumably
// "ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG)
// const {") is missing from this extraction -- confirm upstream.
18307SDValue
18309 SDValue Cmp = N->getOperand(3);
18310 if (Cmp.getOpcode() != ARMISD::CMPZ)
18311 // Only looking at NE cases.
18312 return SDValue();
18313
18314 SDLoc dl(N);
18315 SDValue LHS = Cmp.getOperand(0);
18316 SDValue RHS = Cmp.getOperand(1);
18317 SDValue Chain = N->getOperand(0);
18318 SDValue BB = N->getOperand(1);
18319 SDValue ARMcc = N->getOperand(2);
// NOTE(review): the definition of CC (orig. line 18320, presumably reading
// the ARMCC condition code out of ARMcc) is missing from this extraction.
18321
18322 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18323 // -> (brcond Chain BB CC Flags)
// The AND/CMOV pair materializes a boolean that is immediately re-tested;
// branch directly on the original condition/flags instead.
18324 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18325 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18326 LHS->getOperand(0)->hasOneUse() &&
18327 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18328 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18329 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18330 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18331 LHS->getOperand(0)->getOperand(2),
18332 LHS->getOperand(0)->getOperand(3));
18333 }
18334
18335 return SDValue();
18336}
18337
18338/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
// NOTE(review): the rest of the signature (orig. line 18340, presumably
// "ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG)
// const {") is missing from this extraction -- confirm upstream.
18339SDValue
18341 SDValue Cmp = N->getOperand(3);
18342 if (Cmp.getOpcode() != ARMISD::CMPZ)
18343 // Only looking at EQ and NE cases.
18344 return SDValue();
18345
18346 EVT VT = N->getValueType(0);
18347 SDLoc dl(N);
18348 SDValue LHS = Cmp.getOperand(0);
18349 SDValue RHS = Cmp.getOperand(1);
18350 SDValue FalseVal = N->getOperand(0);
18351 SDValue TrueVal = N->getOperand(1);
18352 SDValue ARMcc = N->getOperand(2);
// NOTE(review): the definition of CC (orig. line 18353, presumably reading
// the ARMCC condition code out of ARMcc) is missing from this extraction.
18354
18355 // BFI is only available on V6T2+.
18356 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
// NOTE(review): the call producing R (orig. line 18357, presumably
// "SDValue R = PerformCMOVToBFICombine(N, DAG);") is missing from this
// extraction.
18358 if (R)
18359 return R;
18360 }
18361
18362 // Simplify
18363 // mov r1, r0
18364 // cmp r1, x
18365 // mov r0, y
18366 // moveq r0, x
18367 // to
18368 // cmp r0, x
18369 // movne r0, y
18370 //
18371 // mov r1, r0
18372 // cmp r1, x
18373 // mov r0, x
18374 // movne r0, y
18375 // to
18376 // cmp r0, x
18377 // movne r0, y
18378 /// FIXME: Turn this into a target neutral optimization?
18379 SDValue Res;
18380 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18381 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18382 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18383 SDValue ARMcc;
18384 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18385 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18386 }
18387
18388 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18389 // -> (cmov F T CC Flags)
18390 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18391 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18392 isNullConstant(RHS)) {
18393 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18394 LHS->getOperand(2), LHS->getOperand(3));
18395 }
18396
18397 if (!VT.isInteger())
18398 return SDValue();
18399
18400 // Fold away an unnecessary CMPZ/CMOV
18401 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18402 // if C1==EQ -> CMOV A, B, C2, D
18403 // if C1==NE -> CMOV A, B, NOT(C2), D
18404 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18405 N->getConstantOperandVal(2) == ARMCC::NE) {
// NOTE(review): the declaration of Cond (orig. line 18406, presumably
// "ARMCC::CondCodes Cond;") is missing from this extraction.
18407 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18408 if (N->getConstantOperandVal(2) == ARMCC::NE)
// NOTE(review): the condition inversion (orig. line 18409, presumably
// "Cond = ARMCC::getOppositeCondition(Cond);") is missing from this
// extraction.
18410 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18411 N->getOperand(1),
18412 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18413 }
18414 }
18415
18416 // Materialize a boolean comparison for integers so we can avoid branching.
18417 if (isNullConstant(FalseVal)) {
18418 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18419 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18420 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18421 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18422 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18423 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18424 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18425 DAG.getConstant(5, dl, MVT::i32));
18426 } else {
18427 // CMOV 0, 1, ==, (CMPZ x, y) ->
18428 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18429 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18430 //
18431 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18432 // x != y. In other words, a carry C == 1 when x == y, C == 0
18433 // otherwise.
18434 // The final UADDO_CARRY computes
18435 // x - y + (0 - (x - y)) + C == C
18436 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18437 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18438 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18439 // ISD::USUBO_CARRY returns a borrow but we want the carry here
18440 // actually.
18441 SDValue Carry =
18442 DAG.getNode(ISD::SUB, dl, MVT::i32,
18443 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18444 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18445 }
18446 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18447 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18448 // This seems pointless but will allow us to combine it further below.
18449 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18450 SDValue Sub =
18451 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18452 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18453 Sub.getValue(1));
18454 FalseVal = Sub;
18455 }
18456 } else if (isNullConstant(TrueVal)) {
18457 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18458 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18459 // This seems pointless but will allow us to combine it further below
18460 // Note that we change == for != as this is the dual for the case above.
18461 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18462 SDValue Sub =
18463 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18464 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18465 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18466 Sub.getValue(1));
18467 FalseVal = Sub;
18468 }
18469 }
18470
18471 // On Thumb1, the DAG above may be further combined if z is a power of 2
18472 // (z == 2 ^ K).
18473 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18474 // t1 = (USUBO (SUB x, y), 1)
18475 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18476 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18477 //
18478 // This also handles the special case of comparing against zero; it's
18479 // essentially, the same pattern, except there's no SUBC:
18480 // CMOV x, z, !=, (CMPZ x, 0) ->
18481 // t1 = (USUBO x, 1)
18482 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18483 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18484 const APInt *TrueConst;
18485 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18486 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18487 FalseVal.getOperand(1) == RHS) ||
18488 (FalseVal == LHS && isNullConstant(RHS))) &&
18489 (TrueConst = isPowerOf2Constant(TrueVal))) {
18490 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18491 unsigned ShiftAmount = TrueConst->logBase2();
18492 if (ShiftAmount)
18493 TrueVal = DAG.getConstant(1, dl, VT);
18494 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18495 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18496 Subc.getValue(1));
18497
18498 if (ShiftAmount)
18499 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18500 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18501 }
18502
18503 if (Res.getNode()) {
18504 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18505 // Capture demanded bits information that would be otherwise lost.
18506 if (Known.Zero == 0xfffffffe)
18507 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18508 DAG.getValueType(MVT::i1));
18509 else if (Known.Zero == 0xffffff00)
18510 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18511 DAG.getValueType(MVT::i8));
18512 else if (Known.Zero == 0xffff0000)
18513 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18514 DAG.getValueType(MVT::i16));
18515 }
18516
18517 return Res;
18518}
18519
// PerformBITCASTCombine: target-specific combines for ISD::BITCAST of
// vectors (VDUP propagation, VECTOR_REG_CAST folding, big-endian VMOV
// immediates, and extract-to-VMOVRRD).
// NOTE(review): the signature lines (orig. 18520-18521) are missing from
// this extraction; it presumably takes (SDNode *N, DAGCombinerInfo &DCI, ...)
// in addition to the visible ST parameter -- confirm upstream.
18522 const ARMSubtarget *ST) {
18523 SelectionDAG &DAG = DCI.DAG;
18524 SDValue Src = N->getOperand(0);
18525 EVT DstVT = N->getValueType(0);
18526
18527 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18528 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18529 EVT SrcVT = Src.getValueType();
18530 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18531 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18532 }
18533
18534 // We may have a bitcast of something that has already had this bitcast
18535 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18536 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18537 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18538 Src.getValueType().getScalarSizeInBits())
18539 Src = Src.getOperand(0);
18540
18541 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18542 // would be generated is at least the width of the element type.
18543 EVT SrcVT = Src.getValueType();
18544 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18545 Src.getOpcode() == ARMISD::VMVNIMM ||
18546 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18547 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18548 DAG.getDataLayout().isBigEndian())
18549 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18550
18551 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18552 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18553 return R;
18554
18555 return SDValue();
18556}
18557
18558// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18559// node into stack operations after legalizeOps.
// NOTE(review): the signature lines (orig. 18560-18561, presumably
// "static SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) {")
// are missing from this extraction -- confirm upstream.
18562 SelectionDAG &DAG = DCI.DAG;
18563 EVT VT = N->getValueType(0);
18564 SDLoc DL(N);
18565
18566 // MVETrunc(Undef, Undef) -> Undef
18567 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18568 return DAG.getUNDEF(VT);
18569
18570 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18571 if (N->getNumOperands() == 2 &&
18572 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18573 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18574 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18575 N->getOperand(0).getOperand(1),
18576 N->getOperand(1).getOperand(0),
18577 N->getOperand(1).getOperand(1));
18578
18579 // MVETrunc(shuffle, shuffle) -> VMOVN
18580 if (N->getNumOperands() == 2 &&
18581 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18582 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18583 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18584 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18585
// Both shuffles must read from the same pair of sources for the combined
// mask to describe a single VMOVN.
18586 if (S0->getOperand(0) == S1->getOperand(0) &&
18587 S0->getOperand(1) == S1->getOperand(1)) {
18588 // Construct complete shuffle mask
18589 SmallVector<int, 8> Mask(S0->getMask());
18590 Mask.append(S1->getMask().begin(), S1->getMask().end());
18591
18592 if (isVMOVNTruncMask(Mask, VT, false))
18593 return DAG.getNode(
18594 ARMISD::VMOVN, DL, VT,
18595 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18596 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18597 DAG.getConstant(1, DL, MVT::i32));
18598 if (isVMOVNTruncMask(Mask, VT, true))
18599 return DAG.getNode(
18600 ARMISD::VMOVN, DL, VT,
18601 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18602 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18603 DAG.getConstant(1, DL, MVT::i32));
18604 }
18605 }
18606
18607 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18608 // truncate to a buildvector to allow the generic optimisations to kick in.
18609 if (all_of(N->ops(), [](SDValue Op) {
18610 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18611 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18612 (Op.getOpcode() == ISD::BITCAST &&
18613 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18614 })) {
18615 SmallVector<SDValue, 8> Extracts;
18616 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18617 SDValue O = N->getOperand(Op);
18618 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18619 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18620 DAG.getConstant(i, DL, MVT::i32));
18621 Extracts.push_back(Ext);
18622 }
18623 }
18624 return DAG.getBuildVector(VT, DL, Extracts);
18625 }
18626
18627 // If we are late in the legalization process and nothing has optimised
18628 // the trunc to anything better, lower it to a stack store and reload,
18629 // performing the truncation whilst keeping the lanes in the correct order:
18630 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18631 if (!DCI.isAfterLegalizeDAG())
18632 return SDValue();
18633
18634 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18635 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18636 int NumIns = N->getNumOperands();
18637 assert((NumIns == 2 || NumIns == 4) &&
18638 "Expected 2 or 4 inputs to an MVETrunc");
18639 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18640 if (N->getNumOperands() == 4)
18641 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18642
// Truncating-store each input at its slice of the 16-byte slot, then reload
// the whole slot as one vector.
18643 SmallVector<SDValue> Chains;
18644 for (int I = 0; I < NumIns; I++) {
18645 SDValue Ptr = DAG.getNode(
18646 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18647 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
// NOTE(review): the start of the MPI definition (orig. line 18648,
// presumably "MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(")
// is missing from this extraction.
18649 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18650 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18651 Ptr, MPI, StoreVT, Align(4));
18652 Chains.push_back(Ch);
18653 }
18654
18655 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
// NOTE(review): the initializer line (orig. 18657, presumably
// "MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);")
// is missing from this extraction.
18656 MachinePointerInfo MPI =
18658 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18659}
18660
18661// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
// NOTE(review): the signature line (orig. 18662, presumably
// "static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,") is
// missing from this extraction -- confirm upstream.
18663 SelectionDAG &DAG) {
18664 SDValue N0 = N->getOperand(0);
// NOTE(review): the definition of LD (orig. line 18665, presumably
// "LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());") is missing from
// this extraction.
18666 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18667 return SDValue();
18668
18669 EVT FromVT = LD->getMemoryVT();
18670 EVT ToVT = N->getValueType(0);
18671 if (!ToVT.isVector())
18672 return SDValue();
18673 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18674 EVT ToEltVT = ToVT.getVectorElementType();
18675 EVT FromEltVT = FromVT.getVectorElementType();
18676
// Per-part element count for the split loads, keyed off the extend widths.
18677 unsigned NumElements = 0;
18678 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18679 NumElements = 4;
18680 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18681 NumElements = 8;
18682 assert(NumElements != 0);
18683
// The new loads extend in the same direction as the MVEEXT node; an
// existing incompatible extension on the load blocks the combine.
18684 ISD::LoadExtType NewExtType =
18685 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18686 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18687 LD->getExtensionType() != ISD::EXTLOAD &&
18688 LD->getExtensionType() != NewExtType)
18689 return SDValue();
18690
18691 LLVMContext &C = *DAG.getContext();
18692 SDLoc DL(LD);
18693 // Details about the old load
18694 SDValue Ch = LD->getChain();
18695 SDValue BasePtr = LD->getBasePtr();
18696 Align Alignment = LD->getBaseAlign();
18697 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18698 AAMDNodes AAInfo = LD->getAAInfo();
18699
18700 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18701 EVT NewFromVT = EVT::getVectorVT(
18702 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18703 EVT NewToVT = EVT::getVectorVT(
18704 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18705
// NOTE(review): the SmallVector declarations for Loads and Chains (orig.
// lines 18706-18707) are missing from this extraction.
18708 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18709 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18710 SDValue NewPtr =
18711 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18712
18713 SDValue NewLoad =
18714 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18715 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18716 Alignment, MMOFlags, AAInfo);
18717 Loads.push_back(NewLoad);
18718 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18719 }
18720
// Re-join the chains and redirect users of the old load's chain result.
18721 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18722 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18723 return DAG.getMergeValues(Loads, DL);
18724}
18725
18726// Perform combines for MVEEXT. If it has not been optimized to anything better
18727// before lowering, it gets converted to stack store and extloads performing the
18728// extend whilst still keeping the same lane ordering.
// NOTE(review): the signature lines (orig. 18729-18730, presumably
// "static SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) {")
// are missing from this extraction -- confirm upstream.
18731 SelectionDAG &DAG = DCI.DAG;
18732 EVT VT = N->getValueType(0);
18733 SDLoc DL(N);
18734 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18735 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18736
18737 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18738 *DAG.getContext());
// Helper: reinterpret V as VT, then sign- or zero-extend in-register
// depending on whether this node is MVESEXT or MVEZEXT.
18739 auto Extend = [&](SDValue V) {
18740 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18741 return N->getOpcode() == ARMISD::MVESEXT
18742 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18743 DAG.getValueType(ExtVT))
18744 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18745 };
18746
18747 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18748 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18749 SDValue Ext = Extend(N->getOperand(0));
18750 return DAG.getMergeValues({Ext, Ext}, DL);
18751 }
18752
18753 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18754 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18755 ArrayRef<int> Mask = SVN->getMask();
18756 assert(Mask.size() == 2 * VT.getVectorNumElements());
18757 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18758 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18759 SDValue Op0 = SVN->getOperand(0);
18760 SDValue Op1 = SVN->getOperand(1);
18761
// True if the mask half starting at Start selects every second lane
// (stride 2, starting at Offset); -1 (undef) lanes match anything.
18762 auto CheckInregMask = [&](int Start, int Offset) {
18763 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18764 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18765 return false;
18766 return true;
18767 };
// V0/V1 default to the node's own results, meaning "no better form found".
18768 SDValue V0 = SDValue(N, 0);
18769 SDValue V1 = SDValue(N, 1);
18770 if (CheckInregMask(0, 0))
18771 V0 = Extend(Op0);
18772 else if (CheckInregMask(0, 1))
18773 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18774 else if (CheckInregMask(0, Mask.size()))
18775 V0 = Extend(Op1);
18776 else if (CheckInregMask(0, Mask.size() + 1))
18777 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18778
18779 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18780 V1 = Extend(Op1);
18781 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18782 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18783 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18784 V1 = Extend(Op0);
18785 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18786 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18787
18788 if (V0.getNode() != N || V1.getNode() != N)
18789 return DAG.getMergeValues({V0, V1}, DL);
18790 }
18791
18792 // MVEEXT(load) -> extload, extload
18793 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
// NOTE(review): the guarded call (orig. line 18794, presumably
// "if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))") is
// missing from this extraction.
18795 return L;
18796
18797 if (!DCI.isAfterLegalizeDAG())
18798 return SDValue();
18799
18800 // Lower to a stack store and reload:
18801 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18802 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18803 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18804 int NumOuts = N->getNumValues();
18805 assert((NumOuts == 2 || NumOuts == 4) &&
18806 "Expected 2 or 4 outputs to an MVEEXT");
18807 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18808 *DAG.getContext());
18809 if (N->getNumOperands() == 4)
18810 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18811
// NOTE(review): the initializer (orig. line 18813, presumably
// "MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);")
// is missing from this extraction.
18812 MachinePointerInfo MPI =
18814 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18815 StackPtr, MPI, Align(4));
18816
// NOTE(review): the declaration of Loads (orig. line 18817, presumably
// "SmallVector<SDValue> Loads;") is missing from this extraction.
18818 for (int I = 0; I < NumOuts; I++) {
18819 SDValue Ptr = DAG.getNode(
18820 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18821 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
// NOTE(review): the start of this MPI definition (orig. line 18822,
// presumably "MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(")
// is missing from this extraction.
18823 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18824 SDValue Load = DAG.getExtLoad(
18825 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18826 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18827 Loads.push_back(Load);
18828 }
18829
18830 return DAG.getMergeValues(Loads, DL);
18831}
18832
18834 DAGCombinerInfo &DCI) const {
18835 switch (N->getOpcode()) {
18836 default: break;
18837 case ISD::SELECT_CC:
18838 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18839 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18840 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18841 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18842 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18843 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18844 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18845 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18846 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18847 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18848 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18849 case ISD::BRCOND:
18850 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18851 case ARMISD::ADDC:
18852 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18853 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18854 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18855 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18856 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18857 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18858 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18859 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18860 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18863 return PerformExtractEltCombine(N, DCI, Subtarget);
18867 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18868 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18869 case ISD::FP_TO_SINT:
18870 case ISD::FP_TO_UINT:
18871 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18872 case ISD::FADD:
18873 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18874 case ISD::FMUL:
18875 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18877 return PerformIntrinsicCombine(N, DCI);
18878 case ISD::SHL:
18879 case ISD::SRA:
18880 case ISD::SRL:
18881 return PerformShiftCombine(N, DCI, Subtarget);
18882 case ISD::SIGN_EXTEND:
18883 case ISD::ZERO_EXTEND:
18884 case ISD::ANY_EXTEND:
18885 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18886 case ISD::FP_EXTEND:
18887 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18888 case ISD::SMIN:
18889 case ISD::UMIN:
18890 case ISD::SMAX:
18891 case ISD::UMAX:
18892 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18893 case ARMISD::CMOV:
18894 return PerformCMOVCombine(N, DCI.DAG);
18895 case ARMISD::BRCOND:
18896 return PerformBRCONDCombine(N, DCI.DAG);
18897 case ARMISD::CMPZ:
18898 return PerformCMPZCombine(N, DCI.DAG);
18899 case ARMISD::CSINC:
18900 case ARMISD::CSINV:
18901 case ARMISD::CSNEG:
18902 return PerformCSETCombine(N, DCI.DAG);
18903 case ISD::LOAD:
18904 return PerformLOADCombine(N, DCI, Subtarget);
18905 case ARMISD::VLD1DUP:
18906 case ARMISD::VLD2DUP:
18907 case ARMISD::VLD3DUP:
18908 case ARMISD::VLD4DUP:
18909 return PerformVLDCombine(N, DCI);
18911 return PerformARMBUILD_VECTORCombine(N, DCI);
18912 case ISD::BITCAST:
18913 return PerformBITCASTCombine(N, DCI, Subtarget);
18914 case ARMISD::PREDICATE_CAST:
18915 return PerformPREDICATE_CASTCombine(N, DCI);
18916 case ARMISD::VECTOR_REG_CAST:
18917 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18918 case ARMISD::MVETRUNC:
18919 return PerformMVETruncCombine(N, DCI);
18920 case ARMISD::MVESEXT:
18921 case ARMISD::MVEZEXT:
18922 return PerformMVEExtCombine(N, DCI);
18923 case ARMISD::VCMP:
18924 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18925 case ISD::VECREDUCE_ADD:
18926 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18927 case ARMISD::VADDVs:
18928 case ARMISD::VADDVu:
18929 case ARMISD::VADDLVs:
18930 case ARMISD::VADDLVu:
18931 case ARMISD::VADDLVAs:
18932 case ARMISD::VADDLVAu:
18933 case ARMISD::VMLAVs:
18934 case ARMISD::VMLAVu:
18935 case ARMISD::VMLALVs:
18936 case ARMISD::VMLALVu:
18937 case ARMISD::VMLALVAs:
18938 case ARMISD::VMLALVAu:
18939 return PerformReduceShuffleCombine(N, DCI.DAG);
18940 case ARMISD::VMOVN:
18941 return PerformVMOVNCombine(N, DCI);
18942 case ARMISD::VQMOVNs:
18943 case ARMISD::VQMOVNu:
18944 return PerformVQMOVNCombine(N, DCI);
18945 case ARMISD::VQDMULH:
18946 return PerformVQDMULHCombine(N, DCI);
18947 case ARMISD::ASRL:
18948 case ARMISD::LSRL:
18949 case ARMISD::LSLL:
18950 return PerformLongShiftCombine(N, DCI.DAG);
18951 case ARMISD::SMULWB: {
18952 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18953 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18954 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18955 return SDValue();
18956 break;
18957 }
18958 case ARMISD::SMULWT: {
18959 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18960 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18961 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18962 return SDValue();
18963 break;
18964 }
18965 case ARMISD::SMLALBB:
18966 case ARMISD::QADD16b:
18967 case ARMISD::QSUB16b:
18968 case ARMISD::UQADD16b:
18969 case ARMISD::UQSUB16b: {
18970 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18971 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18972 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18973 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18974 return SDValue();
18975 break;
18976 }
18977 case ARMISD::SMLALBT: {
18978 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
18979 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18980 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
18981 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18982 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
18983 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
18984 return SDValue();
18985 break;
18986 }
18987 case ARMISD::SMLALTB: {
18988 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
18989 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18990 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
18991 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18992 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
18993 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
18994 return SDValue();
18995 break;
18996 }
18997 case ARMISD::SMLALTT: {
18998 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18999 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19000 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19001 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19002 return SDValue();
19003 break;
19004 }
19005 case ARMISD::QADD8b:
19006 case ARMISD::QSUB8b:
19007 case ARMISD::UQADD8b:
19008 case ARMISD::UQSUB8b: {
19009 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19010 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19011 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19012 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19013 return SDValue();
19014 break;
19015 }
19016 case ARMISD::VBSP:
19017 if (N->getOperand(1) == N->getOperand(2))
19018 return N->getOperand(1);
19019 return SDValue();
19022 switch (N->getConstantOperandVal(1)) {
19023 case Intrinsic::arm_neon_vld1:
19024 case Intrinsic::arm_neon_vld1x2:
19025 case Intrinsic::arm_neon_vld1x3:
19026 case Intrinsic::arm_neon_vld1x4:
19027 case Intrinsic::arm_neon_vld2:
19028 case Intrinsic::arm_neon_vld3:
19029 case Intrinsic::arm_neon_vld4:
19030 case Intrinsic::arm_neon_vld2lane:
19031 case Intrinsic::arm_neon_vld3lane:
19032 case Intrinsic::arm_neon_vld4lane:
19033 case Intrinsic::arm_neon_vld2dup:
19034 case Intrinsic::arm_neon_vld3dup:
19035 case Intrinsic::arm_neon_vld4dup:
19036 case Intrinsic::arm_neon_vst1:
19037 case Intrinsic::arm_neon_vst1x2:
19038 case Intrinsic::arm_neon_vst1x3:
19039 case Intrinsic::arm_neon_vst1x4:
19040 case Intrinsic::arm_neon_vst2:
19041 case Intrinsic::arm_neon_vst3:
19042 case Intrinsic::arm_neon_vst4:
19043 case Intrinsic::arm_neon_vst2lane:
19044 case Intrinsic::arm_neon_vst3lane:
19045 case Intrinsic::arm_neon_vst4lane:
19046 return PerformVLDCombine(N, DCI);
19047 case Intrinsic::arm_mve_vld2q:
19048 case Intrinsic::arm_mve_vld4q:
19049 case Intrinsic::arm_mve_vst2q:
19050 case Intrinsic::arm_mve_vst4q:
19051 return PerformMVEVLDCombine(N, DCI);
19052 default: break;
19053 }
19054 break;
19055 }
19056 return SDValue();
19057}
19058
19060 EVT VT) const {
19061 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19062}
19063
19065 Align Alignment,
19067 unsigned *Fast) const {
19068 // Depends what it gets converted into if the type is weird.
19069 if (!VT.isSimple())
19070 return false;
19071
19072 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19073 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19074 auto Ty = VT.getSimpleVT().SimpleTy;
19075
19076 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19077 // Unaligned access can use (for example) LRDB, LRDH, LDR
19078 if (AllowsUnaligned) {
19079 if (Fast)
19080 *Fast = Subtarget->hasV7Ops();
19081 return true;
19082 }
19083 }
19084
19085 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19086 // For any little-endian targets with neon, we can support unaligned ld/st
19087 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19088 // A big-endian target may also explicitly support unaligned accesses
19089 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19090 if (Fast)
19091 *Fast = 1;
19092 return true;
19093 }
19094 }
19095
19096 if (!Subtarget->hasMVEIntegerOps())
19097 return false;
19098
19099 // These are for predicates
19100 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19101 Ty == MVT::v2i1)) {
19102 if (Fast)
19103 *Fast = 1;
19104 return true;
19105 }
19106
19107 // These are for truncated stores/narrowing loads. They are fine so long as
19108 // the alignment is at least the size of the item being loaded
19109 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19110 Alignment >= VT.getScalarSizeInBits() / 8) {
19111 if (Fast)
19112 *Fast = true;
19113 return true;
19114 }
19115
19116 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19117 // VSTRW.U32 all store the vector register in exactly the same format, and
19118 // differ only in the range of their immediate offset field and the required
19119 // alignment. So there is always a store that can be used, regardless of
19120 // actual type.
19121 //
19122 // For big endian, that is not the case. But can still emit a (VSTRB.U8;
19123 // VREV64.8) pair and get the same effect. This will likely be better than
19124 // aligning the vector through the stack.
19125 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19126 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19127 Ty == MVT::v2f64) {
19128 if (Fast)
19129 *Fast = 1;
19130 return true;
19131 }
19132
19133 return false;
19134}
19135
19137 LLVMContext &Context, const MemOp &Op,
19138 const AttributeList &FuncAttributes) const {
19139 // See if we can use NEON instructions for this...
19140 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19141 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19142 unsigned Fast;
19143 if (Op.size() >= 16 &&
19144 (Op.isAligned(Align(16)) ||
19145 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19147 Fast))) {
19148 return MVT::v2f64;
19149 } else if (Op.size() >= 8 &&
19150 (Op.isAligned(Align(8)) ||
19152 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19153 Fast))) {
19154 return MVT::f64;
19155 }
19156 }
19157
19158 // Let the target-independent logic figure it out.
19159 return MVT::Other;
19160}
19161
19162// 64-bit integers are split into their high and low parts and held in two
19163// different registers, so the trunc is free since the low register can just
19164// be used.
19165bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19166 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19167 return false;
19168 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19169 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19170 return (SrcBits == 64 && DestBits == 32);
19171}
19172
19174 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19175 !DstVT.isInteger())
19176 return false;
19177 unsigned SrcBits = SrcVT.getSizeInBits();
19178 unsigned DestBits = DstVT.getSizeInBits();
19179 return (SrcBits == 64 && DestBits == 32);
19180}
19181
19183 if (Val.getOpcode() != ISD::LOAD)
19184 return false;
19185
19186 EVT VT1 = Val.getValueType();
19187 if (!VT1.isSimple() || !VT1.isInteger() ||
19188 !VT2.isSimple() || !VT2.isInteger())
19189 return false;
19190
19191 switch (VT1.getSimpleVT().SimpleTy) {
19192 default: break;
19193 case MVT::i1:
19194 case MVT::i8:
19195 case MVT::i16:
19196 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19197 return true;
19198 }
19199
19200 return false;
19201}
19202
19204 if (!VT.isSimple())
19205 return false;
19206
19207 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19208 // negate values directly (fneg is free). So, we don't want to let the DAG
19209 // combiner rewrite fneg into xors and some other instructions. For f16 and
19210 // FullFP16 argument passing, some bitcast nodes may be introduced,
19211 // triggering this DAG combine rewrite, so we are avoiding that with this.
19212 switch (VT.getSimpleVT().SimpleTy) {
19213 default: break;
19214 case MVT::f16:
19215 return Subtarget->hasFullFP16();
19216 }
19217
19218 return false;
19219}
19220
19222 if (!Subtarget->hasMVEIntegerOps())
19223 return nullptr;
19224 Type *SVIType = SVI->getType();
19225 Type *ScalarType = SVIType->getScalarType();
19226
19227 if (ScalarType->isFloatTy())
19228 return Type::getInt32Ty(SVIType->getContext());
19229 if (ScalarType->isHalfTy())
19230 return Type::getInt16Ty(SVIType->getContext());
19231 return nullptr;
19232}
19233
19235 EVT VT = ExtVal.getValueType();
19236
19237 if (!isTypeLegal(VT))
19238 return false;
19239
19240 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19241 if (Ld->isExpandingLoad())
19242 return false;
19243 }
19244
19245 if (Subtarget->hasMVEIntegerOps())
19246 return true;
19247
19248 // Don't create a loadext if we can fold the extension into a wide/long
19249 // instruction.
19250 // If there's more than one user instruction, the loadext is desirable no
19251 // matter what. There can be two uses by the same instruction.
19252 if (ExtVal->use_empty() ||
19253 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19254 return true;
19255
19256 SDNode *U = *ExtVal->user_begin();
19257 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19258 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19259 return false;
19260
19261 return true;
19262}
19263
19265 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19266 return false;
19267
19268 if (!isTypeLegal(EVT::getEVT(Ty1)))
19269 return false;
19270
19271 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19272
19273 // Assuming the caller doesn't have a zeroext or signext return parameter,
19274 // truncation all the way down to i1 is valid.
19275 return true;
19276}
19277
19278/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19279/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19280/// expanded to FMAs when this method returns true, otherwise fmuladd is
19281/// expanded to fmul + fadd.
19282///
19283/// ARM supports both fused and unfused multiply-add operations; we already
19284/// lower a pair of fmul and fadd to the latter so it's not clear that there
19285/// would be a gain or that the gain would be worthwhile enough to risk
19286/// correctness bugs.
19287///
19288/// For MVE, we set this to true as it helps simplify the need for some
19289/// patterns (and we don't have the non-fused floating point instruction).
19290bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19291 EVT VT) const {
19292 if (Subtarget->useSoftFloat())
19293 return false;
19294
19295 if (!VT.isSimple())
19296 return false;
19297
19298 switch (VT.getSimpleVT().SimpleTy) {
19299 case MVT::v4f32:
19300 case MVT::v8f16:
19301 return Subtarget->hasMVEFloatOps();
19302 case MVT::f16:
19303 return Subtarget->useFPVFMx16();
19304 case MVT::f32:
19305 return Subtarget->useFPVFMx();
19306 case MVT::f64:
19307 return Subtarget->useFPVFMx64();
19308 default:
19309 break;
19310 }
19311
19312 return false;
19313}
19314
19315static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19316 if (V < 0)
19317 return false;
19318
19319 unsigned Scale = 1;
19320 switch (VT.getSimpleVT().SimpleTy) {
19321 case MVT::i1:
19322 case MVT::i8:
19323 // Scale == 1;
19324 break;
19325 case MVT::i16:
19326 // Scale == 2;
19327 Scale = 2;
19328 break;
19329 default:
19330 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19331 // Scale == 4;
19332 Scale = 4;
19333 break;
19334 }
19335
19336 if ((V & (Scale - 1)) != 0)
19337 return false;
19338 return isUInt<5>(V / Scale);
19339}
19340
19341static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19342 const ARMSubtarget *Subtarget) {
19343 if (!VT.isInteger() && !VT.isFloatingPoint())
19344 return false;
19345 if (VT.isVector() && Subtarget->hasNEON())
19346 return false;
19347 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19348 !Subtarget->hasMVEFloatOps())
19349 return false;
19350
19351 bool IsNeg = false;
19352 if (V < 0) {
19353 IsNeg = true;
19354 V = -V;
19355 }
19356
19357 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19358
19359 // MVE: size * imm7
19360 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19361 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19362 case MVT::i32:
19363 case MVT::f32:
19364 return isShiftedUInt<7,2>(V);
19365 case MVT::i16:
19366 case MVT::f16:
19367 return isShiftedUInt<7,1>(V);
19368 case MVT::i8:
19369 return isUInt<7>(V);
19370 default:
19371 return false;
19372 }
19373 }
19374
19375 // half VLDR: 2 * imm8
19376 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19377 return isShiftedUInt<8, 1>(V);
19378 // VLDR and LDRD: 4 * imm8
19379 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19380 return isShiftedUInt<8, 2>(V);
19381
19382 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19383 // + imm12 or - imm8
19384 if (IsNeg)
19385 return isUInt<8>(V);
19386 return isUInt<12>(V);
19387 }
19388
19389 return false;
19390}
19391
19392/// isLegalAddressImmediate - Return true if the integer value can be used
19393/// as the offset of the target addressing mode for load / store of the
19394/// given type.
19395static bool isLegalAddressImmediate(int64_t V, EVT VT,
19396 const ARMSubtarget *Subtarget) {
19397 if (V == 0)
19398 return true;
19399
19400 if (!VT.isSimple())
19401 return false;
19402
19403 if (Subtarget->isThumb1Only())
19404 return isLegalT1AddressImmediate(V, VT);
19405 else if (Subtarget->isThumb2())
19406 return isLegalT2AddressImmediate(V, VT, Subtarget);
19407
19408 // ARM mode.
19409 if (V < 0)
19410 V = - V;
19411 switch (VT.getSimpleVT().SimpleTy) {
19412 default: return false;
19413 case MVT::i1:
19414 case MVT::i8:
19415 case MVT::i32:
19416 // +- imm12
19417 return isUInt<12>(V);
19418 case MVT::i16:
19419 // +- imm8
19420 return isUInt<8>(V);
19421 case MVT::f32:
19422 case MVT::f64:
19423 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19424 return false;
19425 return isShiftedUInt<8, 2>(V);
19426 }
19427}
19428
19430 EVT VT) const {
19431 int Scale = AM.Scale;
19432 if (Scale < 0)
19433 return false;
19434
19435 switch (VT.getSimpleVT().SimpleTy) {
19436 default: return false;
19437 case MVT::i1:
19438 case MVT::i8:
19439 case MVT::i16:
19440 case MVT::i32:
19441 if (Scale == 1)
19442 return true;
19443 // r + r << imm
19444 Scale = Scale & ~1;
19445 return Scale == 2 || Scale == 4 || Scale == 8;
19446 case MVT::i64:
19447 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19448 // version in Thumb mode.
19449 // r + r
19450 if (Scale == 1)
19451 return true;
19452 // r * 2 (this can be lowered to r + r).
19453 if (!AM.HasBaseReg && Scale == 2)
19454 return true;
19455 return false;
19456 case MVT::isVoid:
19457 // Note, we allow "void" uses (basically, uses that aren't loads or
19458 // stores), because arm allows folding a scale into many arithmetic
19459 // operations. This should be made more precise and revisited later.
19460
19461 // Allow r << imm, but the imm has to be a multiple of two.
19462 if (Scale & 1) return false;
19463 return isPowerOf2_32(Scale);
19464 }
19465}
19466
19468 EVT VT) const {
19469 const int Scale = AM.Scale;
19470
19471 // Negative scales are not supported in Thumb1.
19472 if (Scale < 0)
19473 return false;
19474
19475 // Thumb1 addressing modes do not support register scaling excepting the
19476 // following cases:
19477 // 1. Scale == 1 means no scaling.
19478 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19479 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19480}
19481
19482/// isLegalAddressingMode - Return true if the addressing mode represented
19483/// by AM is legal for this target, for a load/store of the specified type.
19485 const AddrMode &AM, Type *Ty,
19486 unsigned AS, Instruction *I) const {
19487 EVT VT = getValueType(DL, Ty, true);
19488 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19489 return false;
19490
19491 // Can never fold addr of global into load/store.
19492 if (AM.BaseGV)
19493 return false;
19494
19495 switch (AM.Scale) {
19496 case 0: // no scale reg, must be "r+i" or "r", or "i".
19497 break;
19498 default:
19499 // ARM doesn't support any R+R*scale+imm addr modes.
19500 if (AM.BaseOffs)
19501 return false;
19502
19503 if (!VT.isSimple())
19504 return false;
19505
19506 if (Subtarget->isThumb1Only())
19507 return isLegalT1ScaledAddressingMode(AM, VT);
19508
19509 if (Subtarget->isThumb2())
19510 return isLegalT2ScaledAddressingMode(AM, VT);
19511
19512 int Scale = AM.Scale;
19513 switch (VT.getSimpleVT().SimpleTy) {
19514 default: return false;
19515 case MVT::i1:
19516 case MVT::i8:
19517 case MVT::i32:
19518 if (Scale < 0) Scale = -Scale;
19519 if (Scale == 1)
19520 return true;
19521 // r + r << imm
19522 return isPowerOf2_32(Scale & ~1);
19523 case MVT::i16:
19524 case MVT::i64:
19525 // r +/- r
19526 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19527 return true;
19528 // r * 2 (this can be lowered to r + r).
19529 if (!AM.HasBaseReg && Scale == 2)
19530 return true;
19531 return false;
19532
19533 case MVT::isVoid:
19534 // Note, we allow "void" uses (basically, uses that aren't loads or
19535 // stores), because arm allows folding a scale into many arithmetic
19536 // operations. This should be made more precise and revisited later.
19537
19538 // Allow r << imm, but the imm has to be a multiple of two.
19539 if (Scale & 1) return false;
19540 return isPowerOf2_32(Scale);
19541 }
19542 }
19543 return true;
19544}
19545
19546/// isLegalICmpImmediate - Return true if the specified immediate is legal
19547/// icmp immediate, that is the target has icmp instructions which can compare
19548/// a register against the immediate without having to materialize the
19549/// immediate into a register.
19551 // Thumb2 and ARM modes can use cmn for negative immediates.
19552 if (!Subtarget->isThumb())
19553 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19554 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19555 if (Subtarget->isThumb2())
19556 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19557 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19558 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19559 return Imm >= 0 && Imm <= 255;
19560}
19561
19562/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19563/// *or sub* immediate, that is the target has add or sub instructions which can
19564/// add a register with the immediate without having to materialize the
19565/// immediate into a register.
19567 // Same encoding for add/sub, just flip the sign.
19568 uint64_t AbsImm = AbsoluteValue(Imm);
19569 if (!Subtarget->isThumb())
19570 return ARM_AM::getSOImmVal(AbsImm) != -1;
19571 if (Subtarget->isThumb2())
19572 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19573 // Thumb1 only has 8-bit unsigned immediate.
19574 return AbsImm <= 255;
19575}
19576
19577// Return false to prevent folding
19578// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19579// if the folding leads to worse code.
19581 SDValue ConstNode) const {
19582 // Let the DAGCombiner decide for vector types and large types.
19583 const EVT VT = AddNode.getValueType();
19584 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19585 return true;
19586
19587 // It is worse if c0 is legal add immediate, while c1*c0 is not
19588 // and has to be composed by at least two instructions.
19589 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19590 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19591 const int64_t C0 = C0Node->getSExtValue();
19592 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19594 return true;
19595 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19596 return false;
19597
19598 // Default to true and let the DAGCombiner decide.
19599 return true;
19600}
19601
19603 bool isSEXTLoad, SDValue &Base,
19604 SDValue &Offset, bool &isInc,
19605 SelectionDAG &DAG) {
19606 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19607 return false;
19608
19609 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19610 // AddressingMode 3
19611 Base = Ptr->getOperand(0);
19613 int RHSC = (int)RHS->getZExtValue();
19614 if (RHSC < 0 && RHSC > -256) {
19615 assert(Ptr->getOpcode() == ISD::ADD);
19616 isInc = false;
19617 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19618 return true;
19619 }
19620 }
19621 isInc = (Ptr->getOpcode() == ISD::ADD);
19622 Offset = Ptr->getOperand(1);
19623 return true;
19624 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19625 // AddressingMode 2
19627 int RHSC = (int)RHS->getZExtValue();
19628 if (RHSC < 0 && RHSC > -0x1000) {
19629 assert(Ptr->getOpcode() == ISD::ADD);
19630 isInc = false;
19631 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19632 Base = Ptr->getOperand(0);
19633 return true;
19634 }
19635 }
19636
19637 if (Ptr->getOpcode() == ISD::ADD) {
19638 isInc = true;
19639 ARM_AM::ShiftOpc ShOpcVal=
19641 if (ShOpcVal != ARM_AM::no_shift) {
19642 Base = Ptr->getOperand(1);
19643 Offset = Ptr->getOperand(0);
19644 } else {
19645 Base = Ptr->getOperand(0);
19646 Offset = Ptr->getOperand(1);
19647 }
19648 return true;
19649 }
19650
19651 isInc = (Ptr->getOpcode() == ISD::ADD);
19652 Base = Ptr->getOperand(0);
19653 Offset = Ptr->getOperand(1);
19654 return true;
19655 }
19656
19657 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19658 return false;
19659}
19660
19662 bool isSEXTLoad, SDValue &Base,
19663 SDValue &Offset, bool &isInc,
19664 SelectionDAG &DAG) {
19665 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19666 return false;
19667
19668 Base = Ptr->getOperand(0);
19670 int RHSC = (int)RHS->getZExtValue();
19671 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19672 assert(Ptr->getOpcode() == ISD::ADD);
19673 isInc = false;
19674 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19675 return true;
19676 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19677 isInc = Ptr->getOpcode() == ISD::ADD;
19678 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19679 return true;
19680 }
19681 }
19682
19683 return false;
19684}
19685
19686static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19687 bool isSEXTLoad, bool IsMasked, bool isLE,
19689 bool &isInc, SelectionDAG &DAG) {
19690 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19691 return false;
19692 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19693 return false;
19694
19695 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19696 // as opposed to a vldrw.32). This can allow extra addressing modes or
19697 // alignments for what is otherwise an equivalent instruction.
19698 bool CanChangeType = isLE && !IsMasked;
19699
19701 int RHSC = (int)RHS->getZExtValue();
19702
19703 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19704 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19705 assert(Ptr->getOpcode() == ISD::ADD);
19706 isInc = false;
19707 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19708 return true;
19709 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19710 isInc = Ptr->getOpcode() == ISD::ADD;
19711 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19712 return true;
19713 }
19714 return false;
19715 };
19716
19717 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19718 // (in BE/masked) type.
19719 Base = Ptr->getOperand(0);
19720 if (VT == MVT::v4i16) {
19721 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19722 return true;
19723 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19724 if (IsInRange(RHSC, 0x80, 1))
19725 return true;
19726 } else if (Alignment >= 4 &&
19727 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19728 IsInRange(RHSC, 0x80, 4))
19729 return true;
19730 else if (Alignment >= 2 &&
19731 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19732 IsInRange(RHSC, 0x80, 2))
19733 return true;
19734 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19735 return true;
19736 return false;
19737}
19738
19739/// getPreIndexedAddressParts - returns true by value, base pointer and
19740/// offset pointer and addressing mode by reference if the node's address
19741/// can be legally represented as pre-indexed load / store address.
19742bool
19744 SDValue &Offset,
19746 SelectionDAG &DAG) const {
19747 if (Subtarget->isThumb1Only())
19748 return false;
19749
19750 EVT VT;
19751 SDValue Ptr;
19752 Align Alignment;
19753 unsigned AS = 0;
19754 bool isSEXTLoad = false;
19755 bool IsMasked = false;
19756 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19757 Ptr = LD->getBasePtr();
19758 VT = LD->getMemoryVT();
19759 Alignment = LD->getAlign();
19760 AS = LD->getAddressSpace();
19761 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19762 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19763 Ptr = ST->getBasePtr();
19764 VT = ST->getMemoryVT();
19765 Alignment = ST->getAlign();
19766 AS = ST->getAddressSpace();
19767 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19768 Ptr = LD->getBasePtr();
19769 VT = LD->getMemoryVT();
19770 Alignment = LD->getAlign();
19771 AS = LD->getAddressSpace();
19772 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19773 IsMasked = true;
19775 Ptr = ST->getBasePtr();
19776 VT = ST->getMemoryVT();
19777 Alignment = ST->getAlign();
19778 AS = ST->getAddressSpace();
19779 IsMasked = true;
19780 } else
19781 return false;
19782
19783 unsigned Fast = 0;
19784 if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment,
19786 // Only generate post-increment or pre-increment forms when a real
19787 // hardware instruction exists for them. Do not emit postinc/preinc
19788 // if the operation will end up as a libcall.
19789 return false;
19790 }
19791
19792 bool isInc;
19793 bool isLegal = false;
19794 if (VT.isVector())
19795 isLegal = Subtarget->hasMVEIntegerOps() &&
19797 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19798 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19799 else {
19800 if (Subtarget->isThumb2())
19801 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19802 Offset, isInc, DAG);
19803 else
19804 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19805 Offset, isInc, DAG);
19806 }
19807 if (!isLegal)
19808 return false;
19809
19810 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19811 return true;
19812}
19813
19814/// getPostIndexedAddressParts - returns true by value, base pointer and
19815/// offset pointer and addressing mode by reference if this node can be
19816/// combined with a load / store to form a post-indexed load / store.
19818 SDValue &Base,
19819 SDValue &Offset,
19821 SelectionDAG &DAG) const {
19822 EVT VT;
19823 SDValue Ptr;
19824 Align Alignment;
19825 bool isSEXTLoad = false, isNonExt;
19826 bool IsMasked = false;
19827 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19828 VT = LD->getMemoryVT();
19829 Ptr = LD->getBasePtr();
19830 Alignment = LD->getAlign();
19831 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19832 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19833 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19834 VT = ST->getMemoryVT();
19835 Ptr = ST->getBasePtr();
19836 Alignment = ST->getAlign();
19837 isNonExt = !ST->isTruncatingStore();
19838 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19839 VT = LD->getMemoryVT();
19840 Ptr = LD->getBasePtr();
19841 Alignment = LD->getAlign();
19842 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19843 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19844 IsMasked = true;
19846 VT = ST->getMemoryVT();
19847 Ptr = ST->getBasePtr();
19848 Alignment = ST->getAlign();
19849 isNonExt = !ST->isTruncatingStore();
19850 IsMasked = true;
19851 } else
19852 return false;
19853
19854 if (Subtarget->isThumb1Only()) {
19855 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19856 // must be non-extending/truncating, i32, with an offset of 4.
19857 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19858 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19859 return false;
19860 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19861 if (!RHS || RHS->getZExtValue() != 4)
19862 return false;
19863 if (Alignment < Align(4))
19864 return false;
19865
19866 Offset = Op->getOperand(1);
19867 Base = Op->getOperand(0);
19868 AM = ISD::POST_INC;
19869 return true;
19870 }
19871
19872 bool isInc;
19873 bool isLegal = false;
19874 if (VT.isVector())
19875 isLegal = Subtarget->hasMVEIntegerOps() &&
19876 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19877 Subtarget->isLittle(), Base, Offset,
19878 isInc, DAG);
19879 else {
19880 if (Subtarget->isThumb2())
19881 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19882 isInc, DAG);
19883 else
19884 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19885 isInc, DAG);
19886 }
19887 if (!isLegal)
19888 return false;
19889
19890 if (Ptr != Base) {
19891 // Swap base ptr and offset to catch more post-index load / store when
19892 // it's legal. In Thumb2 mode, offset must be an immediate.
19893 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19894 !Subtarget->isThumb2())
19896
19897 // Post-indexed load / store update the base pointer.
19898 if (Ptr != Base)
19899 return false;
19900 }
19901
19902 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19903 return true;
19904}
19905
19907 KnownBits &Known,
19908 const APInt &DemandedElts,
19909 const SelectionDAG &DAG,
19910 unsigned Depth) const {
19911 unsigned BitWidth = Known.getBitWidth();
19912 Known.resetAll();
19913 switch (Op.getOpcode()) {
19914 default: break;
19915 case ARMISD::ADDC:
19916 case ARMISD::ADDE:
19917 case ARMISD::SUBC:
19918 case ARMISD::SUBE:
19919 // Special cases when we convert a carry to a boolean.
19920 if (Op.getResNo() == 0) {
19921 SDValue LHS = Op.getOperand(0);
19922 SDValue RHS = Op.getOperand(1);
19923 // (ADDE 0, 0, C) will give us a single bit.
19924 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19925 isNullConstant(RHS)) {
19927 return;
19928 }
19929 }
19930 break;
19931 case ARMISD::CMOV: {
19932 // Bits are known zero/one if known on the LHS and RHS.
19933 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19934 if (Known.isUnknown())
19935 return;
19936
19937 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19938 Known = Known.intersectWith(KnownRHS);
19939 return;
19940 }
19942 Intrinsic::ID IntID =
19943 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19944 switch (IntID) {
19945 default: return;
19946 case Intrinsic::arm_ldaex:
19947 case Intrinsic::arm_ldrex: {
19948 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19949 unsigned MemBits = VT.getScalarSizeInBits();
19950 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19951 return;
19952 }
19953 }
19954 }
19955 case ARMISD::BFI: {
19956 // Conservatively, we can recurse down the first operand
19957 // and just mask out all affected bits.
19958 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19959
19960 // The operand to BFI is already a mask suitable for removing the bits it
19961 // sets.
19962 const APInt &Mask = Op.getConstantOperandAPInt(2);
19963 Known.Zero &= Mask;
19964 Known.One &= Mask;
19965 return;
19966 }
19967 case ARMISD::VGETLANEs:
19968 case ARMISD::VGETLANEu: {
19969 const SDValue &SrcSV = Op.getOperand(0);
19970 EVT VecVT = SrcSV.getValueType();
19971 assert(VecVT.isVector() && "VGETLANE expected a vector type");
19972 const unsigned NumSrcElts = VecVT.getVectorNumElements();
19973 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
19974 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
19975 "VGETLANE index out of bounds");
19976 unsigned Idx = Pos->getZExtValue();
19977 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
19978 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
19979
19980 EVT VT = Op.getValueType();
19981 const unsigned DstSz = VT.getScalarSizeInBits();
19982 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
19983 (void)SrcSz;
19984 assert(SrcSz == Known.getBitWidth());
19985 assert(DstSz > SrcSz);
19986 if (Op.getOpcode() == ARMISD::VGETLANEs)
19987 Known = Known.sext(DstSz);
19988 else {
19989 Known = Known.zext(DstSz);
19990 }
19991 assert(DstSz == Known.getBitWidth());
19992 break;
19993 }
19994 case ARMISD::VMOVrh: {
19995 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19996 assert(KnownOp.getBitWidth() == 16);
19997 Known = KnownOp.zext(32);
19998 break;
19999 }
20000 case ARMISD::CSINC:
20001 case ARMISD::CSINV:
20002 case ARMISD::CSNEG: {
20003 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20004 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20005
20006 // The result is either:
20007 // CSINC: KnownOp0 or KnownOp1 + 1
20008 // CSINV: KnownOp0 or ~KnownOp1
20009 // CSNEG: KnownOp0 or KnownOp1 * -1
20010 if (Op.getOpcode() == ARMISD::CSINC)
20011 KnownOp1 =
20012 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20013 else if (Op.getOpcode() == ARMISD::CSINV)
20014 std::swap(KnownOp1.Zero, KnownOp1.One);
20015 else if (Op.getOpcode() == ARMISD::CSNEG)
20016 KnownOp1 = KnownBits::mul(KnownOp1,
20018
20019 Known = KnownOp0.intersectWith(KnownOp1);
20020 break;
20021 }
20022 case ARMISD::VORRIMM:
20023 case ARMISD::VBICIMM: {
20024 unsigned Encoded = Op.getConstantOperandVal(1);
20025 unsigned DecEltBits = 0;
20026 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
20027
20028 unsigned EltBits = Op.getScalarValueSizeInBits();
20029 if (EltBits != DecEltBits) {
20030 // Be conservative: only update Known when EltBits == DecEltBits.
20031 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20032 // that changes in the future, doing nothing here is safer than risking
20033 // subtle bugs.
20034 break;
20035 }
20036
20037 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20038 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20039 APInt Imm(DecEltBits, DecodedVal);
20040
20041 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20042 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20043 break;
20044 }
20045 }
20046}
20047
20049 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20050 TargetLoweringOpt &TLO) const {
20051 // Delay optimization, so we don't have to deal with illegal types, or block
20052 // optimizations.
20053 if (!TLO.LegalOps)
20054 return false;
20055
20056 // Only optimize AND for now.
20057 if (Op.getOpcode() != ISD::AND)
20058 return false;
20059
20060 EVT VT = Op.getValueType();
20061
20062 // Ignore vectors.
20063 if (VT.isVector())
20064 return false;
20065
20066 assert(VT == MVT::i32 && "Unexpected integer type");
20067
20068 // Make sure the RHS really is a constant.
20069 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20070 if (!C)
20071 return false;
20072
20073 unsigned Mask = C->getZExtValue();
20074
20075 unsigned Demanded = DemandedBits.getZExtValue();
20076 unsigned ShrunkMask = Mask & Demanded;
20077 unsigned ExpandedMask = Mask | ~Demanded;
20078
20079 // If the mask is all zeros, let the target-independent code replace the
20080 // result with zero.
20081 if (ShrunkMask == 0)
20082 return false;
20083
20084 // If the mask is all ones, erase the AND. (Currently, the target-independent
20085 // code won't do this, so we have to do it explicitly to avoid an infinite
20086 // loop in obscure cases.)
20087 if (ExpandedMask == ~0U)
20088 return TLO.CombineTo(Op, Op.getOperand(0));
20089
20090 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20091 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20092 };
20093 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20094 if (NewMask == Mask)
20095 return true;
20096 SDLoc DL(Op);
20097 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20098 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20099 return TLO.CombineTo(Op, NewOp);
20100 };
20101
20102 // Prefer uxtb mask.
20103 if (IsLegalMask(0xFF))
20104 return UseMask(0xFF);
20105
20106 // Prefer uxth mask.
20107 if (IsLegalMask(0xFFFF))
20108 return UseMask(0xFFFF);
20109
20110 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20111 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20112 if (ShrunkMask < 256)
20113 return UseMask(ShrunkMask);
20114
20115 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20116 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20117 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20118 return UseMask(ExpandedMask);
20119
20120 // Potential improvements:
20121 //
20122 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20123 // We could try to prefer Thumb1 immediates which can be lowered to a
20124 // two-instruction sequence.
20125 // We could try to recognize more legal ARM/Thumb2 immediates here.
20126
20127 return false;
20128}
20129
20131 SDValue Op, const APInt &OriginalDemandedBits,
20132 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20133 unsigned Depth) const {
20134 unsigned Opc = Op.getOpcode();
20135
20136 switch (Opc) {
20137 case ARMISD::ASRL:
20138 case ARMISD::LSRL: {
20139 // If this is result 0 and the other result is unused, see if the demand
20140 // bits allow us to shrink this long shift into a standard small shift in
20141 // the opposite direction.
20142 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20143 isa<ConstantSDNode>(Op->getOperand(2))) {
20144 unsigned ShAmt = Op->getConstantOperandVal(2);
20145 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20146 << (32 - ShAmt)))
20147 return TLO.CombineTo(
20148 Op, TLO.DAG.getNode(
20149 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20150 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20151 }
20152 break;
20153 }
20154 case ARMISD::VBICIMM: {
20155 SDValue Op0 = Op.getOperand(0);
20156 unsigned ModImm = Op.getConstantOperandVal(1);
20157 unsigned EltBits = 0;
20158 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20159 if ((OriginalDemandedBits & Mask) == 0)
20160 return TLO.CombineTo(Op, Op0);
20161 }
20162 }
20163
20165 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20166}
20167
20168//===----------------------------------------------------------------------===//
20169// ARM Inline Assembly Support
20170//===----------------------------------------------------------------------===//
20171
20172const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20173 // At this point, we have to lower this constraint to something else, so we
20174 // lower it to an "r" or "w". However, by doing this we will force the result
20175 // to be in register, while the X constraint is much more permissive.
20176 //
20177 // Although we are correct (we are free to emit anything, without
20178 // constraints), we might break use cases that would expect us to be more
20179 // efficient and emit something else.
20180 if (!Subtarget->hasVFP2Base())
20181 return "r";
20182 if (ConstraintVT.isFloatingPoint())
20183 return "w";
20184 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20185 (ConstraintVT.getSizeInBits() == 64 ||
20186 ConstraintVT.getSizeInBits() == 128))
20187 return "w";
20188
20189 return "r";
20190}
20191
20192/// getConstraintType - Given a constraint letter, return the type of
20193/// constraint it is for this target.
20196 unsigned S = Constraint.size();
20197 if (S == 1) {
20198 switch (Constraint[0]) {
20199 default: break;
20200 case 'l': return C_RegisterClass;
20201 case 'w': return C_RegisterClass;
20202 case 'h': return C_RegisterClass;
20203 case 'x': return C_RegisterClass;
20204 case 't': return C_RegisterClass;
20205 case 'j': return C_Immediate; // Constant for movw.
20206 // An address with a single base register. Due to the way we
20207 // currently handle addresses it is the same as an 'r' memory constraint.
20208 case 'Q': return C_Memory;
20209 }
20210 } else if (S == 2) {
20211 switch (Constraint[0]) {
20212 default: break;
20213 case 'T': return C_RegisterClass;
20214 // All 'U+' constraints are addresses.
20215 case 'U': return C_Memory;
20216 }
20217 }
20218 return TargetLowering::getConstraintType(Constraint);
20219}
20220
20221/// Examine constraint type and operand type and determine a weight value.
20222/// This object must already have been set up with the operand type
20223/// and the current alternative constraint selected.
20226 AsmOperandInfo &info, const char *constraint) const {
20228 Value *CallOperandVal = info.CallOperandVal;
20229 // If we don't have a value, we can't do a match,
20230 // but allow it at the lowest weight.
20231 if (!CallOperandVal)
20232 return CW_Default;
20233 Type *type = CallOperandVal->getType();
20234 // Look at the constraint type.
20235 switch (*constraint) {
20236 default:
20238 break;
20239 case 'l':
20240 if (type->isIntegerTy()) {
20241 if (Subtarget->isThumb())
20242 weight = CW_SpecificReg;
20243 else
20244 weight = CW_Register;
20245 }
20246 break;
20247 case 'w':
20248 if (type->isFloatingPointTy())
20249 weight = CW_Register;
20250 break;
20251 }
20252 return weight;
20253}
20254
20255static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20256 if (PR == 0 || VT == MVT::Other)
20257 return false;
20258 if (ARM::SPRRegClass.contains(PR))
20259 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20260 if (ARM::DPRRegClass.contains(PR))
20261 return VT != MVT::f64 && !VT.is64BitVector();
20262 return false;
20263}
20264
20265using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20266
20268 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20269 switch (Constraint.size()) {
20270 case 1:
20271 // GCC ARM Constraint Letters
20272 switch (Constraint[0]) {
20273 case 'l': // Low regs or general regs.
20274 if (Subtarget->isThumb())
20275 return RCPair(0U, &ARM::tGPRRegClass);
20276 return RCPair(0U, &ARM::GPRRegClass);
20277 case 'h': // High regs or no regs.
20278 if (Subtarget->isThumb())
20279 return RCPair(0U, &ARM::hGPRRegClass);
20280 break;
20281 case 'r':
20282 if (Subtarget->isThumb1Only())
20283 return RCPair(0U, &ARM::tGPRRegClass);
20284 return RCPair(0U, &ARM::GPRRegClass);
20285 case 'w':
20286 if (VT == MVT::Other)
20287 break;
20288 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20289 return RCPair(0U, &ARM::SPRRegClass);
20290 if (VT.getSizeInBits() == 64)
20291 return RCPair(0U, &ARM::DPRRegClass);
20292 if (VT.getSizeInBits() == 128)
20293 return RCPair(0U, &ARM::QPRRegClass);
20294 break;
20295 case 'x':
20296 if (VT == MVT::Other)
20297 break;
20298 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20299 return RCPair(0U, &ARM::SPR_8RegClass);
20300 if (VT.getSizeInBits() == 64)
20301 return RCPair(0U, &ARM::DPR_8RegClass);
20302 if (VT.getSizeInBits() == 128)
20303 return RCPair(0U, &ARM::QPR_8RegClass);
20304 break;
20305 case 't':
20306 if (VT == MVT::Other)
20307 break;
20308 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20309 return RCPair(0U, &ARM::SPRRegClass);
20310 if (VT.getSizeInBits() == 64)
20311 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20312 if (VT.getSizeInBits() == 128)
20313 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20314 break;
20315 }
20316 break;
20317
20318 case 2:
20319 if (Constraint[0] == 'T') {
20320 switch (Constraint[1]) {
20321 default:
20322 break;
20323 case 'e':
20324 return RCPair(0U, &ARM::tGPREvenRegClass);
20325 case 'o':
20326 return RCPair(0U, &ARM::tGPROddRegClass);
20327 }
20328 }
20329 break;
20330
20331 default:
20332 break;
20333 }
20334
20335 if (StringRef("{cc}").equals_insensitive(Constraint))
20336 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20337
20338 // r14 is an alias of lr.
20339 if (StringRef("{r14}").equals_insensitive(Constraint))
20340 return std::make_pair(unsigned(ARM::LR), getRegClassFor(MVT::i32));
20341
20342 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20343 if (isIncompatibleReg(RCP.first, VT))
20344 return {0, nullptr};
20345 return RCP;
20346}
20347
20348/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20349/// vector. If it is invalid, don't add anything to Ops.
20351 StringRef Constraint,
20352 std::vector<SDValue> &Ops,
20353 SelectionDAG &DAG) const {
20354 SDValue Result;
20355
20356 // Currently only support length 1 constraints.
20357 if (Constraint.size() != 1)
20358 return;
20359
20360 char ConstraintLetter = Constraint[0];
20361 switch (ConstraintLetter) {
20362 default: break;
20363 case 'j':
20364 case 'I': case 'J': case 'K': case 'L':
20365 case 'M': case 'N': case 'O':
20367 if (!C)
20368 return;
20369
20370 int64_t CVal64 = C->getSExtValue();
20371 int CVal = (int) CVal64;
20372 // None of these constraints allow values larger than 32 bits. Check
20373 // that the value fits in an int.
20374 if (CVal != CVal64)
20375 return;
20376
20377 switch (ConstraintLetter) {
20378 case 'j':
20379 // Constant suitable for movw, must be between 0 and
20380 // 65535.
20381 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20382 if (CVal >= 0 && CVal <= 65535)
20383 break;
20384 return;
20385 case 'I':
20386 if (Subtarget->isThumb1Only()) {
20387 // This must be a constant between 0 and 255, for ADD
20388 // immediates.
20389 if (CVal >= 0 && CVal <= 255)
20390 break;
20391 } else if (Subtarget->isThumb2()) {
20392 // A constant that can be used as an immediate value in a
20393 // data-processing instruction.
20394 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20395 break;
20396 } else {
20397 // A constant that can be used as an immediate value in a
20398 // data-processing instruction.
20399 if (ARM_AM::getSOImmVal(CVal) != -1)
20400 break;
20401 }
20402 return;
20403
20404 case 'J':
20405 if (Subtarget->isThumb1Only()) {
20406 // This must be a constant between -255 and -1, for negated ADD
20407 // immediates. This can be used in GCC with an "n" modifier that
20408 // prints the negated value, for use with SUB instructions. It is
20409 // not useful otherwise but is implemented for compatibility.
20410 if (CVal >= -255 && CVal <= -1)
20411 break;
20412 } else {
20413 // This must be a constant between -4095 and 4095. This is suitable
20414 // for use as the immediate offset field in LDR and STR instructions
20415 // such as LDR r0,[r1,#offset].
20416 if (CVal >= -4095 && CVal <= 4095)
20417 break;
20418 }
20419 return;
20420
20421 case 'K':
20422 if (Subtarget->isThumb1Only()) {
20423 // A 32-bit value where only one byte has a nonzero value. Exclude
20424 // zero to match GCC. This constraint is used by GCC internally for
20425 // constants that can be loaded with a move/shift combination.
20426 // It is not useful otherwise but is implemented for compatibility.
20427 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20428 break;
20429 } else if (Subtarget->isThumb2()) {
20430 // A constant whose bitwise inverse can be used as an immediate
20431 // value in a data-processing instruction. This can be used in GCC
20432 // with a "B" modifier that prints the inverted value, for use with
20433 // BIC and MVN instructions. It is not useful otherwise but is
20434 // implemented for compatibility.
20435 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20436 break;
20437 } else {
20438 // A constant whose bitwise inverse can be used as an immediate
20439 // value in a data-processing instruction. This can be used in GCC
20440 // with a "B" modifier that prints the inverted value, for use with
20441 // BIC and MVN instructions. It is not useful otherwise but is
20442 // implemented for compatibility.
20443 if (ARM_AM::getSOImmVal(~CVal) != -1)
20444 break;
20445 }
20446 return;
20447
20448 case 'L':
20449 if (Subtarget->isThumb1Only()) {
20450 // This must be a constant between -7 and 7,
20451 // for 3-operand ADD/SUB immediate instructions.
20452 if (CVal >= -7 && CVal < 7)
20453 break;
20454 } else if (Subtarget->isThumb2()) {
20455 // A constant whose negation can be used as an immediate value in a
20456 // data-processing instruction. This can be used in GCC with an "n"
20457 // modifier that prints the negated value, for use with SUB
20458 // instructions. It is not useful otherwise but is implemented for
20459 // compatibility.
20460 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20461 break;
20462 } else {
20463 // A constant whose negation can be used as an immediate value in a
20464 // data-processing instruction. This can be used in GCC with an "n"
20465 // modifier that prints the negated value, for use with SUB
20466 // instructions. It is not useful otherwise but is implemented for
20467 // compatibility.
20468 if (ARM_AM::getSOImmVal(-CVal) != -1)
20469 break;
20470 }
20471 return;
20472
20473 case 'M':
20474 if (Subtarget->isThumb1Only()) {
20475 // This must be a multiple of 4 between 0 and 1020, for
20476 // ADD sp + immediate.
20477 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20478 break;
20479 } else {
20480 // A power of two or a constant between 0 and 32. This is used in
20481 // GCC for the shift amount on shifted register operands, but it is
20482 // useful in general for any shift amounts.
20483 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20484 break;
20485 }
20486 return;
20487
20488 case 'N':
20489 if (Subtarget->isThumb1Only()) {
20490 // This must be a constant between 0 and 31, for shift amounts.
20491 if (CVal >= 0 && CVal <= 31)
20492 break;
20493 }
20494 return;
20495
20496 case 'O':
20497 if (Subtarget->isThumb1Only()) {
20498 // This must be a multiple of 4 between -508 and 508, for
20499 // ADD/SUB sp = sp + immediate.
20500 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20501 break;
20502 }
20503 return;
20504 }
20505 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20506 break;
20507 }
20508
20509 if (Result.getNode()) {
20510 Ops.push_back(Result);
20511 return;
20512 }
20513 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20514}
20515
20516static RTLIB::Libcall getDivRemLibcall(
20517 const SDNode *N, MVT::SimpleValueType SVT) {
20518 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20519 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20520 "Unhandled Opcode in getDivRemLibcall");
20521 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20522 N->getOpcode() == ISD::SREM;
20523 RTLIB::Libcall LC;
20524 switch (SVT) {
20525 default: llvm_unreachable("Unexpected request for libcall!");
20526 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20527 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20528 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20529 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20530 }
20531 return LC;
20532}
20533
20535 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20536 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20537 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20538 "Unhandled Opcode in getDivRemArgList");
20539 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20540 N->getOpcode() == ISD::SREM;
20542 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20543 EVT ArgVT = N->getOperand(i).getValueType();
20544 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20545 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20546 Entry.IsSExt = isSigned;
20547 Entry.IsZExt = !isSigned;
20548 Args.push_back(Entry);
20549 }
20550 if (Subtarget->getTargetTriple().isOSWindows() && Args.size() >= 2)
20551 std::swap(Args[0], Args[1]);
20552 return Args;
20553}
20554
20555SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20556 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20557 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20558 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20559 "Register-based DivRem lowering only");
20560 unsigned Opcode = Op->getOpcode();
20561 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20562 "Invalid opcode for Div/Rem lowering");
20563 bool isSigned = (Opcode == ISD::SDIVREM);
20564 EVT VT = Op->getValueType(0);
20565 SDLoc dl(Op);
20566
20567 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20569 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20570 SDValue Res0 =
20571 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20572 SDValue Res1 =
20573 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20574 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20575 {Res0, Res1});
20576 }
20577 }
20578
20579 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20580
20581 // If the target has hardware divide, use divide + multiply + subtract:
20582 // div = a / b
20583 // rem = a - b * div
20584 // return {div, rem}
20585 // This should be lowered into UDIV/SDIV + MLS later on.
20586 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20587 : Subtarget->hasDivideInARMMode();
20588 if (hasDivide && Op->getValueType(0).isSimple() &&
20589 Op->getSimpleValueType(0) == MVT::i32) {
20590 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20591 const SDValue Dividend = Op->getOperand(0);
20592 const SDValue Divisor = Op->getOperand(1);
20593 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20594 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20595 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20596
20597 SDValue Values[2] = {Div, Rem};
20598 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20599 }
20600
20601 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20602 VT.getSimpleVT().SimpleTy);
20603 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20604
20605 SDValue InChain = DAG.getEntryNode();
20606
20608 DAG.getContext(),
20609 Subtarget);
20610
20611 SDValue Callee =
20612 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20613
20614 Type *RetTy = StructType::get(Ty, Ty);
20615
20616 if (getTM().getTargetTriple().isOSWindows())
20617 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20618
20619 TargetLowering::CallLoweringInfo CLI(DAG);
20620 CLI.setDebugLoc(dl)
20621 .setChain(InChain)
20622 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20623 Callee, std::move(Args))
20624 .setInRegister()
20625 .setSExtResult(isSigned)
20626 .setZExtResult(!isSigned);
20627
20628 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20629 return CallInfo.first;
20630}
20631
20632// Lowers REM using divmod helpers
20633// see RTABI section 4.2/4.3
20634SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20635 EVT VT = N->getValueType(0);
20636
20637 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20639 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20640 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20641 Result[0], Result[1]);
20642 }
20643
20644 // Build return types (div and rem)
20645 std::vector<Type*> RetTyParams;
20646 Type *RetTyElement;
20647
20648 switch (VT.getSimpleVT().SimpleTy) {
20649 default: llvm_unreachable("Unexpected request for libcall!");
20650 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20651 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20652 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20653 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20654 }
20655
20656 RetTyParams.push_back(RetTyElement);
20657 RetTyParams.push_back(RetTyElement);
20658 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20659 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20660
20661 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20662 SimpleTy);
20663 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20664 SDValue InChain = DAG.getEntryNode();
20666 Subtarget);
20667 bool isSigned = N->getOpcode() == ISD::SREM;
20668
20669 SDValue Callee =
20670 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20671
20672 if (getTM().getTargetTriple().isOSWindows())
20673 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20674
20675 // Lower call
20676 CallLoweringInfo CLI(DAG);
20677 CLI.setChain(InChain)
20678 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20679 Callee, std::move(Args))
20680 .setSExtResult(isSigned)
20681 .setZExtResult(!isSigned)
20682 .setDebugLoc(SDLoc(N));
20683 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20684
20685 // Return second (rem) result operand (first contains div)
20686 SDNode *ResNode = CallResult.first.getNode();
20687 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20688 return ResNode->getOperand(1);
20689}
20690
20691SDValue
20692ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20693 assert(getTM().getTargetTriple().isOSWindows() &&
20694 "unsupported target platform");
20695 SDLoc DL(Op);
20696
20697 // Get the inputs.
20698 SDValue Chain = Op.getOperand(0);
20699 SDValue Size = Op.getOperand(1);
20700
20702 "no-stack-arg-probe")) {
20703 MaybeAlign Align =
20704 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20705 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20706 Chain = SP.getValue(1);
20707 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20708 if (Align)
20709 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20710 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20711 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20712 SDValue Ops[2] = { SP, Chain };
20713 return DAG.getMergeValues(Ops, DL);
20714 }
20715
20716 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20717 DAG.getConstant(2, DL, MVT::i32));
20718
20719 SDValue Glue;
20720 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20721 Glue = Chain.getValue(1);
20722
20723 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20724 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20725
20726 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20727 Chain = NewSP.getValue(1);
20728
20729 SDValue Ops[2] = { NewSP, Chain };
20730 return DAG.getMergeValues(Ops, DL);
20731}
20732
20733SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
// Custom-lowers (STRICT_)FP_EXTEND when the widening is not natively legal on
// this subtarget: widen in f16->f32->f64 steps, using an instruction for each
// step the hardware supports and a libcall for each step it does not.
20734  bool IsStrict = Op->isStrictFPOpcode();
20735  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20736  const unsigned DstSz = Op.getValueType().getSizeInBits();
20737  const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20738  assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20739         "Unexpected type for custom-lowering FP_EXTEND");
20740
20741  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20742         "With both FP DP and 16, any FP conversion is legal!");
20743
20744  assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20745         "With FP16, 16 to 32 conversion is legal!");
20746
20747  // Converting from 32 -> 64 is valid if we have FP64.
20748  if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20749    // FIXME: Remove this when we have strict fp instruction selection patterns
20750    if (IsStrict) {
20751      SDLoc Loc(Op);
// NOTE(review): a line is missing from this extract here (source line 20752);
// it appears to declare `Result` via DAG.getNode(ISD::FP_EXTEND, ...) —
// confirm against upstream.
20753                                  Loc, Op.getValueType(), SrcVal);
20754      return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20755    }
20756    return Op;
20757  }
20758
20759  // Either we are converting from 16 -> 64, without FP16 and/or
20760  // FP.double-precision or without Armv8-fp. So we must do it in two
20761  // steps.
20762  // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
20763  // without FP16. So we must do a function call.
20764  SDLoc Loc(Op);
20765  RTLIB::Libcall LC;
20766  MakeLibCallOptions CallOptions;
20767  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// Walk up one size step per iteration (16->32, then 32->64) until DstSz is
// reached; each step is either an instruction or a libcall.
20768  for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20769    bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20770    MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20771    MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20772    if (Supported) {
20773      if (IsStrict) {
20774        SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20775                             {DstVT, MVT::Other}, {Chain, SrcVal});
20776        Chain = SrcVal.getValue(1);
20777      } else {
20778        SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20779      }
20780    } else {
20781      LC = RTLIB::getFPEXT(SrcVT, DstVT);
20782      assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20783             "Unexpected type for custom-lowering FP_EXTEND");
20784      std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20785                                            Loc, Chain);
20786    }
20787  }
20788
// Strict ops must also return the (possibly updated) chain.
20789  return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20790}
20791
20792SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
// Custom-lowers (STRICT_)FP_ROUND when narrowing is not natively legal:
// 32->16 is an instruction when the subtarget has FP16, otherwise (and for
// 64->32/16) a libcall is emitted.
20793  bool IsStrict = Op->isStrictFPOpcode();
20794
20795  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20796  EVT SrcVT = SrcVal.getValueType();
20797  EVT DstVT = Op.getValueType();
20798  const unsigned DstSz = Op.getValueType().getSizeInBits();
20799  const unsigned SrcSz = SrcVT.getSizeInBits();
20800  (void)DstSz;
20801  assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20802         "Unexpected type for custom-lowering FP_ROUND");
20803
20804  assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20805         "With both FP DP and 16, any FP conversion is legal!");
20806
20807  SDLoc Loc(Op);
20808
20809  // Instruction from 32 -> 16 if hasFP16 is valid
20810  if (SrcSz == 32 && Subtarget->hasFP16())
20811    return Op;
20812
20813  // Lib call from 32 -> 16 / 64 -> [32, 16]
20814  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20815  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20816         "Unexpected type for custom-lowering FP_ROUND");
20817  MakeLibCallOptions CallOptions;
20818  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// NOTE(review): a line is missing from this extract here (source line 20819);
// it appears to declare `Result` — confirm against upstream.
20820  std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20821                                        Loc, Chain);
// Strict ops must also return the chain from the libcall.
20822  return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20823}
20824
20825bool
// NOTE(review): the line carrying this function's name and parameters (source
// line 20826) is missing from this extract; it appears to be
// ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode*) —
// confirm against upstream. It unconditionally declines offset folding.
20827  // The ARM target isn't yet aware of offsets.
20828  return false;
20829}
20830
// NOTE(review): the opening declaration (source line 20831) is missing from
// this extract; based on the body it appears to be a predicate taking an
// unsigned `v` (upstream: ARM::isBitFieldInvertedMask) — confirm upstream.
// Returns true iff ~v is a single contiguous run of set bits, i.e. v has ones
// on the "outsides" and a single run of zeros "inside" (and is not all-ones).
20832  if (v == 0xffffffff)
20833    return false;
20834
20835  // there can be 1's on either or both "outsides", all the "inside"
20836  // bits must be 0's
20837  return isShiftedMask_32(~v);
20838}
20839
20840/// isFPImmLegal - Returns true if the target can instruction select the
20841/// specified FP immediate natively. If false, the legalizer will
20842/// materialize the FP immediate as a load from a constant pool.
// NOTE(review): the first line of the declaration (source line 20843) is
// missing from this extract; per the doc comment above this is
// ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, ...).
20844                                     bool ForCodeSize) const {
// VFPv3 is the baseline for any FP immediate-encoding support.
20845  if (!Subtarget->hasVFP3Base())
20846    return false;
20847  if (VT == MVT::f16 && Subtarget->hasFullFP16())
20848    return ARM_AM::getFP16Imm(Imm) != -1;
// An f32 immediate that is exactly representable as an f16 immediate can be
// materialized via the FP16 encoding when FullFP16 is available.
20849  if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20850      ARM_AM::getFP32FP16Imm(Imm) != -1)
20851    return true;
20852  if (VT == MVT::f32)
20853    return ARM_AM::getFP32Imm(Imm) != -1;
20854  if (VT == MVT::f64 && Subtarget->hasFP64())
20855    return ARM_AM::getFP64Imm(Imm) != -1;
20856  return false;
20857}
20858
20859/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20860/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20861/// specified in the intrinsic calls.
// NOTE(review): the function header (source lines 20862-20863) is missing from
// this extract; based on the body it takes the call `I`, an `Infos` output
// vector, `MF`, and the `Intrinsic` id, and appends one IntrinsicInfo per
// recognized memory intrinsic — confirm the exact signature upstream.
20864                                           MachineFunction &MF, unsigned Intrinsic) const {
20865  IntrinsicInfo Info;
20866  switch (Intrinsic) {
20867  case Intrinsic::arm_neon_vld1:
20868  case Intrinsic::arm_neon_vld2:
20869  case Intrinsic::arm_neon_vld3:
20870  case Intrinsic::arm_neon_vld4:
20871  case Intrinsic::arm_neon_vld2lane:
20872  case Intrinsic::arm_neon_vld3lane:
20873  case Intrinsic::arm_neon_vld4lane:
20874  case Intrinsic::arm_neon_vld2dup:
20875  case Intrinsic::arm_neon_vld3dup:
20876  case Intrinsic::arm_neon_vld4dup: {
20877    Info.opc = ISD::INTRINSIC_W_CHAIN;
20878    // Conservatively set memVT to the entire set of vectors loaded.
20879    auto &DL = I.getDataLayout();
20880    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20881    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20882    Info.ptrVal = I.getArgOperand(0);
20883    Info.offset = 0;
// These intrinsics carry their alignment as a trailing constant argument.
20884    Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20885    Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20886    // volatile loads with NEON intrinsics not supported
20887    Info.flags = MachineMemOperand::MOLoad;
20888    Infos.push_back(Info);
20889    return;
20890  }
20891  case Intrinsic::arm_neon_vld1x2:
20892  case Intrinsic::arm_neon_vld1x3:
20893  case Intrinsic::arm_neon_vld1x4: {
20894    Info.opc = ISD::INTRINSIC_W_CHAIN;
20895    // Conservatively set memVT to the entire set of vectors loaded.
20896    auto &DL = I.getDataLayout();
20897    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20898    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
// Unlike plain vldN, the vld1xN pointer is the last argument and alignment
// comes from the parameter attribute rather than an explicit argument.
20899    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20900    Info.offset = 0;
20901    Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20902    // volatile loads with NEON intrinsics not supported
20903    Info.flags = MachineMemOperand::MOLoad;
20904    Infos.push_back(Info);
20905    return;
20906  }
20907  case Intrinsic::arm_neon_vst1:
20908  case Intrinsic::arm_neon_vst2:
20909  case Intrinsic::arm_neon_vst3:
20910  case Intrinsic::arm_neon_vst4:
20911  case Intrinsic::arm_neon_vst2lane:
20912  case Intrinsic::arm_neon_vst3lane:
20913  case Intrinsic::arm_neon_vst4lane: {
20914    Info.opc = ISD::INTRINSIC_VOID;
20915    // Conservatively set memVT to the entire set of vectors stored.
20916    auto &DL = I.getDataLayout();
20917    unsigned NumElts = 0;
// Sum the sizes of the stored vector operands (args after the pointer, up to
// the first non-vector trailing argument such as lane/alignment).
20918    for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20919      Type *ArgTy = I.getArgOperand(ArgI)->getType();
20920      if (!ArgTy->isVectorTy())
20921        break;
20922      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20923    }
20924    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20925    Info.ptrVal = I.getArgOperand(0);
20926    Info.offset = 0;
20927    Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20928    Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20929    // volatile stores with NEON intrinsics not supported
20930    Info.flags = MachineMemOperand::MOStore;
20931    Infos.push_back(Info);
20932    return;
20933  }
20934  case Intrinsic::arm_neon_vst1x2:
20935  case Intrinsic::arm_neon_vst1x3:
20936  case Intrinsic::arm_neon_vst1x4: {
20937    Info.opc = ISD::INTRINSIC_VOID;
20938    // Conservatively set memVT to the entire set of vectors stored.
20939    auto &DL = I.getDataLayout();
20940    unsigned NumElts = 0;
20941    for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20942      Type *ArgTy = I.getArgOperand(ArgI)->getType();
20943      if (!ArgTy->isVectorTy())
20944        break;
20945      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20946    }
20947    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20948    Info.ptrVal = I.getArgOperand(0);
20949    Info.offset = 0;
// vst1xN alignment comes from the pointer parameter attribute.
20950    Info.align = I.getParamAlign(0).valueOrOne();
20951    // volatile stores with NEON intrinsics not supported
20952    Info.flags = MachineMemOperand::MOStore;
20953    Infos.push_back(Info);
20954    return;
20955  }
20956  case Intrinsic::arm_mve_vld2q:
20957  case Intrinsic::arm_mve_vld4q: {
20958    Info.opc = ISD::INTRINSIC_W_CHAIN;
20959    // Conservatively set memVT to the entire set of vectors loaded.
20960    Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20961    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
// Each MVE q-register is 128 bits = two i64 elements, hence Factor * 2.
20962    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20963    Info.ptrVal = I.getArgOperand(0);
20964    Info.offset = 0;
20965    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20966    // volatile loads with MVE intrinsics not supported
20967    Info.flags = MachineMemOperand::MOLoad;
20968    Infos.push_back(Info);
20969    return;
20970  }
20971  case Intrinsic::arm_mve_vst2q:
20972  case Intrinsic::arm_mve_vst4q: {
20973    Info.opc = ISD::INTRINSIC_VOID;
20974    // Conservatively set memVT to the entire set of vectors stored.
20975    Type *VecTy = I.getArgOperand(1)->getType();
20976    unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
20977    Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20978    Info.ptrVal = I.getArgOperand(0);
20979    Info.offset = 0;
20980    Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20981    // volatile stores with MVE intrinsics not supported
20982    Info.flags = MachineMemOperand::MOStore;
20983    Infos.push_back(Info);
20984    return;
20985  }
// MVE gathers/scatters: the base address is a vector, so there is no single
// scalar pointer to record (ptrVal = nullptr) and alignment is unknown (1).
20986  case Intrinsic::arm_mve_vldr_gather_base:
20987  case Intrinsic::arm_mve_vldr_gather_base_predicated: {
20988    Info.opc = ISD::INTRINSIC_W_CHAIN;
20989    Info.ptrVal = nullptr;
20990    Info.memVT = MVT::getVT(I.getType());
20991    Info.align = Align(1);
20992    Info.flags |= MachineMemOperand::MOLoad;
20993    Infos.push_back(Info);
20994    return;
20995  }
20996  case Intrinsic::arm_mve_vldr_gather_base_wb:
20997  case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
20998    Info.opc = ISD::INTRINSIC_W_CHAIN;
20999    Info.ptrVal = nullptr;
// Writeback variants return {data, new base}; the loaded data type is the
// first contained type of the struct result.
21000    Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21001    Info.align = Align(1);
21002    Info.flags |= MachineMemOperand::MOLoad;
21003    Infos.push_back(Info);
21004    return;
21005  }
21006  case Intrinsic::arm_mve_vldr_gather_offset:
21007  case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21008    Info.opc = ISD::INTRINSIC_W_CHAIN;
21009    Info.ptrVal = nullptr;
21010    MVT DataVT = MVT::getVT(I.getType());
// The in-memory element width is passed explicitly as argument 2 (it may be
// narrower than the register element width, i.e. an extending gather).
21011    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21012    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21013                                  DataVT.getVectorNumElements());
21014    Info.align = Align(1);
21015    Info.flags |= MachineMemOperand::MOLoad;
21016    Infos.push_back(Info);
21017    return;
21018  }
21019  case Intrinsic::arm_mve_vstr_scatter_base:
21020  case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21021    Info.opc = ISD::INTRINSIC_VOID;
21022    Info.ptrVal = nullptr;
21023    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21024    Info.align = Align(1);
21025    Info.flags |= MachineMemOperand::MOStore;
21026    Infos.push_back(Info);
21027    return;
21028  }
21029  case Intrinsic::arm_mve_vstr_scatter_base_wb:
21030  case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
// W_CHAIN (not VOID) because the writeback variant also produces the updated
// base vector as a result.
21031    Info.opc = ISD::INTRINSIC_W_CHAIN;
21032    Info.ptrVal = nullptr;
21033    Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21034    Info.align = Align(1);
21035    Info.flags |= MachineMemOperand::MOStore;
21036    Infos.push_back(Info);
21037    return;
21038  }
21039  case Intrinsic::arm_mve_vstr_scatter_offset:
21040  case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21041    Info.opc = ISD::INTRINSIC_VOID;
21042    Info.ptrVal = nullptr;
21043    MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
// Truncating scatter: in-memory element width is argument 3.
21044    unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21045    Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21046                                  DataVT.getVectorNumElements());
21047    Info.align = Align(1);
21048    Info.flags |= MachineMemOperand::MOStore;
21049    Infos.push_back(Info);
21050    return;
21051  }
21052  case Intrinsic::arm_ldaex:
21053  case Intrinsic::arm_ldrex: {
21054    auto &DL = I.getDataLayout();
// The accessed type is carried by the elementtype parameter attribute.
21055    Type *ValTy = I.getParamElementType(0);
21056    Info.opc = ISD::INTRINSIC_W_CHAIN;
21057    Info.memVT = MVT::getVT(ValTy);
21058    Info.ptrVal = I.getArgOperand(0);
21059    Info.offset = 0;
21060    Info.align = DL.getABITypeAlign(ValTy);
// NOTE(review): a line is missing from this extract here (source line 21061);
// it appears to set Info.flags for this exclusive load — confirm upstream.
21062    Infos.push_back(Info);
21063    return;
21064  }
21065  case Intrinsic::arm_stlex:
21066  case Intrinsic::arm_strex: {
21067    auto &DL = I.getDataLayout();
21068    Type *ValTy = I.getParamElementType(1);
21069    Info.opc = ISD::INTRINSIC_W_CHAIN;
21070    Info.memVT = MVT::getVT(ValTy);
// strex/stlex arguments are (value, pointer): the pointer is operand 1.
21071    Info.ptrVal = I.getArgOperand(1);
21072    Info.offset = 0;
21073    Info.align = DL.getABITypeAlign(ValTy);
// NOTE(review): a line is missing from this extract here (source line 21074);
// it appears to set Info.flags for this exclusive store — confirm upstream.
21075    Infos.push_back(Info);
21076    return;
21077  }
21078  case Intrinsic::arm_stlexd:
21079  case Intrinsic::arm_strexd:
21080    Info.opc = ISD::INTRINSIC_W_CHAIN;
21081    Info.memVT = MVT::i64;
// strexd/stlexd arguments are (lo, hi, pointer): the pointer is operand 2.
21082    Info.ptrVal = I.getArgOperand(2);
21083    Info.offset = 0;
21084    Info.align = Align(8);
// NOTE(review): a line is missing from this extract here (source line 21085);
// it appears to set Info.flags — confirm upstream.
21086    Infos.push_back(Info);
21087    return;
21088
21089  case Intrinsic::arm_ldaexd:
21090  case Intrinsic::arm_ldrexd:
21091    Info.opc = ISD::INTRINSIC_W_CHAIN;
21092    Info.memVT = MVT::i64;
21093    Info.ptrVal = I.getArgOperand(0);
21094    Info.offset = 0;
21095    Info.align = Align(8);
// NOTE(review): a line is missing from this extract here (source line 21096);
// it appears to set Info.flags — confirm upstream.
21097    Infos.push_back(Info);
21098    return;
21099
21100  default:
// Not a recognized memory intrinsic: append nothing.
21101    break;
21102  }
21103}
21104
21105/// Returns true if it is beneficial to convert a load of a constant
21106/// to just the constant itself.
// NOTE(review): the first line of the declaration (source line 21107) is
// missing from this extract; per the doc comment this is
// ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt&, Type*).
21108                                                          Type *Ty) const {
21109  assert(Ty->isIntegerTy());
21110
// Any integer constant of 32 bits or fewer can be materialized in registers,
// so prefer that over a constant-pool load.
21111  unsigned Bits = Ty->getPrimitiveSizeInBits();
21112  if (Bits == 0 || Bits > 32)
21113    return false;
21114  return true;
21115}
21116
// NOTE(review): the declaration line (source 21117) and a guard line (source
// 21119, presumably a legality check on EXTRACT_SUBVECTOR for ResVT) are
// missing from this extract; this appears to be
// ARMTargetLowering::isExtractSubvectorCheap — confirm upstream.
// A subvector extract is cheap only at the low or high half boundary.
21118                                         unsigned Index) const {
21120    return false;
21121
21122  return (Index == 0 || Index == ResVT.getVectorNumElements());
21123}
21124
// NOTE(review): the first line of the declaration (source line 21125) is
// missing from this extract; this appears to be ARMTargetLowering::makeDMB,
// which emits a data memory barrier for the given domain.
21126                                        ARM_MB::MemBOpt Domain) const {
21127  // First, if the target has no DMB, see what fallback we can use.
21128  if (!Subtarget->hasDataBarrier()) {
21129    // Some ARMv6 cpus can support data barriers with an mcr instruction.
21130    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21131    // here.
21132    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
// mcr p15, 0, r0, c7, c10, 5 — the ARMv6 CP15 "drain write buffer" barrier.
21133      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21134                        Builder.getInt32(0), Builder.getInt32(7),
21135                        Builder.getInt32(10), Builder.getInt32(5)};
21136      return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21137    } else {
21138      // Instead of using barriers, atomic accesses on these subtargets use
21139      // libcalls.
21140      llvm_unreachable("makeDMB on a target so old that it has no barriers");
21141    }
21142  } else {
21143    // Only a full system barrier exists in the M-class architectures.
21144    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21145    Constant *CDomain = Builder.getInt32(Domain);
21146    return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21147  }
21148}
21149
21150// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
// NOTE(review): the first line of the declaration (source line 21151) is
// missing, and the `case` labels of this switch (source lines 21155-56,
// 21158-59, 21161, 21165-66) were dropped by the extraction — each
// llvm_unreachable/return below belongs to missing AtomicOrdering case
// labels. Confirm the exact case structure against upstream before editing.
21152                                                 Instruction *Inst,
21153                                                 AtomicOrdering Ord) const {
21154  switch (Ord) {
21157    llvm_unreachable("Invalid fence: unordered/non-atomic");
21160    return nullptr; // Nothing to do
21162    if (!Inst->hasAtomicStore())
21163      return nullptr; // Nothing to do
21164    [[fallthrough]];
// ISHST (store-store) barrier when the subtarget prefers it; otherwise a full
// inner-shareable barrier.
21167    if (Subtarget->preferISHSTBarriers())
21168      return makeDMB(Builder, ARM_MB::ISHST);
21169    // FIXME: add a comment with a link to documentation justifying this.
21170    else
21171      return makeDMB(Builder, ARM_MB::ISH);
21172  }
21173  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21174}
21175
// NOTE(review): the first line of the declaration (source line 21176) is
// missing, and the `case` labels of this switch (source lines 21180-81,
// 21183-84, 21186-88) were dropped by the extraction. This appears to be
// ARMTargetLowering::emitTrailingFence — confirm structure against upstream.
21177                                                  Instruction *Inst,
21178                                                  AtomicOrdering Ord) const {
21179  switch (Ord) {
21182    llvm_unreachable("Invalid fence: unordered/not-atomic");
21185    return nullptr; // Nothing to do
// Orderings that require a trailing barrier get a full inner-shareable DMB.
21189    return makeDMB(Builder, ARM_MB::ISH);
21190  }
21191  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21192}
21193
21194// Loads and stores less than 64-bits are already atomic; ones above that
21195// are doomed anyway, so defer to the default libcall and blame the OS when
21196// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21197// anything for those.
// NOTE(review): the declaration (source lines 21198-99) is missing from this
// extract; this appears to be
// ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI).
21200  bool has64BitAtomicStore;
// 64-bit exclusive stores (strexd) exist on non-M v6+ ARM and v7+ Thumb only.
21201  if (Subtarget->isMClass())
21202    has64BitAtomicStore = false;
21203  else if (Subtarget->isThumb())
21204    has64BitAtomicStore = Subtarget->hasV7Ops();
21205  else
21206    has64BitAtomicStore = Subtarget->hasV6Ops();
21207
21208  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
// NOTE(review): the `:` alternative of this ternary (source line 21210,
// presumably AtomicExpansionKind::None) is missing from this extract.
21209  return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21211}
21212
21213// Loads and stores less than 64-bits are already atomic; ones above that
21214// are doomed anyway, so defer to the default libcall and blame the OS when
21215// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21216// anything for those.
21217// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21218// guarantee, see DDI0406C ARM architecture reference manual,
21219// sections A8.8.72-74 LDRD)
// NOTE(review): the declaration (source lines 21220-21) is missing from this
// extract; this appears to be
// ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI).
21222  bool has64BitAtomicLoad;
// 64-bit exclusive loads (ldrexd) exist on non-M v6+ ARM and v7+ Thumb only.
21223  if (Subtarget->isMClass())
21224    has64BitAtomicLoad = false;
21225  else if (Subtarget->isThumb())
21226    has64BitAtomicLoad = Subtarget->hasV7Ops();
21227  else
21228    has64BitAtomicLoad = Subtarget->hasV6Ops();
21229
21230  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
// NOTE(review): the `:` alternative of this ternary (source line 21232,
// presumably AtomicExpansionKind::None) is missing from this extract.
21231  return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21233}
21234
21235// For the real atomic operations, we have ldrex/strex up to 32 bits,
21236// and up to 64 bits on the non-M profiles
// NOTE(review): the declaration (source lines 21237-38) and several return
// statements (source lines 21240, 21257-58, 21260) are missing from this
// extract; this appears to be ARMTargetLowering::shouldExpandAtomicRMWInIR.
// Confirm the missing expansion-kind returns against upstream.
21239  if (AI->isFloatingPointOperation())
21241
21242  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21243  bool hasAtomicRMW;
// ldrex/strex availability: v8-M baseline on M-class, v7 Thumb, v6 ARM.
21244  if (Subtarget->isMClass())
21245    hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21246  else if (Subtarget->isThumb())
21247    hasAtomicRMW = Subtarget->hasV7Ops();
21248  else
21249    hasAtomicRMW = Subtarget->hasV6Ops();
21250  if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21251    // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21252    // implement atomicrmw without spilling. If the target address is also on
21253    // the stack and close enough to the spill slot, this can lead to a
21254    // situation where the monitor always gets cleared and the atomic operation
21255    // can never succeed. So at -O0 lower this operation to a CAS loop.
21256    if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21259  }
21261}
21262
21263// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21264// bits, and up to 64 bits on the non-M profiles.
// NOTE(review): the first declaration lines (source 21265-66) and the return
// statements (source lines 21283-84) are missing from this extract; this
// appears to be ARMTargetLowering::shouldExpandAtomicCmpXchgInIR — confirm
// the missing expansion-kind returns against upstream.
21267    const AtomicCmpXchgInst *AI) const {
21268  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21269  // implement cmpxchg without spilling. If the address being exchanged is also
21270  // on the stack and close enough to the spill slot, this can lead to a
21271  // situation where the monitor always gets cleared and the atomic operation
21272  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21273  unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21274  bool HasAtomicCmpXchg;
// Same availability matrix as atomicrmw: v8-M baseline / v7 Thumb / v6 ARM.
21275  if (Subtarget->isMClass())
21276    HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21277  else if (Subtarget->isThumb())
21278    HasAtomicCmpXchg = Subtarget->hasV7Ops();
21279  else
21280    HasAtomicCmpXchg = Subtarget->hasV6Ops();
21281  if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21282      HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21285}
21286
// NOTE(review): the first declaration line (source 21287) is missing from
// this extract; this appears to be
// ARMTargetLowering::shouldInsertFencesForAtomic. It simply reflects the
// subtarget-derived InsertFencesForAtomic flag.
21288                                                  const Instruction *I) const {
21289  return InsertFencesForAtomic;
21290}
21291
// NOTE(review): the declaration line (source 21292) is missing from this
// extract; based on the body this is a stack-guard-related predicate
// (presumably useLoadStackGuardNode) that is disabled under ROPI/RWPI —
// confirm against upstream.
21293  // ROPI/RWPI are not supported currently.
21294  return !Subtarget->isROPI() && !Subtarget->isRWPI();
21295}
21296
// NOTE(review): the first declaration line (source 21297) is missing from
// this extract; this appears to be
// ARMTargetLowering::insertSSPDeclarations(Module&, const LibcallLoweringInfo&).
// Declares the MSVC-style stack-protector cookie global and checker function
// when the libcall configuration provides them.
21298    Module &M, const LibcallLoweringInfo &Libcalls) const {
21299  // MSVC CRT provides functionalities for stack protection.
21300  RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21301      Libcalls.getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21302
21303  RTLIB::LibcallImpl SecurityCookieVar =
21304      Libcalls.getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21305  if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21306      SecurityCookieVar != RTLIB::Unsupported) {
21307    // MSVC CRT has a global variable holding security cookie.
21308    M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21309                        PointerType::getUnqual(M.getContext()));
21310
21311    // MSVC CRT has a function to validate security cookie.
21312    FunctionCallee SecurityCheckCookie =
21313        M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21314                              Type::getVoidTy(M.getContext()),
21315                              PointerType::getUnqual(M.getContext()));
// The cookie argument is passed in a register per the MSVC convention.
21316    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21317      F->addParamAttr(0, Attribute::AttrKind::InReg);
21318  }
21319
// NOTE(review): a line is missing from this extract here (source line 21320);
// it presumably delegates to TargetLowering::insertSSPDeclarations — confirm
// against upstream.
21321}
21322
// NOTE(review): the first declaration line (source 21323) is missing from
// this extract; this appears to be
// ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
// unsigned &Cost). Returns true (Cost = 0) when a store of an extracted
// element can be folded into a single vector store lane operation.
21324                                                   unsigned &Cost) const {
21325  // If we do not have NEON, vector types are not natively supported.
21326  if (!Subtarget->hasNEON())
21327    return false;
21328
21329  // Floating point values and vector values map to the same register file.
21330  // Therefore, although we could do a store extract of a vector type, this is
21331  // better to leave at float as we have more freedom in the addressing mode for
21332  // those.
21333  if (VectorTy->isFPOrFPVectorTy())
21334    return false;
21335
21336  // If the index is unknown at compile time, this is very expensive to lower
21337  // and it is not possible to combine the store with the extract.
21338  if (!isa<ConstantInt>(Idx))
21339    return false;
21340
21341  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21342  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21343  // We can do a store + vector extract on any vector that fits perfectly in a D
21344  // or Q register.
21345  if (BitWidth == 64 || BitWidth == 128) {
21346    Cost = 0;
21347    return true;
21348  }
21349  return false;
21350}
21351
// NOTE(review): the first declaration line (source 21352) is missing from
// this extract; this appears to be
// ARMTargetLowering::canCreateUndefOrPoisonForTargetNode.
21353    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21354    bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21355  unsigned Opcode = Op.getOpcode();
21356  switch (Opcode) {
// These immediate-operand logical ops never introduce undef/poison.
21357  case ARMISD::VORRIMM:
21358  case ARMISD::VBICIMM:
21359    return false;
21360  }
// NOTE(review): a line is missing from this extract here (source line 21361);
// it presumably begins the delegating call to the TargetLowering base
// implementation — confirm against upstream.
21362      Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21363}
21364
// NOTE(review): the declaration line (source 21365) is missing from this
// extract; the body matches a speculation-cost predicate (presumably
// isCheapToSpeculateCttz) — confirm against upstream. True when CLZ-capable
// (v5T+) and not Thumb1-only.
21366  return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21367}
21368
// NOTE(review): the declaration line (source 21369) is missing from this
// extract; the body matches a speculation-cost predicate (presumably
// isCheapToSpeculateCtlz) — confirm against upstream. True when CLZ-capable
// (v5T+) and not Thumb1-only.
21370  return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21371}
21372
// NOTE(review): the first declaration line (source 21373) and the line
// declaring `Mask` (source 21380, presumably a dyn_cast<ConstantInt> of the
// and-mask operand) are missing from this extract; this appears to be
// ARMTargetLowering::isMaskAndCmp0FoldingBeneficial — confirm upstream.
21374                                                       const Instruction &AndI) const {
21375  if (!Subtarget->hasV7Ops())
21376    return false;
21377
21378  // Sink the `and` instruction only if the mask would fit into a modified
21379  // immediate operand.
21381  if (!Mask || Mask->getValue().getBitWidth() > 32u)
21382    return false;
// A result of -1 from the encoders means "not encodable as a modified
// immediate" for the respective (Thumb2 vs ARM) encoding.
21383  auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21384  return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21385                                : ARM_AM::getSOImmVal(MaskVal)) != -1;
21386}
21387
// NOTE(review): the declaration lines (source 21388-89) and the two return
// statements (source 21392-93) are missing from this extract; this appears
// to be ARMTargetLowering::preferredShiftLegalizationStrategy, which prefers
// a libcall under minsize (except on Windows) and otherwise defers to the
// base-class default — confirm against upstream.
21390    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21391  if (Subtarget->hasMinSize() && !getTM().getTargetTriple().isOSWindows())
21394                                                            ExpansionFactor);
21395}
21396
// NOTE(review): the first declaration line (source 21397) is missing from
// this extract; this appears to be ARMTargetLowering::emitLoadLinked(
// IRBuilderBase&, Type *ValueTy, Value *Addr, AtomicOrdering).
// Emits an exclusive load (ldrex/ldaex, or ldrexd/ldaexd for 64 bits).
21398                                                Value *Addr,
21399                                                AtomicOrdering Ord) const {
21400  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21401  bool IsAcquire = isAcquireOrStronger(Ord);
21402
21403  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21404  // intrinsic must return {i32, i32} and we have to recombine them into a
21405  // single i64 here.
21406  if (ValueTy->getPrimitiveSizeInBits() == 64) {
// NOTE(review): a line is missing from this extract here (source line 21407);
// it presumably begins the `Intrinsic::ID Int =` declaration completed below.
21408        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21409
21410    Value *LoHi =
21411        Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21412
21413    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21414    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
// The {lo, hi} halves swap roles on big-endian subtargets.
21415    if (!Subtarget->isLittle())
21416      std::swap (Lo, Hi);
21417    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21418    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21419    return Builder.CreateOr(
21420        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21421  }
21422
21423  Type *Tys[] = { Addr->getType() };
21424  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21425  CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21426
// Record the accessed type on the pointer argument via elementtype, as
// required by the ldrex/ldaex intrinsics.
21427  CI->addParamAttr(
21428      0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21429  return Builder.CreateTruncOrBitCast(CI, ValueTy);
21430}
21431
// NOTE(review): the first declaration line (source 21432) is missing from
// this extract; this appears to be
// ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance, which clears the
// exclusive monitor (clrex) on the no-store path of a cmpxchg expansion.
21433    IRBuilderBase &Builder) const {
// clrex only exists from v7 onwards; older targets have nothing to emit.
21434  if (!Subtarget->hasV7Ops())
21435    return;
21436  Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21437}
21438
// NOTE(review): the first declaration line (source 21439) is missing from
// this extract; this appears to be ARMTargetLowering::emitStoreConditional(
// IRBuilderBase&, Value *Val, Value *Addr, AtomicOrdering).
// Emits an exclusive store (strex/stlex, or strexd/stlexd for 64 bits).
21440                                                      Value *Val, Value *Addr,
21441                                                      AtomicOrdering Ord) const {
21442  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21443  bool IsRelease = isReleaseOrStronger(Ord);
21444
21445  // Since the intrinsics must have legal type, the i64 intrinsics take two
21446  // parameters: "i32, i32". We must marshal Val into the appropriate form
21447  // before the call.
21448  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
// NOTE(review): a line is missing from this extract here (source line 21449);
// it presumably begins the `Intrinsic::ID Int =` declaration completed below.
21450        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21451    Type *Int32Ty = Type::getInt32Ty(M->getContext());
21452
21453    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21454    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
// The {lo, hi} halves swap roles on big-endian subtargets.
21455    if (!Subtarget->isLittle())
21456      std::swap(Lo, Hi);
21457    return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21458  }
21459
21460  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21461  Type *Tys[] = { Addr->getType() };
// NOTE(review): a line is missing from this extract here (source line 21462);
// it presumably declares `Strex` as the intrinsic declaration for `Int` —
// confirm against upstream.
21463
21464  CallInst *CI = Builder.CreateCall(
21465      Strex, {Builder.CreateZExtOrBitCast(
21466                  Val, Strex->getFunctionType()->getParamType(0)),
21467              Addr});
// Record the stored type on the pointer argument via elementtype, as required
// by the strex/stlex intrinsics.
21468  CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21469                                     Val->getType()));
21470  return CI;
21471}
21472
21473
// NOTE(review): the declaration line (source 21474) is missing from this
// extract; the body is a simple M-class predicate (presumably
// alignLoopsWithOptSize) — confirm against upstream.
21475  return Subtarget->isMClass();
21476}
21477
21478/// A helper function for determining the number of interleaved accesses we
21479/// will generate when lowering accesses of the given type.
21480unsigned
// NOTE(review): the line naming this function (source 21481) is missing from
// this extract; per the doc comment this is
// ARMTargetLowering::getNumInterleavedAccesses(VectorType*, const DataLayout&).
21482                                             const DataLayout &DL) const {
// Ceiling division by 128: one access per 128-bit (Q-register-sized) chunk.
21483  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21484}
21485
// NOTE(review): the first declaration line (source 21486) is missing from
// this extract; this appears to be
// ARMTargetLowering::isLegalInterleavedAccessType. Returns true when an
// interleaved access of `Factor` x `VecTy` can be lowered to vldN/vstN (NEON)
// or vld2q/vld4q (MVE), possibly split into multiple accesses.
21487    unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21488    const DataLayout &DL) const {
21489
21490  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21491  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21492
21493  if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21494    return false;
21495
21496  // Ensure the vector doesn't have f16 elements. Even though we could do an
21497  // i16 vldN, we can't hold the f16 vectors and will end up converting via
21498  // f32.
21499  if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21500    return false;
// MVE has no 3-way interleaving instructions (only vld2q/vld4q).
21501  if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21502    return false;
21503
21504  // Ensure the number of vector elements is greater than 1.
21505  if (VecTy->getNumElements() < 2)
21506    return false;
21507
21508  // Ensure the element type is legal.
21509  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21510    return false;
21511  // And the alignment if high enough under MVE.
21512  if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21513    return false;
21514
21515  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21516  // 128 will be split into multiple interleaved accesses.
21517  if (Subtarget->hasNEON() && VecSize == 64)
21518    return true;
21519  return VecSize % 128 == 0;
21520}
21521
// NOTE(review): the declaration line (source 21522) and the MVE/default
// return lines (source 21526-27) are missing from this extract; this appears
// to be ARMTargetLowering::getMaxSupportedInterleaveFactor — confirm the
// missing returns against upstream.
21523  if (Subtarget->hasNEON())
21524    return 4;
21525  if (Subtarget->hasMVEIntegerOps())
21528}
21529
21530/// Lower an interleaved load into a vldN intrinsic.
21531///
21532/// E.g. Lower an interleaved load (Factor = 2):
21533///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21534///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
21535///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
21536///
21537///      Into:
21538///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21539///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21540///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
// NOTE(review): the line naming this function (source 21541) is missing from
// this extract; per the doc comment this is
// ARMTargetLowering::lowerInterleavedLoad.
21542    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21543    ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21544  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21545         "Invalid interleave factor");
21546  assert(!Shuffles.empty() && "Empty shufflevector input");
21547  assert(Shuffles.size() == Indices.size() &&
21548         "Unmatched number of shufflevectors and indices");
21549
// Only plain (non-masked, non-vp) loads are handled here.
21550  auto *LI = dyn_cast<LoadInst>(Load);
21551  if (!LI)
21552    return false;
21553  assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21554
21555  auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21556  Type *EltTy = VecTy->getElementType();
21557
21558  const DataLayout &DL = LI->getDataLayout();
21559  Align Alignment = LI->getAlign();
21560
21561  // Skip if we do not have NEON and skip illegal vector types. We can
21562  // "legalize" wide vector types into multiple interleaved accesses as long as
21563  // the vector types are divisible by 128.
21564  if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21565    return false;
21566
21567  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21568
21569  // A pointer vector can not be the return type of the ldN intrinsics. Need to
21570  // load integer vectors first and then convert to pointer vectors.
21571  if (EltTy->isPointerTy())
21572    VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21573
21574  IRBuilder<> Builder(LI);
21575
21576  // The base address of the load.
21577  Value *BaseAddr = LI->getPointerOperand();
21578
21579  if (NumLoads > 1) {
21580    // If we're going to generate more than one load, reset the sub-vector type
21581    // to something legal.
21582    VecTy = FixedVectorType::get(VecTy->getElementType(),
21583                                 VecTy->getNumElements() / NumLoads);
21584  }
21585
21586  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21587
// Emits one NEON vldN (with explicit alignment operand) or one MVE vld2q/
// vld4q for the given base address.
21588  auto createLoadIntrinsic = [&](Value *BaseAddr) {
21589    if (Subtarget->hasNEON()) {
21590      Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21591      Type *Tys[] = {VecTy, PtrTy};
21592      static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21593                                                Intrinsic::arm_neon_vld3,
21594                                                Intrinsic::arm_neon_vld4};
21595
// NOTE(review): a line is missing from this extract here (source line 21596);
// it presumably declares the `Ops` container used below — confirm upstream.
21597      Ops.push_back(BaseAddr);
21598      Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21599
21600      return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21601                                     /*FMFSource=*/nullptr, "vldN");
21602    } else {
21603      assert((Factor == 2 || Factor == 4) &&
21604             "expected interleave factor of 2 or 4 for MVE");
21605      Intrinsic::ID LoadInts =
21606          Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21607      Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21608      Type *Tys[] = {VecTy, PtrTy};
21609
// NOTE(review): a line is missing from this extract here (source line 21610);
// it presumably declares the `Ops` container used below — confirm upstream.
21611      Ops.push_back(BaseAddr);
21612      return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21613                                     "vldN");
21614    }
21615  };
21616
21617  // Holds sub-vectors extracted from the load intrinsic return values. The
21618  // sub-vectors are associated with the shufflevector instructions they will
21619  // replace.
// NOTE(review): a line is missing from this extract here (source line 21620);
// it presumably declares the `SubVecs` map used below — confirm upstream.
21621
21622  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21623    // If we're generating more than one load, compute the base address of
21624    // subsequent loads as an offset from the previous.
21625    if (LoadCount > 0)
21626      BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21627                                            VecTy->getNumElements() * Factor);
21628
21629    CallInst *VldN = createLoadIntrinsic(BaseAddr);
21630
21631    // Replace uses of each shufflevector with the corresponding vector loaded
21632    // by ldN.
21633    for (unsigned i = 0; i < Shuffles.size(); i++) {
21634      ShuffleVectorInst *SV = Shuffles[i];
21635      unsigned Index = Indices[i];
21636
21637      Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21638
21639      // Convert the integer vector to pointer vector if the element is pointer.
21640      if (EltTy->isPointerTy())
21641        SubVec = Builder.CreateIntToPtr(
21642            SubVec,
// NOTE(review): a line is missing from this extract here (source line 21643);
// it presumably constructs the pointer-element FixedVectorType argument.
21644
21645      SubVecs[SV].push_back(SubVec);
21646    }
21647  }
21648
21649  // Replace uses of the shufflevector instructions with the sub-vectors
21650  // returned by the load intrinsic. If a shufflevector instruction is
21651  // associated with more than one sub-vector, those sub-vectors will be
21652  // concatenated into a single wide vector.
21653  for (ShuffleVectorInst *SVI : Shuffles) {
21654    auto &SubVec = SubVecs[SVI];
21655    auto *WideVec =
21656        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21657    SVI->replaceAllUsesWith(WideVec);
21658  }
21659
21660  return true;
21661}
21662
21663/// Lower an interleaved store into a vstN intrinsic.
21664///
21665/// E.g. Lower an interleaved store (Factor = 3):
21666/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21667/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21668/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21669///
21670/// Into:
21671/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21672/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21673/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21674/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21675///
21676/// Note that the new shufflevectors will be removed and we'll only generate one
21677/// vst3 instruction in CodeGen.
21678///
21679/// Example for a more general valid mask (Factor 3). Lower:
21680/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21681/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21682/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21683///
21684/// Into:
21685/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21686/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21687/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21688/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21690 Value *LaneMask,
21691 ShuffleVectorInst *SVI,
21692 unsigned Factor,
21693 const APInt &GapMask) const {
21694 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21695 "Invalid interleave factor");
21696 auto *SI = dyn_cast<StoreInst>(Store);
21697 if (!SI)
21698 return false;
21699 assert(!LaneMask && GapMask.popcount() == Factor &&
21700 "Unexpected mask on store");
21701
21702 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21703 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21704
21705 unsigned LaneLen = VecTy->getNumElements() / Factor;
21706 Type *EltTy = VecTy->getElementType();
21707 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21708
21709 const DataLayout &DL = SI->getDataLayout();
21710 Align Alignment = SI->getAlign();
21711
21712 // Skip if we do not have NEON and skip illegal vector types. We can
21713 // "legalize" wide vector types into multiple interleaved accesses as long as
21714 // the vector types are divisible by 128.
21715 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21716 return false;
21717
21718 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21719
21720 Value *Op0 = SVI->getOperand(0);
21721 Value *Op1 = SVI->getOperand(1);
21722 IRBuilder<> Builder(SI);
21723
21724 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21725 // vectors to integer vectors.
21726 if (EltTy->isPointerTy()) {
21727 Type *IntTy = DL.getIntPtrType(EltTy);
21728
21729 // Convert to the corresponding integer vector.
21730 auto *IntVecTy =
21732 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21733 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21734
21735 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21736 }
21737
21738 // The base address of the store.
21739 Value *BaseAddr = SI->getPointerOperand();
21740
21741 if (NumStores > 1) {
21742 // If we're going to generate more than one store, reset the lane length
21743 // and sub-vector type to something legal.
21744 LaneLen /= NumStores;
21745 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21746 }
21747
21748 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21749
21750 auto Mask = SVI->getShuffleMask();
21751
21752 auto createStoreIntrinsic = [&](Value *BaseAddr,
21753 SmallVectorImpl<Value *> &Shuffles) {
21754 if (Subtarget->hasNEON()) {
21755 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21756 Intrinsic::arm_neon_vst3,
21757 Intrinsic::arm_neon_vst4};
21758 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21759 Type *Tys[] = {PtrTy, SubVecTy};
21760
21762 Ops.push_back(BaseAddr);
21763 append_range(Ops, Shuffles);
21764 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21765 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21766 } else {
21767 assert((Factor == 2 || Factor == 4) &&
21768 "expected interleave factor of 2 or 4 for MVE");
21769 Intrinsic::ID StoreInts =
21770 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21771 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21772 Type *Tys[] = {PtrTy, SubVecTy};
21773
21775 Ops.push_back(BaseAddr);
21776 append_range(Ops, Shuffles);
21777 for (unsigned F = 0; F < Factor; F++) {
21778 Ops.push_back(Builder.getInt32(F));
21779 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21780 Ops.pop_back();
21781 }
21782 }
21783 };
21784
21785 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21786 // If we generating more than one store, we compute the base address of
21787 // subsequent stores as an offset from the previous.
21788 if (StoreCount > 0)
21789 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21790 BaseAddr, LaneLen * Factor);
21791
21792 SmallVector<Value *, 4> Shuffles;
21793
21794 // Split the shufflevector operands into sub vectors for the new vstN call.
21795 for (unsigned i = 0; i < Factor; i++) {
21796 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21797 if (Mask[IdxI] >= 0) {
21798 Shuffles.push_back(Builder.CreateShuffleVector(
21799 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21800 } else {
21801 unsigned StartMask = 0;
21802 for (unsigned j = 1; j < LaneLen; j++) {
21803 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21804 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21805 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21806 break;
21807 }
21808 }
21809 // Note: If all elements in a chunk are undefs, StartMask=0!
21810 // Note: Filling undef gaps with random elements is ok, since
21811 // those elements were being written anyway (with undefs).
21812 // In the case of all undefs we're defaulting to using elems from 0
21813 // Note: StartMask cannot be negative, it's checked in
21814 // isReInterleaveMask
21815 Shuffles.push_back(Builder.CreateShuffleVector(
21816 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21817 }
21818 }
21819
21820 createStoreIntrinsic(BaseAddr, Shuffles);
21821 }
21822 return true;
21823}
21824
21832
21834 uint64_t &Members) {
21835 if (auto *ST = dyn_cast<StructType>(Ty)) {
21836 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21837 uint64_t SubMembers = 0;
21838 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21839 return false;
21840 Members += SubMembers;
21841 }
21842 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21843 uint64_t SubMembers = 0;
21844 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21845 return false;
21846 Members += SubMembers * AT->getNumElements();
21847 } else if (Ty->isFloatTy()) {
21848 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21849 return false;
21850 Members = 1;
21851 Base = HA_FLOAT;
21852 } else if (Ty->isDoubleTy()) {
21853 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21854 return false;
21855 Members = 1;
21856 Base = HA_DOUBLE;
21857 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21858 Members = 1;
21859 switch (Base) {
21860 case HA_FLOAT:
21861 case HA_DOUBLE:
21862 return false;
21863 case HA_VECT64:
21864 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21865 case HA_VECT128:
21866 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21867 case HA_UNKNOWN:
21868 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21869 case 64:
21870 Base = HA_VECT64;
21871 return true;
21872 case 128:
21873 Base = HA_VECT128;
21874 return true;
21875 default:
21876 return false;
21877 }
21878 }
21879 }
21880
21881 return (Members > 0 && Members <= 4);
21882}
21883
21884/// Return the correct alignment for the current calling convention.
21886 Type *ArgTy, const DataLayout &DL) const {
21887 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21888 if (!ArgTy->isVectorTy())
21889 return ABITypeAlign;
21890
21891 // Avoid over-aligning vector parameters. It would require realigning the
21892 // stack and waste space for no real benefit.
21893 MaybeAlign StackAlign = DL.getStackAlignment();
21894 assert(StackAlign && "data layout string is missing stack alignment");
21895 return std::min(ABITypeAlign, *StackAlign);
21896}
21897
21898/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21899/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21900/// passing according to AAPCS rules.
21902 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21903 const DataLayout &DL) const {
21904 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21906 return false;
21907
21909 uint64_t Members = 0;
21910 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21911 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21912
21913 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21914 return IsHA || IsIntArray;
21915}
21916
21918 const Constant *PersonalityFn) const {
21919 // Platforms which do not use SjLj EH may return values in these registers
21920 // via the personality function.
21922 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21923}
21924
21926 const Constant *PersonalityFn) const {
21927 // Platforms which do not use SjLj EH may return values in these registers
21928 // via the personality function.
21930 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21931}
21932
21933void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21934 // Update IsSplitCSR in ARMFunctionInfo.
21935 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21936 AFI->setIsSplitCSR(true);
21937}
21938
21939void ARMTargetLowering::insertCopiesSplitCSR(
21940 MachineBasicBlock *Entry,
21941 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21942 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21943 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21944 if (!IStart)
21945 return;
21946
21947 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21948 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21949 MachineBasicBlock::iterator MBBI = Entry->begin();
21950 for (const MCPhysReg *I = IStart; *I; ++I) {
21951 const TargetRegisterClass *RC = nullptr;
21952 if (ARM::GPRRegClass.contains(*I))
21953 RC = &ARM::GPRRegClass;
21954 else if (ARM::DPRRegClass.contains(*I))
21955 RC = &ARM::DPRRegClass;
21956 else
21957 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21958
21959 Register NewVR = MRI->createVirtualRegister(RC);
21960 // Create copy from CSR to a virtual register.
21961 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21962 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21963 // nounwind. If we want to generalize this later, we may need to emit
21964 // CFI pseudo-instructions.
21965 assert(Entry->getParent()->getFunction().hasFnAttribute(
21966 Attribute::NoUnwind) &&
21967 "Function should be nounwind in insertCopiesSplitCSR!");
21968 Entry->addLiveIn(*I);
21969 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21970 .addReg(*I);
21971
21972 // Insert the copy-back instructions right before the terminator.
21973 for (auto *Exit : Exits)
21974 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21975 TII->get(TargetOpcode::COPY), *I)
21976 .addReg(NewVR);
21977 }
21978}
21979
21984
21986 return Subtarget->hasMVEIntegerOps();
21987}
21988
21991 auto *VTy = dyn_cast<FixedVectorType>(Ty);
21992 if (!VTy)
21993 return false;
21994
21995 auto *ScalarTy = VTy->getScalarType();
21996 unsigned NumElements = VTy->getNumElements();
21997
21998 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
21999 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22000 return false;
22001
22002 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22003 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22004 return Subtarget->hasMVEFloatOps();
22005
22007 return false;
22008
22009 return Subtarget->hasMVEIntegerOps() &&
22010 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22011 ScalarTy->isIntegerTy(32));
22012}
22013
22015 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
22016 return RCRegs;
22017}
22018
22021 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22022 Value *Accumulator) const {
22023
22025
22026 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22027
22028 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22029
22030 if (TyWidth > 128) {
22031 int Stride = Ty->getNumElements() / 2;
22032 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22033 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22034 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22035 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22036
22037 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22038 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22039 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22040 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22041 Value *LowerSplitAcc = nullptr;
22042 Value *UpperSplitAcc = nullptr;
22043
22044 if (Accumulator) {
22045 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22046 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22047 }
22048
22049 auto *LowerSplitInt = createComplexDeinterleavingIR(
22050 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22051 auto *UpperSplitInt = createComplexDeinterleavingIR(
22052 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22053
22054 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22055 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22056 }
22057
22058 auto *IntTy = Type::getInt32Ty(B.getContext());
22059
22060 ConstantInt *ConstRotation = nullptr;
22061 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22062 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22063
22064 if (Accumulator)
22065 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22066 {ConstRotation, Accumulator, InputB, InputA});
22067 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22068 {ConstRotation, InputB, InputA});
22069 }
22070
22071 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22072 // 1 means the value is not halved.
22073 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22074
22076 ConstRotation = ConstantInt::get(IntTy, 0);
22078 ConstRotation = ConstantInt::get(IntTy, 1);
22079
22080 if (!ConstRotation)
22081 return nullptr; // Invalid rotation for arm_mve_vcaddq
22082
22083 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22084 {ConstHalving, ConstRotation, InputA, InputB});
22085 }
22086
22087 return nullptr;
22088}
unsigned const MachineRegisterInfo * MRI
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool isNegatedInteger(SDValue Op)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformORCombineToShiftInsert(SelectionDAG &DAG, SDValue AndOp, SDValue ShiftOp, EVT VT, SDLoc dl)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
This file provides a TargetTransformInfoImplBase-conforming object specific to the ARM target machine.
Function Alias Analysis false
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, dxil::ResourceTypeInfo &RTI)
static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI, dxil::ResourceTypeInfo &RTI)
This file defines the DenseMap class.
#define Check(C,...)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
nvptx lower args
uint64_t High
uint64_t IntrinsicInst * II
PowerPC Reduce CR logical Operation
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
SI Lower i1 Copies
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static bool isIntrinsic(const CallBase &Call, Intrinsic::ID ID)
The Input class is used to parse a yaml document into in-memory structs and vectors.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
LLVM_ABI bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5917
APInt bitcastToAPInt() const
Definition APFloat.h:1404
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1383
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1044
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1208
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1118
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1613
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1776
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:865
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:858
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1671
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
const ARMBaseRegisterInfo & getRegisterInfo() const
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:904
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:720
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:282
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
MachineConstantPoolValue * getMachineCPVal() const
const Constant * getConstVal() const
LLVM_ABI Type * getType() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:214
bool isBigEndian() const
Definition DataLayout.h:215
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:244
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:302
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:123
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
arg_iterator arg_begin()
Definition Function.h:868
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:695
const Argument * const_arg_iterator
Definition Function.h:74
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:229
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:729
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
Tracks which library functions to use for a particular subtarget.
LLVM_ABI CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const
Get the CallingConv that should be used for the specified libcall.
LLVM_ABI RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Return the lowering's selection of implementation call for Call.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
const LibcallLoweringInfo & getLibcalls() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:124
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:137
const unsigned char * bytes_begin() const
Definition StringRef.h:121
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
virtual void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::LibcallImpl LibcallImpl, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
void setTypeIdForCallsiteInfo(const CallBase *CB, MachineFunction &MF, MachineFunction::CallSiteInfo &CSInfo) const
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:446
bool isOSWindows() const
Tests whether the OS is Windows.
Definition Triple.h:709
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS)
@ SBREL
Section Base Relative
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering)
const unsigned FPReservedBits
const unsigned RoundingBitsPos
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:511
@ SET_FPENV
Sets the current floating-point environment.
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:168
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:538
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ RESET_FPENV
Set floating-point environment to default state.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:172
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:438
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:975
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:715
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ BR
Control flow instructions. These all have token chains.
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:827
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:792
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:970
@ STRICT_FP_TO_FP16
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ STRICT_FP16_TO_FP
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:139
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:464
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:735
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:710
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:427
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:458
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:162
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:722
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Define
Register definition.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:293
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1530
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if a unsigned integer is an N bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
EVT changeVectorElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:317
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:178
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:327
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:186
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:138
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...