LLVM 23.0.0git
ARMISelLowering.cpp
Go to the documentation of this file.
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
122STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
123STATISTIC(NumConstpoolPromoted,
124 "Number of constants with their storage promoted into constant pools");
125
126static cl::opt<bool>
127ARMInterworking("arm-interworking", cl::Hidden,
128 cl::desc("Enable / disable ARM interworking (for debugging only)"),
129 cl::init(true));
130
132 "arm-promote-constant", cl::Hidden,
133 cl::desc("Enable / disable promotion of unnamed_addr constants into "
134 "constant pools"),
135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137 "arm-promote-constant-max-size", cl::Hidden,
138 cl::desc("Maximum size of constant to promote into a constant pool"),
139 cl::init(64));
141 "arm-promote-constant-max-total", cl::Hidden,
142 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143 cl::init(128));
144
146MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148 cl::init(2));
149
151 "arm-max-base-updates-to-check", cl::Hidden,
152 cl::desc("Maximum number of base-updates to check generating postindex."),
153 cl::init(64));
154
155/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
156constexpr MVT FlagsVT = MVT::i32;
157
158// The APCS parameter registers.
159static const MCPhysReg GPRArgRegs[] = {
160 ARM::R0, ARM::R1, ARM::R2, ARM::R3
161};
162
164 SelectionDAG &DAG, const SDLoc &DL) {
166 assert(Arg.ArgVT.bitsLT(MVT::i32));
167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
168 SDValue Ext =
170 MVT::i32, Trunc);
171 return Ext;
172}
173
174void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
// Configure operation legality for one NEON vector type VT. When VT's
// load/store form differs from its in-register form, loads and stores are
// promoted to PromotedLdStVT so the memory access uses a single legal type.
175  if (VT != PromotedLdStVT) {
177    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
178
180    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
181  }
182
// Element-type-dependent configuration: i32-element vectors take one path,
// narrower element types the other.
183  MVT ElemTy = VT.getVectorElementType();
184  if (ElemTy != MVT::f64)
188  if (ElemTy == MVT::i32) {
193  } else {
198  }
207  if (VT.isInteger()) {
211  }
212
213  // Neon does not support vector divide/remainder operations.
222
// Integer abs / absolute-difference / min / max are natively supported,
// except for the 64-bit-element vector types.
223  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
224    for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
226      setOperationAction(Opcode, VT, Legal);
// Saturating add/subtract are legal for every NEON integer vector type.
227  if (!VT.isFloatingPoint())
228    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
229      setOperationAction(Opcode, VT, Legal);
230}
231
232void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
233 addRegisterClass(VT, &ARM::DPRRegClass);
234 addTypeForNEON(VT, MVT::f64);
235}
236
237void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
238 addRegisterClass(VT, &ARM::DPairRegClass);
239 addTypeForNEON(VT, MVT::v2f64);
240}
241
242void ARMTargetLowering::setAllExpand(MVT VT) {
// Start from "everything Expands" for VT by walking every
// target-independent opcode, then re-enable a small set of trivially
// supportable operations below.
243  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
245
246  // We support these really simple operations even on types where all
247  // the actual arithmetic has to be broken down into simpler
248  // operations or turned into library calls.
253}
254
255void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
256 LegalizeAction Action) {
257 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
258 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
260}
261
262void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
// Register every MVE vector type (integer, float, 64-bit-element and
// predicate vectors) with its register class and configure operation
// legality. FP-specific actions are gated on HasMVEFP.
263 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
264
265 for (auto VT : IntTypes) {
266 addRegisterClass(VT, &ARM::MQPRRegClass);
297
298 // No native support for these.
308
309 // Vector reductions
319
320 if (!HasMVEFP) {
325 } else {
328 }
329
330 // Pre and Post inc are supported on loads and stores
331 for (unsigned im = (unsigned)ISD::PRE_INC;
332 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
337 }
338 }
339
// Float vector types also live in the MVE Q registers; without MVE.fp they
// are only supported at the register/bitcast level (everything Expands).
340 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
341 for (auto VT : FloatTypes) {
342 addRegisterClass(VT, &ARM::MQPRRegClass);
343 if (!HasMVEFP)
344 setAllExpand(VT);
345
346 // These are legal or custom whether we have MVE.fp or not
359
360 // Pre and Post inc are supported on loads and stores
361 for (unsigned im = (unsigned)ISD::PRE_INC;
362 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
367 }
368
369 if (HasMVEFP) {
377 }
382
383 // No native support for these.
398 }
399 }
400
401 // Custom Expand smaller than legal vector reductions to prevent false zero
402 // items being added.
411
412 // We 'support' these types up to bitcast/load/store level, regardless of
413 // MVE integer-only / float support. Only doing FP data processing on the FP
414 // vector types is inhibited at integer-only level.
415 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
416 for (auto VT : LongTypes) {
417 addRegisterClass(VT, &ARM::MQPRRegClass);
418 setAllExpand(VT);
424 }
426
427 // We can do bitwise operations on v2i64 vectors
428 setOperationAction(ISD::AND, MVT::v2i64, Legal);
429 setOperationAction(ISD::OR, MVT::v2i64, Legal);
430 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
431
432 // It is legal to extload from v4i8 to v4i16 or v4i32.
433 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
436
437 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
443
444 // Some truncating stores are legal too.
445 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
446 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
447 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
448
449 // Pre and Post inc on these are legal, given the correct extends
450 for (unsigned im = (unsigned)ISD::PRE_INC;
451 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
452 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
457 }
458 }
459
460 // Predicate types
// MVE predicate (i1) vectors live in the VCCR (VPR-based) register class.
461 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
462 for (auto VT : pTypes) {
463 addRegisterClass(VT, &ARM::VCCRRegClass);
478
479 if (!HasMVEFP) {
484 }
485 }
489 setOperationAction(ISD::OR, MVT::v2i1, Expand);
495
504}
505
507 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
508}
509
511 const ARMSubtarget &STI)
512 : TargetLowering(TM_, STI), Subtarget(&STI),
513 RegInfo(Subtarget->getRegisterInfo()),
514 Itins(Subtarget->getInstrItineraryData()) {
515 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
516
519
520 const Triple &TT = TM.getTargetTriple();
521
522 if (Subtarget->isThumb1Only())
523 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
524 else
525 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
526
527 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
528 Subtarget->hasFPRegs()) {
529 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
530 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
531
536
537 if (!Subtarget->hasVFP2Base()) {
538 setAllExpand(MVT::f32);
539 } else {
542 setOperationAction(Op, MVT::f32, Legal);
543 }
544 if (!Subtarget->hasFP64()) {
545 setAllExpand(MVT::f64);
546 } else {
549 setOperationAction(Op, MVT::f64, Legal);
550
552 }
553 }
554
555 if (Subtarget->hasFullFP16()) {
558 setOperationAction(Op, MVT::f16, Legal);
559
560 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
563
568 }
569
570 if (Subtarget->hasBF16()) {
571 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
572 setAllExpand(MVT::bf16);
573 if (!Subtarget->hasFullFP16())
575 } else {
580 }
581
583 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
584 setTruncStoreAction(VT, InnerVT, Expand);
585 addAllExtLoads(VT, InnerVT, Expand);
586 }
587
590
592 }
593
594 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
596
597 if (!Subtarget->hasV8_1MMainlineOps())
599
600 if (!Subtarget->isThumb1Only())
602
605
608
609 if (Subtarget->hasMVEIntegerOps())
610 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
611
612 // Combine low-overhead loop intrinsics so that we can lower i1 types.
613 if (Subtarget->hasLOB()) {
615 }
616
617 if (Subtarget->hasNEON()) {
618 addDRTypeForNEON(MVT::v2f32);
619 addDRTypeForNEON(MVT::v8i8);
620 addDRTypeForNEON(MVT::v4i16);
621 addDRTypeForNEON(MVT::v2i32);
622 addDRTypeForNEON(MVT::v1i64);
623
624 addQRTypeForNEON(MVT::v4f32);
625 addQRTypeForNEON(MVT::v2f64);
626 addQRTypeForNEON(MVT::v16i8);
627 addQRTypeForNEON(MVT::v8i16);
628 addQRTypeForNEON(MVT::v4i32);
629 addQRTypeForNEON(MVT::v2i64);
630
631 if (Subtarget->hasFullFP16()) {
632 addQRTypeForNEON(MVT::v8f16);
633 addDRTypeForNEON(MVT::v4f16);
634 }
635
636 if (Subtarget->hasBF16()) {
637 addQRTypeForNEON(MVT::v8bf16);
638 addDRTypeForNEON(MVT::v4bf16);
639 }
640 }
641
642 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
643 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
644 // none of Neon, MVE or VFP supports any arithmetic operations on it.
645 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
646 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
647 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
648 // FIXME: Code duplication: FDIV and FREM are expanded always, see
649 // ARMTargetLowering::addTypeForNEON method for details.
650 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
651 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
652 // FIXME: Create unittest.
653 // In another words, find a way when "copysign" appears in DAG with vector
654 // operands.
656 // FIXME: Code duplication: SETCC has custom operation action, see
657 // ARMTargetLowering::addTypeForNEON method for details.
659 // FIXME: Create unittest for FNEG and for FABS.
660 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
661 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
663 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
664 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
665 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
666 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
667 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
670 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
679 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
680 }
681
682 if (Subtarget->hasNEON()) {
683 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
684 // supported for v4f32.
686 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
687 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
688 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
689 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
690 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
693 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
702
703 // Mark v2f32 intrinsics.
705 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
706 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
707 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
708 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
709 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
712 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
721
724 setOperationAction(Op, MVT::v4f16, Expand);
725 setOperationAction(Op, MVT::v8f16, Expand);
726 }
727
728 // Neon does not support some operations on v1i64 and v2i64 types.
729 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
730 // Custom handling for some quad-vector types to detect VMULL.
731 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
732 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
733 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
734 // Custom handling for some vector types to avoid expensive expansions
735 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
737 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
739 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
740 // a destination type that is wider than the source, and nor does
741 // it have a FP_TO_[SU]INT instruction with a narrower destination than
742 // source.
751
754
755 // NEON does not have single instruction CTPOP for vectors with element
756 // types wider than 8-bits. However, custom lowering can leverage the
757 // v8i8/v16i8 vcnt instruction.
764
765 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
766 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
767
768 // NEON does not have single instruction CTTZ for vectors.
770 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
771 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
772 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
773
774 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
775 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
776 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
777 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
778
783
788
792 }
793
794 // NEON only has FMA instructions as of VFP4.
795 if (!Subtarget->hasVFP4Base()) {
796 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
797 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
798 }
799
802
803 // It is legal to extload from v4i8 to v4i16 or v4i32.
804 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
805 MVT::v2i32}) {
810 }
811 }
812
813 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
814 MVT::v4i32}) {
819 }
820 }
821
822 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
829 }
830 if (Subtarget->hasMVEIntegerOps()) {
833 ISD::SETCC});
834 }
835 if (Subtarget->hasMVEFloatOps()) {
837 }
838
839 if (!Subtarget->hasFP64()) {
840 // When targeting a floating-point unit with only single-precision
841 // operations, f64 is legal for the few double-precision instructions which
842 // are present However, no double-precision operations other than moves,
843 // loads and stores are provided by the hardware.
880 }
881
884
885 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
888 if (Subtarget->hasFullFP16()) {
891 }
892 } else {
894 }
895
896 if (!Subtarget->hasFP16()) {
899 } else {
902 }
903
904 computeRegisterProperties(Subtarget->getRegisterInfo());
905
906 // ARM does not have floating-point extending loads.
907 for (MVT VT : MVT::fp_valuetypes()) {
908 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
909 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
910 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
911 }
912
913 // ... or truncating stores
914 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
915 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
916 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
917 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
918 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
919
920 // ARM does not have i1 sign extending load.
921 for (MVT VT : MVT::integer_valuetypes())
923
924 // ARM supports all 4 flavors of integer indexed load / store.
925 if (!Subtarget->isThumb1Only()) {
926 for (unsigned im = (unsigned)ISD::PRE_INC;
928 setIndexedLoadAction(im, MVT::i1, Legal);
929 setIndexedLoadAction(im, MVT::i8, Legal);
930 setIndexedLoadAction(im, MVT::i16, Legal);
931 setIndexedLoadAction(im, MVT::i32, Legal);
932 setIndexedStoreAction(im, MVT::i1, Legal);
933 setIndexedStoreAction(im, MVT::i8, Legal);
934 setIndexedStoreAction(im, MVT::i16, Legal);
935 setIndexedStoreAction(im, MVT::i32, Legal);
936 }
937 } else {
938 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
941 }
942
943 // Custom loads/stores to possible use __aeabi_uread/write*
944 if (TT.isTargetAEABI() && !Subtarget->allowsUnalignedMem()) {
949 }
950
955
956 if (!Subtarget->isThumb1Only()) {
959 }
960
965 if (Subtarget->hasDSP()) {
974 }
975 if (Subtarget->hasBaseDSP()) {
978 }
979
980 // i64 operation support.
983 if (Subtarget->isThumb1Only()) {
986 }
987 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
988 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
990
1000
1001 // MVE lowers 64 bit shifts to lsll and lsrl
1002 // assuming that ISD::SRL and SRA of i64 are already marked custom
1003 if (Subtarget->hasMVEIntegerOps())
1005
1006 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1007 if (Subtarget->isThumb1Only()) {
1011 }
1012
1013 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1015
1016 // ARM does not have ROTL.
1021 }
1023 // TODO: These two should be set to LibCall, but this currently breaks
1024 // the Linux kernel build. See #101786.
1027 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1030 }
1031
1032 // @llvm.readcyclecounter requires the Performance Monitors extension.
1033 // Default to the 0 expansion on unsupported platforms.
1034 // FIXME: Technically there are older ARM CPUs that have
1035 // implementation-specific ways of obtaining this information.
1036 if (Subtarget->hasPerfMon())
1038
1039 // Only ARMv6 has BSWAP.
1040 if (!Subtarget->hasV6Ops())
1042
1043 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1044 : Subtarget->hasDivideInARMMode();
1045 if (!hasDivide) {
1046 // These are expanded into libcalls if the cpu doesn't have HW divider.
1049 }
1050
1051 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1054
1057 }
1058
1061
1062 // Register based DivRem for AEABI (RTABI 4.2)
1063 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1064 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1067 HasStandaloneRem = false;
1068
1073 } else {
1076 }
1077
1082
1083 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1085
1086 // Use the default implementation.
1088 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1090 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1093
1094 if (TT.isOSWindows())
1096 else
1098
1099 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1100 // the default expansion.
1101 InsertFencesForAtomic = false;
1102 if (Subtarget->hasAnyDataBarrier() &&
1103 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1104 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1105 // to ldrex/strex loops already.
1107 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1109
1110 // On v8, we have particularly efficient implementations of atomic fences
1111 // if they can be combined with nearby atomic loads and stores.
1112 if (!Subtarget->hasAcquireRelease() ||
1113 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1114 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1115 InsertFencesForAtomic = true;
1116 }
1117 } else {
1118 // If there's anything we can use as a barrier, go through custom lowering
1119 // for ATOMIC_FENCE.
1120 // If target has DMB in thumb, Fences can be inserted.
1121 if (Subtarget->hasDataBarrier())
1122 InsertFencesForAtomic = true;
1123
1125 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1126
1127 // Set them all for libcall, which will force libcalls.
1140 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1141 // Unordered/Monotonic case.
1142 if (!InsertFencesForAtomic) {
1145 }
1146 }
1147
1148 // Compute supported atomic widths.
1149 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1150 // For targets where __sync_* routines are reliably available, we use them
1151 // if necessary.
1152 //
1153 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1154 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1155 //
1156 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1157 // such targets should provide __sync_* routines, which use the ARM mode
1158 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1159 // encoding; see ARMISD::MEMBARRIER_MCR.)
1161 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1162 Subtarget->hasForced32BitAtomics()) {
1163 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1165 } else {
1166 // We can't assume anything about other targets; just use libatomic
1167 // routines.
1169 }
1170
1172
1174
1175 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1176 if (!Subtarget->hasV6Ops()) {
1179 }
1181
1182 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1183 !Subtarget->isThumb1Only()) {
1184 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1185 // iff target supports vfp2.
1195 }
1196
1197 // We want to custom lower some of our intrinsics.
1202
1212 if (Subtarget->hasFullFP16()) {
1216 }
1217
1219
1222 if (Subtarget->hasFullFP16())
1226 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1227
1228 // We don't support sin/cos/fmod/copysign/pow
1237 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1238 !Subtarget->isThumb1Only()) {
1241 }
1244
1245 if (!Subtarget->hasVFP4Base()) {
1248 }
1249
1250 // Various VFP goodness
1251 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1252 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1253 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1258 }
1259
1260 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1261 if (!Subtarget->hasFP16()) {
1266 }
1267
1268 // Strict floating-point comparisons need custom lowering.
1275 }
1276
1279
1280 // FP-ARMv8 implements a lot of rounding-like FP operations.
1281 if (Subtarget->hasFPARMv8Base()) {
1282 for (auto Op :
1289 setOperationAction(Op, MVT::f32, Legal);
1290
1291 if (Subtarget->hasFP64())
1292 setOperationAction(Op, MVT::f64, Legal);
1293 }
1294
1295 if (Subtarget->hasNEON()) {
1300 }
1301 }
1302
1303 // FP16 often need to be promoted to call lib functions
1304 // clang-format off
1305 if (Subtarget->hasFullFP16()) {
1309
1310 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1324 setOperationAction(Op, MVT::f16, Promote);
1325 }
1326
1327 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
1328 // because the result type is integer.
1330 setOperationAction(Op, MVT::f16, Custom);
1331
1337 setOperationAction(Op, MVT::f16, Legal);
1338 }
1339 // clang-format on
1340 }
1341
1342 if (Subtarget->hasNEON()) {
1343 // vmin and vmax aren't available in a scalar form, so we can use
1344 // a NEON instruction with an undef lane instead.
1353
1354 if (Subtarget->hasV8Ops()) {
1359 setOperationAction(Op, MVT::v2f32, Legal);
1360 setOperationAction(Op, MVT::v4f32, Legal);
1361 }
1362 }
1363
1364 if (Subtarget->hasFullFP16()) {
1369
1374
1379 setOperationAction(Op, MVT::v4f16, Legal);
1380 setOperationAction(Op, MVT::v8f16, Legal);
1381 }
1382 }
1383 }
1384
1385 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1386 // it, but it's just a wrapper around ldexp.
1387 if (TT.isOSWindows()) {
1389 if (isOperationExpand(Op, MVT::f32))
1390 setOperationAction(Op, MVT::f32, Promote);
1391 }
1392
1393 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1394 // isn't legal.
1396 if (isOperationExpand(Op, MVT::f16))
1397 setOperationAction(Op, MVT::f16, Promote);
1398
1399 // We have target-specific dag combine patterns for the following nodes:
1400 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1403
1404 if (Subtarget->hasMVEIntegerOps())
1406
1407 if (Subtarget->hasV6Ops())
1409 if (Subtarget->isThumb1Only())
1411 // Attempt to lower smin/smax to ssat/usat
1412 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1413 Subtarget->isThumb2()) {
1415 }
1416
1418
1419 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1420 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1422 else
1424
1425 //// temporary - rewrite interface to use type
1428 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1430 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1432
1433 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1434 // are at least 4 bytes aligned.
1436
1437 // Prefer likely predicted branches to selects on out-of-order cores.
1438 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1439
1440 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1442 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1443
1444 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1445
1446 IsStrictFPEnabled = true;
1447}
1448
1450 return Subtarget->useSoftFloat();
1451}
1452
1454 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1455}
1456
1457// FIXME: It might make sense to define the representative register class as the
1458// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1459// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
1460// SPR's representative would be DPR_VFP2. This should work well if register
1461// pressure tracking were modified such that a register use would increment the
1462// pressure of the register class's representative and all of it's super
1463// classes' representatives transitively. We have not implemented this because
1464// of the difficulty prior to coalescing of modeling operand register classes
1465// due to the common occurrence of cross class copies and subregister insertions
1466// and extractions.
1467std::pair<const TargetRegisterClass *, uint8_t>
1469 MVT VT) const {
1470 const TargetRegisterClass *RRC = nullptr;
1471 uint8_t Cost = 1;
1472 switch (VT.SimpleTy) {
1473 default:
1475 // Use DPR as representative register class for all floating point
1476 // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1477 // the cost is 1 for both f32 and f64.
1478 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1479 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1480 RRC = &ARM::DPRRegClass;
1481 // When NEON is used for SP, only half of the register file is available
1482 // because operations that define both SP and DP results will be constrained
1483 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1484 // coalescing by double-counting the SP regs. See the FIXME above.
1485 if (Subtarget->useNEONForSinglePrecisionFP())
1486 Cost = 2;
1487 break;
1488 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1489 case MVT::v4f32: case MVT::v2f64:
1490 RRC = &ARM::DPRRegClass;
1491 Cost = 2;
1492 break;
1493 case MVT::v4i64:
1494 RRC = &ARM::DPRRegClass;
1495 Cost = 4;
1496 break;
1497 case MVT::v8i64:
1498 RRC = &ARM::DPRRegClass;
1499 Cost = 8;
1500 break;
1501 }
1502 return std::make_pair(RRC, Cost);
1503}
1504
1506 EVT VT) const {
1507 if (!VT.isVector())
1508 return getPointerTy(DL);
1509
1510 // MVE has a predicate register.
1511 if ((Subtarget->hasMVEIntegerOps() &&
1512 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1513 VT == MVT::v16i8)) ||
1514 (Subtarget->hasMVEFloatOps() &&
1515 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1516 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1518}
1519
1520/// getRegClassFor - Return the register class that should be used for the
1521/// specified value type.
1522const TargetRegisterClass *
1523ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
// Parameter intentionally unused; silence -Wunused-parameter.
1524 (void)isDivergent;
1525 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1526 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1527 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1528 // MVE Q registers.
1529 if (Subtarget->hasNEON()) {
1530 if (VT == MVT::v4i64)
1531 return &ARM::QQPRRegClass;
1532 if (VT == MVT::v8i64)
1533 return &ARM::QQQQPRRegClass;
1534 }
// MVE uses its own multi-Q register classes for the same wide types.
1535 if (Subtarget->hasMVEIntegerOps()) {
1536 if (VT == MVT::v4i64)
1537 return &ARM::MQQPRRegClass;
1538 if (VT == MVT::v8i64)
1539 return &ARM::MQQQQPRRegClass;
1540 }
1542}
1543
1544// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1545// source/dest is aligned and the copy size is large enough. We therefore want
1546// to align such objects passed to memory intrinsics.
1548 Align &PrefAlign) const {
1549 if (!isa<MemIntrinsic>(CI))
1550 return false;
1551 MinSize = 8;
1552 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1553 // cycle faster than 4-byte aligned LDM.
1554 PrefAlign =
1555 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1556 return true;
1557}
1558
1559// Create a fast isel object.
1561 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
1562 const LibcallLoweringInfo *libcallLowering) const {
1563 return ARM::createFastISel(funcInfo, libInfo, libcallLowering);
1564}
1565
1567 unsigned NumVals = N->getNumValues();
1568 if (!NumVals)
1569 return Sched::RegPressure;
1570
1571 for (unsigned i = 0; i != NumVals; ++i) {
1572 EVT VT = N->getValueType(i);
1573 if (VT == MVT::Glue || VT == MVT::Other)
1574 continue;
1575 if (VT.isFloatingPoint() || VT.isVector())
1576 return Sched::ILP;
1577 }
1578
1579 if (!N->isMachineOpcode())
1580 return Sched::RegPressure;
1581
1582 // Load are scheduled for latency even if there instruction itinerary
1583 // is not available.
1584 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1585 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1586
1587 if (MCID.getNumDefs() == 0)
1588 return Sched::RegPressure;
1589 if (!Itins->isEmpty() &&
1590 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1591 return Sched::ILP;
1592
1593 return Sched::RegPressure;
1594}
1595
1596//===----------------------------------------------------------------------===//
1597// Lowering Code
1598//===----------------------------------------------------------------------===//
1599
1600static bool isSRL16(const SDValue &Op) {
1601 if (Op.getOpcode() != ISD::SRL)
1602 return false;
1603 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1604 return Const->getZExtValue() == 16;
1605 return false;
1606}
1607
1608static bool isSRA16(const SDValue &Op) {
1609 if (Op.getOpcode() != ISD::SRA)
1610 return false;
1611 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1612 return Const->getZExtValue() == 16;
1613 return false;
1614}
1615
1616static bool isSHL16(const SDValue &Op) {
1617 if (Op.getOpcode() != ISD::SHL)
1618 return false;
1619 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1620 return Const->getZExtValue() == 16;
1621 return false;
1622}
1623
1624// Check for a signed 16-bit value. We special case SRA because it makes it
1625// more simple when also looking for SRAs that aren't sign extending a
1626// smaller value. Without the check, we'd need to take extra care with
1627// checking order for some operations.
1628static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1629 if (isSRA16(Op))
1630 return isSHL16(Op.getOperand(0));
1631 return DAG.ComputeNumSignBits(Op) == 17;
1632}
1633
1634/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
// Signed comparisons map to GT/GE/LT/LE; unsigned ones to HI/HS/LO/LS.
1636 switch (CC) {
1637 default: llvm_unreachable("Unknown condition code!");
1638 case ISD::SETNE: return ARMCC::NE;
1639 case ISD::SETEQ: return ARMCC::EQ;
1640 case ISD::SETGT: return ARMCC::GT;
1641 case ISD::SETGE: return ARMCC::GE;
1642 case ISD::SETLT: return ARMCC::LT;
1643 case ISD::SETLE: return ARMCC::LE;
1644 case ISD::SETUGT: return ARMCC::HI;
1645 case ISD::SETUGE: return ARMCC::HS;
1646 case ISD::SETULT: return ARMCC::LO;
1647 case ISD::SETULE: return ARMCC::LS;
1648 }
1649}
1650
1651/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1653 ARMCC::CondCodes &CondCode2) {
  // CondCode2 stays AL unless the FP condition requires a second predicate
  // (SETONE and SETUEQ below); callers must then test both conditions.
1654 CondCode2 = ARMCC::AL;
1655 switch (CC) {
1656 default: llvm_unreachable("Unknown FP condition!");
1657 case ISD::SETEQ:
1658 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1659 case ISD::SETGT:
1660 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1661 case ISD::SETGE:
1662 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1663 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1664 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1665 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1666 case ISD::SETO: CondCode = ARMCC::VC; break;
1667 case ISD::SETUO: CondCode = ARMCC::VS; break;
1668 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1669 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1670 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1671 case ISD::SETLT:
1672 case ISD::SETULT: CondCode = ARMCC::LT; break;
1673 case ISD::SETLE:
1674 case ISD::SETULE: CondCode = ARMCC::LE; break;
1675 case ISD::SETNE:
1676 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1677 }
1678}
1679
1680//===----------------------------------------------------------------------===//
1681// Calling Convention Implementation
1682//===----------------------------------------------------------------------===//
1683
1684/// getEffectiveCallingConv - Get the effective calling convention, taking into
1685/// account presence of floating point hardware and calling convention
1686/// limitations, such as support for variadic functions.
1688ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1689 bool isVarArg) const {
1690 switch (CC) {
1691 default:
1692 report_fatal_error("Unsupported calling convention");
1695 case CallingConv::GHC:
1697 return CC;
1703 case CallingConv::Swift:
1706 case CallingConv::C:
1707 case CallingConv::Tail:
  // Hard-float AAPCS_VFP is usable only with FP registers, on a non-Thumb1
  // core, with the hard-float ABI, and for non-variadic signatures.
1708 if (!getTM().isAAPCS_ABI())
1709 return CallingConv::ARM_APCS;
1710 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1711 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1712 !isVarArg)
1714 else
1716 case CallingConv::Fast:
  // fastcc keeps its identity pre-AAPCS only when FP registers are usable
  // and the callee is non-variadic; otherwise fall back to the plain ABIs.
1718 if (!getTM().isAAPCS_ABI()) {
1719 if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() && !isVarArg)
1720 return CallingConv::Fast;
1721 return CallingConv::ARM_APCS;
1722 } else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1723 !isVarArg)
1725 else
1727 }
1728}
1729
1731 bool isVarArg) const {
  // Return == false: select the assignment function for passing arguments.
1732 return CCAssignFnForNode(CC, false, isVarArg);
1733}
1734
1736 bool isVarArg) const {
  // Return == true: select the assignment function for returning values.
1737 return CCAssignFnForNode(CC, true, isVarArg);
1738}
1739
1740/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1741/// CallingConvention.
1742CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1743 bool Return,
1744 bool isVarArg) const {
  // Dispatch on the *effective* convention, which already folds in the ABI,
  // FP hardware availability and variadic-ness of the callee.
1745 switch (getEffectiveCallingConv(CC, isVarArg)) {
1746 default:
1747 report_fatal_error("Unsupported calling convention");
1749 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1751 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1753 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1754 case CallingConv::Fast:
1755 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1756 case CallingConv::GHC:
1757 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1759 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1761 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1763 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
1764 }
1765}
1766
1767SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
1768 MVT LocVT, MVT ValVT, SDValue Val) const {
1769 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
1770 Val);
1771 if (Subtarget->hasFullFP16()) {
1772 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
1773 } else {
1774 Val = DAG.getNode(ISD::TRUNCATE, dl,
1775 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1776 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
1777 }
1778 return Val;
1779}
1780
1781SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
1782 MVT LocVT, MVT ValVT,
1783 SDValue Val) const {
1784 if (Subtarget->hasFullFP16()) {
1785 Val = DAG.getNode(ARMISD::VMOVrh, dl,
1786 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1787 } else {
1788 Val = DAG.getNode(ISD::BITCAST, dl,
1789 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1790 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
1791 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1792 }
1793 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
1794}
1795
1796/// LowerCallResult - Lower the result values of a call into the
1797/// appropriate copies out of appropriate physical registers.
1798SDValue ARMTargetLowering::LowerCallResult(
1799 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1800 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1801 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1802 SDValue ThisVal, bool isCmseNSCall) const {
1803 // Assign locations to each value returned by this call.
1805 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1806 *DAG.getContext());
1807 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1808
1809 // Copy all of the result registers out of their specified physreg.
1810 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1811 CCValAssign VA = RVLocs[i];
1812
1813 // Pass 'this' value directly from the argument to return value, to avoid
1814 // reg unit interference
1815 if (i == 0 && isThisReturn) {
1816 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1817 "unexpected return calling convention register assignment");
1818 InVals.push_back(ThisVal);
1819 continue;
1820 }
1821
1822 SDValue Val;
1823 if (VA.needsCustom() &&
1824 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
1825 // Handle f64 or half of a v2f64.
  // Each f64 is returned in two i32 registers; rebuild it with VMOVDRR,
  // threading the chain and glue through both copies.
1826 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1827 InGlue);
1828 Chain = Lo.getValue(1);
1829 InGlue = Lo.getValue(2);
1830 VA = RVLocs[++i]; // skip ahead to next loc
1831 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1832 InGlue);
1833 Chain = Hi.getValue(1);
1834 InGlue = Hi.getValue(2);
  // The register pair is in the opposite order on big-endian targets.
1835 if (!Subtarget->isLittle())
1836 std::swap (Lo, Hi);
1837 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1838
1839 if (VA.getLocVT() == MVT::v2f64) {
1840 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1841 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1842 DAG.getConstant(0, dl, MVT::i32));
1843
  // Second f64 lane of the v2f64: same two-register reconstruction.
1844 VA = RVLocs[++i]; // skip ahead to next loc
1845 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1846 Chain = Lo.getValue(1);
1847 InGlue = Lo.getValue(2);
1848 VA = RVLocs[++i]; // skip ahead to next loc
1849 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1850 Chain = Hi.getValue(1);
1851 InGlue = Hi.getValue(2);
1852 if (!Subtarget->isLittle())
1853 std::swap (Lo, Hi);
1854 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1855 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1856 DAG.getConstant(1, dl, MVT::i32));
1857 }
1858 } else {
1859 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1860 InGlue);
1861 Chain = Val.getValue(1);
1862 InGlue = Val.getValue(2);
1863 }
1864
  // Undo any conversion the calling convention applied to the location type.
1865 switch (VA.getLocInfo()) {
1866 default: llvm_unreachable("Unknown loc info!");
1867 case CCValAssign::Full: break;
1868 case CCValAssign::BCvt:
1869 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1870 break;
1871 }
1872
1873 // f16 arguments have their size extended to 4 bytes and passed as if they
1874 // had been copied to the LSBs of a 32-bit register.
1875 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
1876 if (VA.needsCustom() &&
1877 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
1878 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
1879
1880 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
1881 // is less than 32 bits must be sign- or zero-extended after the call for
1882 // security reasons. Although the ABI mandates an extension done by the
1883 // callee, the latter cannot be trusted to follow the rules of the ABI.
1884 const ISD::InputArg &Arg = Ins[VA.getValNo()];
1885 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
1886 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
1887 Val = handleCMSEValue(Val, Arg, DAG, dl);
1888
1889 InVals.push_back(Val);
1890 }
1891
1892 return Chain;
1893}
1894
// Compute the store address and MachinePointerInfo for an outgoing call
// argument that was assigned a stack location (VA.getLocMemOffset()).
1895std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
1896 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
1897 bool IsTailCall, int SPDiff) const {
1898 SDValue DstAddr;
1899 MachinePointerInfo DstInfo;
1900 int32_t Offset = VA.getLocMemOffset();
1901 MachineFunction &MF = DAG.getMachineFunction();
1902
  // For tail calls the argument is stored into the caller's own argument
  // area, offset by SPDiff; address it through a fixed frame index so the
  // slot survives later stack adjustments.
1903 if (IsTailCall) {
1904 Offset += SPDiff;
1905 auto PtrVT = getPointerTy(DAG.getDataLayout());
1906 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
1907 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
1908 DstAddr = DAG.getFrameIndex(FI, PtrVT);
1909 DstInfo =
1911 } else {
  // Normal calls address the slot relative to the stack pointer.
1912 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
1913 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1914 StackPtr, PtrOff);
1915 DstInfo =
1917 }
1918
1919 return std::make_pair(DstAddr, DstInfo);
1920}
1921
1922// Returns the type of copying which is required to set up a byval argument to
1923// a tail-called function. This isn't needed for non-tail calls, because they
1924// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
1925// avoid clobbering another argument (CopyViaTemp), and sometimes can be
1926// optimised to zero copies when forwarding an argument from the caller's
1927// caller (NoCopy).
1928ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
1929 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
1930 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1931 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
1932
1933 // Globals are always safe to copy from.
1935 return CopyOnce;
1936
1937 // Can only analyse frame index nodes, conservatively assume we need a
1938 // temporary.
1939 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
1940 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
1941 if (!SrcFrameIdxNode || !DstFrameIdxNode)
1942 return CopyViaTemp;
1943
1944 int SrcFI = SrcFrameIdxNode->getIndex();
1945 int DstFI = DstFrameIdxNode->getIndex();
  // The destination is always a fixed object: the callee's argument slot.
1946 assert(MFI.isFixedObjectIndex(DstFI) &&
1947 "byval passed in non-fixed stack slot");
1948
1949 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
1950 int64_t DstOffset = MFI.getObjectOffset(DstFI);
1951
1952 // If the source is in the local frame, then the copy to the argument memory
1953 // is always valid.
1954 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
1955 if (!FixedSrc ||
1956 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
1957 return CopyOnce;
1958
1959 // In the case of byval arguments split between registers and the stack,
1960 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
1961 // stack portion, but the Src SDValue will refer to the full value, including
1962 // the local stack memory that the register portion gets stored into. We only
1963 // need to compare them for equality, so normalise on the full value version.
1964 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
1965 DstOffset -= RegSize;
1966
1967 // If the value is already in the correct location, then no copying is
1968 // needed. If not, then we need to copy via a temporary.
  // Equal offsets mean the argument is being forwarded unchanged from our
  // own caller.
1969 if (SrcOffset == DstOffset)
1970 return NoCopy;
1971 else
1972 return CopyViaTemp;
1973}
1974
// Split an f64 argument into a pair of i32s (via VMOVRRD) and pass them in
// the locations VA/NextVA — either two GPRs, or one GPR plus a stack slot.
1975void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1976 SDValue Chain, SDValue &Arg,
1977 RegsToPassVector &RegsToPass,
1978 CCValAssign &VA, CCValAssign &NextVA,
1979 SDValue &StackPtr,
1980 SmallVectorImpl<SDValue> &MemOpChains,
1981 bool IsTailCall,
1982 int SPDiff) const {
1983 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1984 DAG.getVTList(MVT::i32, MVT::i32), Arg);
  // Endianness decides which half of the pair goes into the first location.
1985 unsigned id = Subtarget->isLittle() ? 0 : 1;
1986 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1987
1988 if (NextVA.isRegLoc())
1989 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1990 else {
1991 assert(NextVA.isMemLoc());
  // Lazily materialise the stack pointer copy the first time it is needed.
1992 if (!StackPtr.getNode())
1993 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1995
1996 SDValue DstAddr;
1997 MachinePointerInfo DstInfo;
1998 std::tie(DstAddr, DstInfo) =
1999 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2000 MemOpChains.push_back(
2001 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2002 }
2003}
2004
2005static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  // fastcc guarantees tail-call optimisation only when the -tailcallopt
  // option (GuaranteeTailCalls) is enabled.
2006 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2008}
2009
2010/// LowerCall - Lowering a call into a callseq_start <-
2011/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2012/// nodes.
2013SDValue
2014ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2015 SmallVectorImpl<SDValue> &InVals) const {
2016 SelectionDAG &DAG = CLI.DAG;
2017 SDLoc &dl = CLI.DL;
2018 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2019 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2020 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2021 SDValue Chain = CLI.Chain;
2022 SDValue Callee = CLI.Callee;
2023 bool &isTailCall = CLI.IsTailCall;
2024 CallingConv::ID CallConv = CLI.CallConv;
2025 bool doesNotRet = CLI.DoesNotReturn;
2026 bool isVarArg = CLI.IsVarArg;
2027 const CallBase *CB = CLI.CB;
2028
2029 MachineFunction &MF = DAG.getMachineFunction();
2030 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2031 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2032 MachineFunction::CallSiteInfo CSInfo;
2033 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2034 bool isThisReturn = false;
2035 bool isCmseNSCall = false;
2036 bool isSibCall = false;
2037 bool PreferIndirect = false;
2038 bool GuardWithBTI = false;
2039
2040 // Analyze operands of the call, assigning locations to each operand.
2042 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2043 *DAG.getContext());
2044 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2045
2046 // Lower 'returns_twice' calls to a pseudo-instruction.
2047 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2048 !Subtarget->noBTIAtReturnTwice())
2049 GuardWithBTI = AFI->branchTargetEnforcement();
2050
2051 // Set type id for call site info.
2052 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
2053
2054 // Determine whether this is a non-secure function call.
2055 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2056 isCmseNSCall = true;
2057
2058 // Disable tail calls if they're not supported.
2059 if (!Subtarget->supportsTailCall())
2060 isTailCall = false;
2061
2062 // For both the non-secure calls and the returns from a CMSE entry function,
2063 // the function needs to do some extra work after the call, or before the
2064 // return, respectively, thus it cannot end with a tail call
2065 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2066 isTailCall = false;
2067
2068 if (isa<GlobalAddressSDNode>(Callee)) {
2069 // If we're optimizing for minimum size and the function is called three or
2070 // more times in this block, we can improve codesize by calling indirectly
2071 // as BLXr has a 16-bit encoding.
2072 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2073 if (CLI.CB) {
2074 auto *BB = CLI.CB->getParent();
2075 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2076 count_if(GV->users(), [&BB](const User *U) {
2077 return isa<Instruction>(U) &&
2078 cast<Instruction>(U)->getParent() == BB;
2079 }) > 2;
2080 }
2081 }
2082 if (isTailCall) {
2083 // Check if it's really possible to do a tail call.
2084 isTailCall =
2085 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2086
2087 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2088 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2089 isSibCall = true;
2090
2091 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2092 // detected sibcalls.
2093 if (isTailCall)
2094 ++NumTailCalls;
2095 }
2096
2097 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2098 report_fatal_error("failed to perform tail call elimination on a call "
2099 "site marked musttail");
2100
2101 // Get a count of how many bytes are to be pushed on the stack.
2102 unsigned NumBytes = CCInfo.getStackSize();
2103
2104 // SPDiff is the byte offset of the call's argument area from the callee's.
2105 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2106 // by this amount for a tail call. In a sibling call it must be 0 because the
2107 // caller will deallocate the entire stack and the callee still expects its
2108 // arguments to begin at SP+0. Completely unused for non-tail calls.
2109 int SPDiff = 0;
2110
2111 if (isTailCall && !isSibCall) {
2112 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2113 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2114
2115 // Since callee will pop argument stack as a tail call, we must keep the
2116 // popped size 16-byte aligned.
2117 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2118 assert(StackAlign && "data layout string is missing stack alignment");
2119 NumBytes = alignTo(NumBytes, *StackAlign);
2120
2121 // SPDiff will be negative if this tail call requires more space than we
2122 // would automatically have in our incoming argument space. Positive if we
2123 // can actually shrink the stack.
2124 SPDiff = NumReusableBytes - NumBytes;
2125
2126 // If this call requires more stack than we have available from
2127 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2128 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2129 AFI->setArgRegsSaveSize(-SPDiff);
2130 }
2131
2132 if (isSibCall) {
2133 // For sibling tail calls, memory operands are available in our caller's stack.
2134 NumBytes = 0;
2135 } else {
2136 // Adjust the stack pointer for the new arguments...
2137 // These operations are automatically eliminated by the prolog/epilog pass
2138 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2139 }
2140
2142 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2143
2144 RegsToPassVector RegsToPass;
2145 SmallVector<SDValue, 8> MemOpChains;
2146
2147 // If we are doing a tail-call, any byval arguments will be written to stack
2148 // space which was used for incoming arguments. If any the values being used
2149 // are incoming byval arguments to this function, then they might be
2150 // overwritten by the stores of the outgoing arguments. To avoid this, we
2151 // need to make a temporary copy of them in local stack space, then copy back
2152 // to the argument area.
2153 DenseMap<unsigned, SDValue> ByValTemporaries;
2154 SDValue ByValTempChain;
2155 if (isTailCall) {
2156 SmallVector<SDValue, 8> ByValCopyChains;
2157 for (const CCValAssign &VA : ArgLocs) {
2158 unsigned ArgIdx = VA.getValNo();
2159 SDValue Src = OutVals[ArgIdx];
2160 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2161
2162 if (!Flags.isByVal())
2163 continue;
2164
2165 SDValue Dst;
2166 MachinePointerInfo DstInfo;
2167 std::tie(Dst, DstInfo) =
2168 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2169 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2170
2171 if (Copy == NoCopy) {
2172 // If the argument is already at the correct offset on the stack
2173 // (because we are forwarding a byval argument from our caller), we
2174 // don't need any copying.
2175 continue;
2176 } else if (Copy == CopyOnce) {
2177 // If the argument is in our local stack frame, no other argument
2178 // preparation can clobber it, so we can copy it to the final location
2179 // later.
2180 ByValTemporaries[ArgIdx] = Src;
2181 } else {
2182 assert(Copy == CopyViaTemp && "unexpected enum value");
2183 // If we might be copying this argument from the outgoing argument
2184 // stack area, we need to copy via a temporary in the local stack
2185 // frame.
2186 int TempFrameIdx = MFI.CreateStackObject(
2187 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2188 SDValue Temp =
2189 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2190
2191 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2192 SDValue AlignNode =
2193 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2194
2195 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2196 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2197 ByValCopyChains.push_back(
2198 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2199 ByValTemporaries[ArgIdx] = Temp;
2200 }
2201 }
2202 if (!ByValCopyChains.empty())
2203 ByValTempChain =
2204 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2205 }
2206
2207 // During a tail call, stores to the argument area must happen after all of
2208 // the function's incoming arguments have been loaded because they may alias.
2209 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2210 // there's no point in doing so repeatedly so this tracks whether that's
2211 // happened yet.
2212 bool AfterFormalArgLoads = false;
2213
2214 // Walk the register/memloc assignments, inserting copies/loads. In the case
2215 // of tail call optimization, arguments are handled later.
2216 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2217 i != e;
2218 ++i, ++realArgIdx) {
2219 CCValAssign &VA = ArgLocs[i];
2220 SDValue Arg = OutVals[realArgIdx];
2221 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2222 bool isByVal = Flags.isByVal();
2223
2224 // Promote the value if needed.
2225 switch (VA.getLocInfo()) {
2226 default: llvm_unreachable("Unknown loc info!");
2227 case CCValAssign::Full: break;
2228 case CCValAssign::SExt:
2229 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2230 break;
2231 case CCValAssign::ZExt:
2232 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2233 break;
2234 case CCValAssign::AExt:
2235 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2236 break;
2237 case CCValAssign::BCvt:
2238 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2239 break;
2240 }
2241
2242 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2243 Chain = DAG.getStackArgumentTokenFactor(Chain);
2244 if (ByValTempChain) {
2245 // In case of large byval copies, re-using the stackframe for tail-calls
2246 // can lead to overwriting incoming arguments on the stack. Force
2247 // loading these stack arguments before the copy to avoid that.
2248 SmallVector<SDValue, 8> IncomingLoad;
2249 for (unsigned I = 0; I < OutVals.size(); ++I) {
2250 if (Outs[I].Flags.isByVal())
2251 continue;
2252
2253 SDValue OutVal = OutVals[I];
2254 LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
2255 if (!OutLN)
2256 continue;
2257
2258 FrameIndexSDNode *FIN =
2260 if (!FIN)
2261 continue;
2262
2263 if (!MFI.isFixedObjectIndex(FIN->getIndex()))
2264 continue;
2265
2266 for (const CCValAssign &VA : ArgLocs) {
2267 if (VA.isMemLoc())
2268 IncomingLoad.push_back(OutVal.getValue(1));
2269 }
2270 }
2271
2272 // Update the chain to force loads for potentially clobbered argument
2273 // loads to happen before the byval copy.
2274 if (!IncomingLoad.empty()) {
2275 IncomingLoad.push_back(Chain);
2276 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
2277 }
2278
2279 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2280 ByValTempChain);
2281 }
2282 AfterFormalArgLoads = true;
2283 }
2284
2285 // f16 arguments have their size extended to 4 bytes and passed as if they
2286 // had been copied to the LSBs of a 32-bit register.
2287 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2288 if (VA.needsCustom() &&
2289 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2290 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2291 } else {
2292 // f16 arguments could have been extended prior to argument lowering.
2293 // Mask them arguments if this is a CMSE nonsecure call.
2294 auto ArgVT = Outs[realArgIdx].ArgVT;
2295 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2296 auto LocBits = VA.getLocVT().getSizeInBits();
2297 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2298 SDValue Mask =
2299 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2300 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2301 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2302 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2303 }
2304 }
2305
2306 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2307 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2308 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2309 DAG.getConstant(0, dl, MVT::i32));
2310 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2311 DAG.getConstant(1, dl, MVT::i32));
2312
2313 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2314 StackPtr, MemOpChains, isTailCall, SPDiff);
2315
2316 VA = ArgLocs[++i]; // skip ahead to next loc
2317 if (VA.isRegLoc()) {
2318 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2319 StackPtr, MemOpChains, isTailCall, SPDiff);
2320 } else {
2321 assert(VA.isMemLoc());
2322 SDValue DstAddr;
2323 MachinePointerInfo DstInfo;
2324 std::tie(DstAddr, DstInfo) =
2325 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2326 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2327 }
2328 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2329 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2330 StackPtr, MemOpChains, isTailCall, SPDiff);
2331 } else if (VA.isRegLoc()) {
2332 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2333 Outs[0].VT == MVT::i32) {
2334 assert(VA.getLocVT() == MVT::i32 &&
2335 "unexpected calling convention register assignment");
2336 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2337 "unexpected use of 'returned'");
2338 isThisReturn = true;
2339 }
2340 const TargetOptions &Options = DAG.getTarget().Options;
2341 if (Options.EmitCallSiteInfo)
2342 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2343 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2344 } else if (isByVal) {
2345 assert(VA.isMemLoc());
2346 unsigned offset = 0;
2347
2348 // True if this byval aggregate will be split between registers
2349 // and memory.
2350 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2351 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2352
2353 SDValue ByValSrc;
2354 bool NeedsStackCopy;
2355 if (auto It = ByValTemporaries.find(realArgIdx);
2356 It != ByValTemporaries.end()) {
2357 ByValSrc = It->second;
2358 NeedsStackCopy = true;
2359 } else {
2360 ByValSrc = Arg;
2361 NeedsStackCopy = !isTailCall;
2362 }
2363
2364 // If part of the argument is in registers, load them.
2365 if (CurByValIdx < ByValArgsCount) {
2366 unsigned RegBegin, RegEnd;
2367 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2368
2369 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2370 unsigned int i, j;
2371 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2372 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2373 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2374 SDValue Load =
2375 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2376 DAG.InferPtrAlign(AddArg));
2377 MemOpChains.push_back(Load.getValue(1));
2378 RegsToPass.push_back(std::make_pair(j, Load));
2379 }
2380
2381 // If parameter size outsides register area, "offset" value
2382 // helps us to calculate stack slot for remained part properly.
2383 offset = RegEnd - RegBegin;
2384
2385 CCInfo.nextInRegsParam();
2386 }
2387
2388 // If the memory part of the argument isn't already in the correct place
2389 // (which can happen with tail calls), copy it into the argument area.
2390 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2391 auto PtrVT = getPointerTy(DAG.getDataLayout());
2392 SDValue Dst;
2393 MachinePointerInfo DstInfo;
2394 std::tie(Dst, DstInfo) =
2395 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2396 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2397 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2398 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2399 MVT::i32);
2400 SDValue AlignNode =
2401 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2402
2403 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2404 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2405 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2406 Ops));
2407 }
2408 } else {
2409 assert(VA.isMemLoc());
2410 SDValue DstAddr;
2411 MachinePointerInfo DstInfo;
2412 std::tie(DstAddr, DstInfo) =
2413 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2414
2415 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2416 MemOpChains.push_back(Store);
2417 }
2418 }
2419
2420 if (!MemOpChains.empty())
2421 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2422
2423 // Build a sequence of copy-to-reg nodes chained together with token chain
2424 // and flag operands which copy the outgoing args into the appropriate regs.
2425 SDValue InGlue;
2426 for (const auto &[Reg, N] : RegsToPass) {
2427 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2428 InGlue = Chain.getValue(1);
2429 }
2430
2431 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2432 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2433 // node so that legalize doesn't hack it.
2434 bool isDirect = false;
2435
2436 const TargetMachine &TM = getTargetMachine();
2437 const Triple &TT = TM.getTargetTriple();
2438 const GlobalValue *GVal = nullptr;
2439 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2440 GVal = G->getGlobal();
2441 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && TT.isOSBinFormatMachO();
2442
2443 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2444 bool isLocalARMFunc = false;
2445 auto PtrVt = getPointerTy(DAG.getDataLayout());
2446
2447 if (Subtarget->genLongCalls()) {
2448 assert((!isPositionIndependent() || TT.isOSWindows()) &&
2449 "long-calls codegen is not position independent!");
2450 // Handle a global address or an external symbol. If it's not one of
2451 // those, the target's already in a register, so we don't need to do
2452 // anything extra.
2453 if (isa<GlobalAddressSDNode>(Callee)) {
2454 if (Subtarget->genExecuteOnly()) {
2455 if (Subtarget->useMovt())
2456 ++NumMovwMovt;
2457 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2458 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2459 } else {
2460 // Create a constant pool entry for the callee address
2461 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2462 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2463 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2464
2465 // Get the address of the callee into a register
2466 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2467 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2468 Callee = DAG.getLoad(
2469 PtrVt, dl, DAG.getEntryNode(), Addr,
2471 }
2472 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2473 const char *Sym = S->getSymbol();
2474
2475 if (Subtarget->genExecuteOnly()) {
2476 if (Subtarget->useMovt())
2477 ++NumMovwMovt;
2478 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2479 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2480 } else {
2481 // Create a constant pool entry for the callee address
2482 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2483 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2484 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2485
2486 // Get the address of the callee into a register
2487 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2488 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2489 Callee = DAG.getLoad(
2490 PtrVt, dl, DAG.getEntryNode(), Addr,
2492 }
2493 }
2494 } else if (isa<GlobalAddressSDNode>(Callee)) {
2495 if (!PreferIndirect) {
2496 isDirect = true;
2497 bool isDef = GVal->isStrongDefinitionForLinker();
2498
2499 // ARM call to a local ARM function is predicable.
2500 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2501 // tBX takes a register source operand.
2502 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2503 assert(TT.isOSBinFormatMachO() && "WrapperPIC use on non-MachO?");
2504 Callee = DAG.getNode(
2505 ARMISD::WrapperPIC, dl, PtrVt,
2506 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2507 Callee = DAG.getLoad(
2508 PtrVt, dl, DAG.getEntryNode(), Callee,
2512 } else if (Subtarget->isTargetCOFF()) {
2513 assert(Subtarget->isTargetWindows() &&
2514 "Windows is the only supported COFF target");
2515 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2516 if (GVal->hasDLLImportStorageClass())
2517 TargetFlags = ARMII::MO_DLLIMPORT;
2518 else if (!TM.shouldAssumeDSOLocal(GVal))
2519 TargetFlags = ARMII::MO_COFFSTUB;
2520 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2521 TargetFlags);
2522 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2523 Callee =
2524 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2525 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2527 } else {
2528 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2529 }
2530 }
2531 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2532 isDirect = true;
2533 // tBX takes a register source operand.
2534 const char *Sym = S->getSymbol();
2535 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2536 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2537 ARMConstantPoolValue *CPV =
2539 ARMPCLabelIndex, 4);
2540 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2541 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2542 Callee = DAG.getLoad(
2543 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2545 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2546 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2547 } else {
2548 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2549 }
2550 }
2551
2552 if (isCmseNSCall) {
2553 assert(!isARMFunc && !isDirect &&
2554 "Cannot handle call to ARM function or direct call");
2555 if (NumBytes > 0) {
2556 DAG.getContext()->diagnose(
2557 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2558 "call to non-secure function would require "
2559 "passing arguments on stack",
2560 dl.getDebugLoc()));
2561 }
2562 if (isStructRet) {
2563 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2565 "call to non-secure function would return value through pointer",
2566 dl.getDebugLoc()));
2567 }
2568 }
2569
2570 // FIXME: handle tail calls differently.
2571 unsigned CallOpc;
2572 if (Subtarget->isThumb()) {
2573 if (GuardWithBTI)
2574 CallOpc = ARMISD::t2CALL_BTI;
2575 else if (isCmseNSCall)
2576 CallOpc = ARMISD::tSECALL;
2577 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2578 CallOpc = ARMISD::CALL_NOLINK;
2579 else
2580 CallOpc = ARMISD::CALL;
2581 } else {
2582 if (!isDirect && !Subtarget->hasV5TOps())
2583 CallOpc = ARMISD::CALL_NOLINK;
2584 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2585 // Emit regular call when code size is the priority
2586 !Subtarget->hasMinSize())
2587 // "mov lr, pc; b _foo" to avoid confusing the RSP
2588 CallOpc = ARMISD::CALL_NOLINK;
2589 else
2590 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2591 }
2592
2593 // We don't usually want to end the call-sequence here because we would tidy
2594 // the frame up *after* the call, however in the ABI-changing tail-call case
2595 // we've carefully laid out the parameters so that when sp is reset they'll be
2596 // in the correct location.
2597 if (isTailCall && !isSibCall) {
2598 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2599 InGlue = Chain.getValue(1);
2600 }
2601
2602 std::vector<SDValue> Ops;
2603 Ops.push_back(Chain);
2604 Ops.push_back(Callee);
2605
2606 if (isTailCall) {
2607 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2608 }
2609
2610 // Add argument registers to the end of the list so that they are known live
2611 // into the call.
2612 for (const auto &[Reg, N] : RegsToPass)
2613 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2614
2615 // Add a register mask operand representing the call-preserved registers.
2616 const uint32_t *Mask;
2617 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2618 if (isThisReturn) {
2619 // For 'this' returns, use the R0-preserving mask if applicable
2620 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2621 if (!Mask) {
2622 // Set isThisReturn to false if the calling convention is not one that
2623 // allows 'returned' to be modeled in this way, so LowerCallResult does
2624 // not try to pass 'this' straight through
2625 isThisReturn = false;
2626 Mask = ARI->getCallPreservedMask(MF, CallConv);
2627 }
2628 } else
2629 Mask = ARI->getCallPreservedMask(MF, CallConv);
2630
2631 assert(Mask && "Missing call preserved mask for calling convention");
2632 Ops.push_back(DAG.getRegisterMask(Mask));
2633
2634 if (InGlue.getNode())
2635 Ops.push_back(InGlue);
2636
2637 if (isTailCall) {
2639 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2640 if (CLI.CFIType)
2641 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2642 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2643 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2644 return Ret;
2645 }
2646
2647 // Returns a chain and a flag for retval copy to use.
2648 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2649 if (CLI.CFIType)
2650 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2651 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2652 InGlue = Chain.getValue(1);
2653 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2654
2655 // If we're guaranteeing tail-calls will be honoured, the callee must
2656 // pop its own argument stack on return. But this call is *not* a tail call so
2657 // we need to undo that after it returns to restore the status-quo.
2658 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2659 uint64_t CalleePopBytes =
2660 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2661
2662 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2663 if (!Ins.empty())
2664 InGlue = Chain.getValue(1);
2665
2666 // Handle result values, copying them out of physregs into vregs that we
2667 // return.
2668 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2669 InVals, isThisReturn,
2670 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2671}
2672
2673/// HandleByVal - Every parameter *after* a byval parameter is passed
2674/// on the stack. Remember the next parameter register to allocate,
2675/// and then confiscate the rest of the parameter registers to insure
2676/// this.
2677void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2678 Align Alignment) const {
2679 // Byval (as with any stack) slots are always at least 4 byte aligned.
2680 Alignment = std::max(Alignment, Align(4));
2681
2682 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2683 if (!Reg)
2684 return;
2685
2686 unsigned AlignInRegs = Alignment.value() / 4;
2687 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2688 for (unsigned i = 0; i < Waste; ++i)
2689 Reg = State->AllocateReg(GPRArgRegs);
2690
2691 if (!Reg)
2692 return;
2693
2694 unsigned Excess = 4 * (ARM::R4 - Reg);
2695
2696 // Special case when NSAA != SP and parameter size greater than size of
2697 // all remained GPR regs. In that case we can't split parameter, we must
2698 // send it to stack. We also must set NCRN to R4, so waste all
2699 // remained registers.
2700 const unsigned NSAAOffset = State->getStackSize();
2701 if (NSAAOffset != 0 && Size > Excess) {
2702 while (State->AllocateReg(GPRArgRegs))
2703 ;
2704 return;
2705 }
2706
2707 // First register for byval parameter is the first register that wasn't
2708 // allocated before this method call, so it would be "reg".
2709 // If parameter is small enough to be saved in range [reg, r4), then
2710 // the end (first after last) register would be reg + param-size-in-regs,
2711 // else parameter would be splitted between registers and stack,
2712 // end register would be r4 in this case.
2713 unsigned ByValRegBegin = Reg;
2714 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2715 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2716 // Note, first register is allocated in the beginning of function already,
2717 // allocate remained amount of registers we need.
2718 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2719 State->AllocateReg(GPRArgRegs);
2720 // A byval parameter that is split between registers and memory needs its
2721 // size truncated here.
2722 // In the case where the entire structure fits in registers, we set the
2723 // size in memory to zero.
2724 Size = std::max<int>(Size - Excess, 0);
2725}
2726
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function. Note that this function also
/// processes musttail calls, so when this function returns false on a valid
/// musttail call, a fatal backend error occurs.
///
/// \param ArgLocs argument assignments already computed for this call; used
///        to find a free register for an indirect call target and to check
///        arguments passed in callee-saved registers.
/// \param isIndirect treat the call as indirect even when the callee is a
///        GlobalAddress node.
bool ARMTargetLowering::IsEligibleForTailCallOptimization(
    SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
  CallingConv::ID CalleeCC = CLI.CallConv;
  SDValue Callee = CLI.Callee;
  bool isVarArg = CLI.IsVarArg;
  const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  const SelectionDAG &DAG = CLI.DAG;
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // Callers are expected to have filtered on supportsTailCall() already.
  assert(Subtarget->supportsTailCall());

  // Indirect tail-calls require a register to hold the target address. That
  // register must be:
  //  * Allocatable (i.e. r0-r7 if the target is Thumb1).
  //  * Not callee-saved, so must be one of r0-r3 or r12.
  //  * Not used to hold an argument to the tail-called function, which might be
  //    in r0-r3.
  //  * Not used to hold the return address authentication code, which is in r12
  //    if enabled.
  // Sometimes, no register matches all of these conditions, so we can't do a
  // tail-call.
  if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
    SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
                                               ARM::R3};
    if (!(Subtarget->isThumb1Only() ||
          MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
      AddressRegisters.insert(ARM::R12);
    // Remove any candidate register that already carries an argument.
    for (const CCValAssign &AL : ArgLocs)
      if (AL.isRegLoc())
        AddressRegisters.erase(AL.getLocReg());
    if (AddressRegisters.empty()) {
      LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
      return false;
    }
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Exception-handling functions need a special set of instructions to indicate
  // a return to the hardware. Tail-calling another function would probably
  // break this.
  if (CallerF.hasFnAttribute("interrupt")) {
    LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
    return false;
  }

  // With guaranteed tail-call optimization, only matching calling
  // conventions are accepted.
  if (canGuaranteeTCO(CalleeCC,
                      getTargetMachine().Options.GuaranteedTailCallOpt)) {
    LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
                      << " (guaranteed tail-call CC)\n");
    return CalleeCC == CallerCC;
  }

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
  bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
  if (isCalleeStructRet != isCallerStructRet) {
    LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
    return false;
  }

  // Externally-defined functions with weak linkage should not be
  // tail-called on ARM when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
         TT.isOSBinFormatMachO())) {
      LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
      return false;
    }
  }

  // Check that the call results are passed in the same way.
  LLVMContext &C = *DAG.getContext();
      getEffectiveCallingConv(CalleeCC, isVarArg),
      getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
      CCAssignFnForReturn(CalleeCC, isVarArg),
      CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
    LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
    return false;
  }
  // The callee has to preserve all registers the caller needs to preserve.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (CalleeCC != CallerCC) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
      LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
      return false;
    }
  }

  // If Caller's vararg argument has been split between registers and stack, do
  // not perform tail call, since part of the argument is in caller's local
  // frame.
  const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
  if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
    LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
    return false;
  }

  // If the callee takes no arguments then go on to check the results of the
  // call.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
    LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
    return false;
  }

  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
    return false;

  LLVM_DEBUG(dbgs() << "true\n");
  return true;
}
2864
/// CanLowerReturn - Return true when every value described by Outs can be
/// assigned a return location by the routine selected via
/// CCAssignFnForReturn for this calling convention / variadic-ness, i.e.
/// CCState::CheckReturn succeeds.
bool
ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  LLVMContext &Context, const Type *RetTy) const {
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
}
2874
                                     const SDLoc &DL, SelectionDAG &DAG) {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();

  // The attribute's string value names the exception class being returned
  // from; an empty value gets the default (+4) offset below.
  StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();

  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
  // version of the "preferred return address". These offsets affect the return
  // instruction if this is a return from PL1 without hypervisor extensions.
  //   IRQ/FIQ: +4    "subs pc, lr, #4"
  //   SWI:     0     "subs pc, lr, #0"
  //   ABORT:   +4    "subs pc, lr, #4"
  //   UNDEF:   +4/+2 "subs pc, lr, #0"
  // UNDEF varies depending on where the exception came from ARM or Thumb
  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.

  int64_t LROffset;
  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
      IntKind == "ABORT")
    LROffset = 4;
  else if (IntKind == "SWI" || IntKind == "UNDEF")
    LROffset = 0;
  else
    report_fatal_error("Unsupported interrupt attribute. If present, value "
                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");

  // The LR adjustment becomes operand #1 of the return node, directly after
  // the chain operand.
  RetOps.insert(RetOps.begin() + 1,
                DAG.getConstant(LROffset, DL, MVT::i32, false));

  return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
}
2907
/// Lower an IR 'ret' into the ARM return sequence: each return value is
/// copied into its assigned physical register(s), then the function emits
/// ARMISD::RET_GLUE (normal return), ARMISD::SERET_GLUE (CMSE nonsecure
/// entry) or an interrupt return ("interrupt" attribute on non-M-class).
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Glue;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  // Record how many registers this return uses in the function info.
  AFI->setReturnRegsCount(RVLocs.size());

 // Report error if cmse entry function returns structure through first ptr arg.
  if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
    // Note: using an empty SDLoc(), as the first line of the function is a
    // better place to report than the last line.
    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
        "secure entry function would return value through pointer",
        SDLoc().getDebugLoc()));
  }

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];
    bool ReturnF16 = false;

    if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
      // Half-precision return values can be returned like this:
      //
      // t11 f16 = fadd ...
      // t12: i16 = bitcast t11
      // t13: i32 = zero_extend t12
      // t14: f32 = bitcast t13  <~~~~~~~ Arg
      //
      // to avoid code generation for bitcasts, we simply set Arg to the node
      // that produces the f16 value, t11 in this case.
      //
      if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
        SDValue ZE = Arg.getOperand(0);
        if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
          SDValue BC = ZE.getOperand(0);
          if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
            Arg = BC.getOperand(0);
            ReturnF16 = true;
          }
        }
      }
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      if (!ReturnF16)
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    // Mask f16 arguments if this is a CMSE nonsecure entry.
    auto RetVT = Outs[realRVLocIdx].ArgVT;
    if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
      if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
        Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
      } else {
        // Zero the bits above the f16 payload so no secure state leaks out
        // through the unused top half of the return register.
        auto LocBits = VA.getLocVT().getSizeInBits();
        auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
        SDValue Mask =
            DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
        Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      }
    }

    if (VA.needsCustom() &&
        (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        Chain =
            DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                             HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
        Glue = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain =
            DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                             HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
        Glue = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
      Glue = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(
        VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
  }
  // Any callee-saved registers returned "via copy" are added as implicit
  // register operands so their copies stay live until the return.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Glue.getNode())
    RetOps.push_back(Glue);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  // CMSE nonsecure-entry functions return through a dedicated node.
  unsigned RetNode =
      AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
  return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
}
3083
3084bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3085 if (N->getNumValues() != 1)
3086 return false;
3087 if (!N->hasNUsesOfValue(1, 0))
3088 return false;
3089
3090 SDValue TCChain = Chain;
3091 SDNode *Copy = *N->user_begin();
3092 if (Copy->getOpcode() == ISD::CopyToReg) {
3093 // If the copy has a glue operand, we conservatively assume it isn't safe to
3094 // perform a tail call.
3095 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3096 return false;
3097 TCChain = Copy->getOperand(0);
3098 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3099 SDNode *VMov = Copy;
3100 // f64 returned in a pair of GPRs.
3101 SmallPtrSet<SDNode*, 2> Copies;
3102 for (SDNode *U : VMov->users()) {
3103 if (U->getOpcode() != ISD::CopyToReg)
3104 return false;
3105 Copies.insert(U);
3106 }
3107 if (Copies.size() > 2)
3108 return false;
3109
3110 for (SDNode *U : VMov->users()) {
3111 SDValue UseChain = U->getOperand(0);
3112 if (Copies.count(UseChain.getNode()))
3113 // Second CopyToReg
3114 Copy = U;
3115 else {
3116 // We are at the top of this chain.
3117 // If the copy has a glue operand, we conservatively assume it
3118 // isn't safe to perform a tail call.
3119 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3120 return false;
3121 // First CopyToReg
3122 TCChain = UseChain;
3123 }
3124 }
3125 } else if (Copy->getOpcode() == ISD::BITCAST) {
3126 // f32 returned in a single GPR.
3127 if (!Copy->hasOneUse())
3128 return false;
3129 Copy = *Copy->user_begin();
3130 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3131 return false;
3132 // If the copy has a glue operand, we conservatively assume it isn't safe to
3133 // perform a tail call.
3134 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3135 return false;
3136 TCChain = Copy->getOperand(0);
3137 } else {
3138 return false;
3139 }
3140
3141 bool HasRet = false;
3142 for (const SDNode *U : Copy->users()) {
3143 if (U->getOpcode() != ARMISD::RET_GLUE &&
3144 U->getOpcode() != ARMISD::INTRET_GLUE)
3145 return false;
3146 HasRet = true;
3147 }
3148
3149 if (!HasRet)
3150 return false;
3151
3152 Chain = TCChain;
3153 return true;
3154}
3155
3156bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3157 if (!Subtarget->supportsTailCall())
3158 return false;
3159
3160 if (!CI->isTailCall())
3161 return false;
3162
3163 return true;
3164}
3165
// Trying to write a 64 bit value so need to split into two 32 bit values first,
// and pass the lower and high parts through.
  SDLoc DL(Op);
  // Operand 2 of WRITE_REGISTER is the value to write (operand 0 is the
  // chain, operand 1 the register identifier).
  SDValue WriteValue = Op->getOperand(2);

  // This function is only supposed to be called for i64 type argument.
  assert(WriteValue.getValueType() == MVT::i64
          && "LowerWRITE_REGISTER called for non-i64 type argument.");

  // Split the i64 into lo/hi i32 halves and re-emit WRITE_REGISTER with
  // both halves as separate operands.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
  SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
  return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
}
3181
3182// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3183// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3184// one of the above mentioned nodes. It has to be wrapped because otherwise
3185// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3186// be used to form addressing mode. These wrapped nodes will be selected
3187// into MOVi.
SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  // FIXME there is no actual debug info here
  SDLoc dl(Op);
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDValue Res;

  // When generating execute-only code Constant Pools must be promoted to the
  // global data section. It's a bit ugly that we can't share them across basic
  // blocks, but this way we guarantee that execute-only behaves correct with
  // position-independent addressing modes.
  if (Subtarget->genExecuteOnly()) {
    auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
    auto *T = CP->getType();
    auto C = const_cast<Constant*>(CP->getConstVal());
    auto M = DAG.getMachineFunction().getFunction().getParent();
    // Promote the constant into an internal global with a name made unique
    // by the function number and a fresh PIC label id, then lower the
    // reference as an ordinary global address.
    auto GV = new GlobalVariable(
                    *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
                    Twine(DAG.getDataLayout().getInternalSymbolPrefix()) + "CP" +
                        Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
                        Twine(AFI->createPICLabelUId()));
    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
    return LowerGlobalAddress(GA, DAG);
  }

  // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
  // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
  Align CPAlign = CP->getAlign();
  if (Subtarget->isThumb1Only())
    CPAlign = std::max(CPAlign, Align(4));
    Res =
        DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}
3226
  // NOTE(review): the enclosing declaration for this body lies outside the
  // visible region; it appears to select how jump-table entries are emitted
  // for this subtarget -- confirm against the full file.
  // If we don't have a 32-bit pc-relative branch instruction then the jump
  // table consists of block addresses. Usually this is inline, but for
  // execute-only it must be placed out-of-line.
  if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
}
3235
/// Lower a BlockAddress either to a direct constant-pool load (non-PIC), or
/// to a constant-pool load of a PC-relative entry followed by a PIC_ADD of
/// the current PC label (PIC / ROPI).
SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  unsigned ARMPCLabelIndex = 0;
  SDLoc DL(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  SDValue CPAddr;
  bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
  if (!IsPositionIndependent) {
    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
  } else {
    // The PC read by the PIC add is the instruction address + 8 in ARM mode
    // and + 4 in Thumb mode; the constant-pool entry compensates for that.
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMPCLabelIndex = AFI->createPICLabelUId();
        ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
                                        ARMCP::CPBlockAddress, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
  }
  // Wrap the target constant-pool node so it can be selected into an address.
  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
  SDValue Result = DAG.getLoad(
      PtrVT, DL, DAG.getEntryNode(), CPAddr,
  if (!IsPositionIndependent)
    return Result;
  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}
3265
3266/// Convert a TLS address reference into the correct sequence of loads
3267/// and calls to compute the variable's address for Darwin, and return an
3268/// SDValue containing the final node.
3269
3270/// Darwin only has one TLS scheme which must be capable of dealing with the
3271/// fully general situation, in the worst case. This means:
3272/// + "extern __thread" declaration.
3273/// + Defined in a possibly unknown dynamic library.
3274///
3275/// The general system is that each __thread variable has a [3 x i32] descriptor
3276/// which contains information used by the runtime to calculate the address. The
3277/// only part of this the compiler needs to know about is the first word, which
3278/// contains a function pointer that must be called with the address of the
3279/// entire descriptor in "r0".
3280///
3281/// Since this descriptor may be in a different unit, in general access must
3282/// proceed along the usual ARM rules. A common sequence to produce is:
3283///
3284/// movw rT1, :lower16:_var$non_lazy_ptr
3285/// movt rT1, :upper16:_var$non_lazy_ptr
3286/// ldr r0, [rT1]
3287/// ldr rT2, [r0]
3288/// blx rT2
3289/// [...address now in r0...]
// NOTE(review): this listing is missing a few original lines (the
// MachinePointerInfo arguments of the load at 3306-3308 and the
// getRegisterInfo() call at 3319).
3290SDValue
3291ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3292 SelectionDAG &DAG) const {
3293 assert(getTargetMachine().getTargetTriple().isOSDarwin() &&
3294 "This function expects a Darwin target");
3295 SDLoc DL(Op);
3296
3297 // First step is to get the address of the actual global symbol. This is where
3298 // the TLS descriptor lives.
3299 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3300
3301 // The first entry in the descriptor is a function pointer that we must call
3302 // to obtain the address of the variable.
3303 SDValue Chain = DAG.getEntryNode();
3304 SDValue FuncTLVGet = DAG.getLoad(
3305 MVT::i32, DL, Chain, DescAddr,
3309 Chain = FuncTLVGet.getValue(1);
3310
3311 MachineFunction &F = DAG.getMachineFunction();
3312 MachineFrameInfo &MFI = F.getFrameInfo();
// The helper call adjusts the stack, so record that for frame lowering.
3313 MFI.setAdjustsStack(true);
3314
3315 // TLS calls preserve all registers except those that absolutely must be
3316 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3317 // silly).
3318 auto TRI =
3320 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3321 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3322
3323 // Finally, we can make the call. This is just a degenerate version of a
3324 // normal ARM call node: r0 takes the address of the descriptor, and
3325 // returns the address of the variable in this thread.
3326 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3327 Chain =
3328 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3329 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3330 DAG.getRegisterMask(Mask), Chain.getValue(1));
// The thread-local variable's address is returned in R0.
3331 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3332}
3333
// Windows TLS lowering: read the TEB via an MRC coprocessor read, chase the
// ThreadLocalStoragePointer / _tls_index indirections, then add the
// variable's SECREL offset (loaded from a constant pool).
// NOTE(review): the MachinePointerInfo argument of the final load
// (original line 3386) is missing from this listing.
3334SDValue
3335ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3336 SelectionDAG &DAG) const {
3337 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
3338 "Windows specific TLS lowering");
3339
3340 SDValue Chain = DAG.getEntryNode();
3341 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3342 SDLoc DL(Op);
3343
3344 // Load the current TEB (thread environment block)
// MRC p15, 0, <Rt>, c13, c0, 2 — reads the user read/write thread ID
// register (TPIDRURW), which Windows uses to hold the TEB pointer.
3345 SDValue Ops[] = {Chain,
3346 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3347 DAG.getTargetConstant(15, DL, MVT::i32),
3348 DAG.getTargetConstant(0, DL, MVT::i32),
3349 DAG.getTargetConstant(13, DL, MVT::i32),
3350 DAG.getTargetConstant(0, DL, MVT::i32),
3351 DAG.getTargetConstant(2, DL, MVT::i32)};
3352 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3353 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3354
3355 SDValue TEB = CurrentTEB.getValue(0);
3356 Chain = CurrentTEB.getValue(1);
3357
3358 // Load the ThreadLocalStoragePointer from the TEB
3359 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3360 SDValue TLSArray =
3361 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3362 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3363
3364 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3365 // offset into the TLSArray.
3366
3367 // Load the TLS index from the C runtime
3368 SDValue TLSIndex =
3369 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3370 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3371 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3372
// Index scaled by 4 (shift left by 2) to address the pointer-sized slot.
3373 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3374 DAG.getConstant(2, DL, MVT::i32));
3375 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3376 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3377 MachinePointerInfo());
3378
3379 // Get the offset of the start of the .tls section (section base)
3380 const auto *GA = cast<GlobalAddressSDNode>(Op);
3381 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3382 SDValue Offset = DAG.getLoad(
3383 PtrVT, DL, Chain,
3384 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3385 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3387
// Final address = per-thread TLS block base + variable's section offset.
3388 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3389}
3390
3391// Lower ISD::GlobalTLSAddress using the "general dynamic" model
// Builds a TLSGD constant-pool entry, forms its pc-relative address, and
// calls __tls_get_addr with it; the call's i32 result is the variable's
// address.
// NOTE(review): this listing is missing the MachinePointerInfo argument of
// the load (3408), the ArgListTy declaration (3415), and the calling
// convention / return type arguments of setLibCallee (3421).
3392SDValue
3393ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3394 SelectionDAG &DAG) const {
3395 SDLoc dl(GA);
3396 EVT PtrVT = getPointerTy(DAG.getDataLayout());
// PC bias: 4 in Thumb state, 8 in ARM state.
3397 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3398 MachineFunction &MF = DAG.getMachineFunction();
3399 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3400 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3401 ARMConstantPoolValue *CPV =
3402 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3403 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3404 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3405 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3406 Argument = DAG.getLoad(
3407 PtrVT, dl, DAG.getEntryNode(), Argument,
3409 SDValue Chain = Argument.getValue(1);
3410
// Make the loaded GOT offset pc-relative via PIC_ADD.
3411 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3412 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3413
3414 // call __tls_get_addr.
3416 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3417
3418 // FIXME: is there useful debug info available here?
3419 TargetLowering::CallLoweringInfo CLI(DAG);
3420 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3422 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3423
3424 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3425 return CallResult.first;
3426}
3427
3428// Lower ISD::GlobalTLSAddress using the "initial exec" or
3429// "local exec" model.
// In both models the final address is thread-pointer + per-variable offset;
// they differ only in how the offset is obtained (GOT-indirect pc-relative
// load for initial-exec, direct constant-pool load for local-exec).
// NOTE(review): this listing is missing the Offset declaration (3436), the
// ARMCP::GOTTPOFF/TPOFF arguments (3450, 3469), and the MachinePointerInfo
// arguments of the loads (3456, 3464, 3474).
3430SDValue
3431ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3432 SelectionDAG &DAG,
3433 TLSModel::Model model) const {
3434 const GlobalValue *GV = GA->getGlobal();
3435 SDLoc dl(GA);
3437 SDValue Chain = DAG.getEntryNode();
3438 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3439 // Get the Thread Pointer
3440 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3441
3442 if (model == TLSModel::InitialExec) {
3443 MachineFunction &MF = DAG.getMachineFunction();
3444 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3445 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3446 // Initial exec model.
3447 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3448 ARMConstantPoolValue *CPV =
3449 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3451 true);
3452 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3453 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
// First load: the GOT slot address (pc-relative after PIC_ADD below).
3454 Offset = DAG.getLoad(
3455 PtrVT, dl, Chain, Offset,
3457 Chain = Offset.getValue(1);
3458
3459 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3460 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3461
// Second load: the actual thread-pointer offset stored in the GOT.
3462 Offset = DAG.getLoad(
3463 PtrVT, dl, Chain, Offset,
3465 } else {
3466 // local exec model
3467 assert(model == TLSModel::LocalExec);
3468 ARMConstantPoolValue *CPV =
3470 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3471 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3472 Offset = DAG.getLoad(
3473 PtrVT, dl, Chain, Offset,
3475 }
3476
3477 // The address of the thread local variable is the add of the thread
3478 // pointer with the offset of the variable.
3479 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3480}
3481
// Top-level TLS dispatcher: emulated TLS, then Darwin/Windows schemes, then
// the ELF models (general-dynamic vs. initial/local-exec).
// NOTE(review): this listing is missing the TLSModel query (3497) and the
// case labels of the switch (3500-3505).
3482SDValue
3483ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3484 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
// Emulated TLS takes priority over every native scheme.
3485 if (DAG.getTarget().useEmulatedTLS())
3486 return LowerToTLSEmulatedModel(GA, DAG);
3487
3488 const Triple &TT = getTargetMachine().getTargetTriple();
3489 if (TT.isOSDarwin())
3490 return LowerGlobalTLSAddressDarwin(Op, DAG);
3491
3492 if (TT.isOSWindows())
3493 return LowerGlobalTLSAddressWindows(Op, DAG);
3494
3495 // TODO: implement the "local dynamic" model
3496 assert(TT.isOSBinFormatELF() && "Only ELF implemented here");
3498
3499 switch (model) {
3502 return LowerToTLSGeneralDynamicModel(GA, DAG);
3505 return LowerToTLSExecModels(GA, DAG, model);
3506 }
3507 llvm_unreachable("bogus TLS model");
3508}
3509
3510/// Return true if all users of V are within function F, looking through
3511/// ConstantExprs.
3512static bool allUsersAreInFunction(const Value *V, const Function *F) {
3513 SmallVector<const User*,4> Worklist(V->users());
3514 while (!Worklist.empty()) {
3515 auto *U = Worklist.pop_back_val();
3516 if (isa<ConstantExpr>(U)) {
3517 append_range(Worklist, U->users());
3518 continue;
3519 }
3520
3521 auto *I = dyn_cast<Instruction>(U);
3522 if (!I || I->getParent()->getParent() != F)
3523 return false;
3524 }
3525 return true;
3526}
3527
// Try to promote a small, constant, unnamed_addr, local-linkage global into
// the constant pool so it can be loaded pc-relatively without a GOT/literal
// indirection.  Returns an ARMISD::Wrapper'd constant-pool address on
// success, or an empty SDValue when promotion is not possible/profitable.
// NOTE(review): this listing is missing the function signature (original
// line 3528) and several interior lines (the fast-isel bail-out condition
// at 3546-3547, the MachineFunction lookup at 3580, the padded-initializer
// construction at 3606/3610, and the promotion bookkeeping at 3616-3617).
3529 const GlobalValue *GV, SelectionDAG &DAG,
3530 EVT PtrVT, const SDLoc &dl) {
3531 // If we're creating a pool entry for a constant global with unnamed address,
3532 // and the global is small enough, we can emit it inline into the constant pool
3533 // to save ourselves an indirection.
3534 //
3535 // This is a win if the constant is only used in one function (so it doesn't
3536 // need to be duplicated) or duplicating the constant wouldn't increase code
3537 // size (implying the constant is no larger than 4 bytes).
3538 const Function &F = DAG.getMachineFunction().getFunction();
3539
3540 // We rely on this decision to inline being idempotent and unrelated to the
3541 // use-site. We know that if we inline a variable at one use site, we'll
3542 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3543 // doesn't know about this optimization, so bail out if it's enabled else
3544 // we could decide to inline here (and thus never emit the GV) but require
3545 // the GV from fast-isel generated code.
3548 return SDValue();
3549
// Only local, constant, unnamed_addr globals with an initializer qualify.
3550 auto *GVar = dyn_cast<GlobalVariable>(GV);
3551 if (!GVar || !GVar->hasInitializer() ||
3552 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3553 !GVar->hasLocalLinkage())
3554 return SDValue();
3555
3556 // If we inline a value that contains relocations, we move the relocations
3557 // from .data to .text. This is not allowed in position-independent code.
3558 auto *Init = GVar->getInitializer();
3559 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3560 Init->needsDynamicRelocation())
3561 return SDValue();
3562
3563 // The constant islands pass can only really deal with alignment requests
3564 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3565 // any type wanting greater alignment requirements than 4 bytes. We also
3566 // can only promote constants that are multiples of 4 bytes in size or
3567 // are paddable to a multiple of 4. Currently we only try and pad constants
3568 // that are strings for simplicity.
3569 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3570 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3571 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
// RequiredPadding == 4 means Size is already a multiple of 4 (no padding).
3572 unsigned RequiredPadding = 4 - (Size % 4);
3573 bool PaddingPossible =
3574 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3575 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3576 Size == 0)
3577 return SDValue();
3578
3579 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3581 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3582
3583 // We can't bloat the constant pool too much, else the ConstantIslands pass
3584 // may fail to converge. If we haven't promoted this global yet (it may have
3585 // multiple uses), and promoting it would increase the constant pool size (Sz
3586 // > 4), ensure we have space to do so up to MaxTotal.
3587 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3588 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3590 return SDValue();
3591
3592 // This is only valid if all users are in a single function; we can't clone
3593 // the constant in general. The LLVM IR unnamed_addr allows merging
3594 // constants, but not cloning them.
3595 //
3596 // We could potentially allow cloning if we could prove all uses of the
3597 // constant in the current function don't care about the address, like
3598 // printf format strings. But that isn't implemented for now.
3599 if (!allUsersAreInFunction(GVar, &F))
3600 return SDValue();
3601
3602 // We're going to inline this global. Pad it out if needed.
3603 if (RequiredPadding != 4) {
3604 StringRef S = CDAInit->getAsString();
3605
3607 std::copy(S.bytes_begin(), S.bytes_end(), V.begin())
3608 while (RequiredPadding--)
3609 V.push_back(0);
3611 }
3612
3613 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3614 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3615 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3618 PaddedSize - 4);
3619 }
// Statistic counter tracking how many globals were promoted.
3620 ++NumConstpoolPromoted;
3621 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3622}
3623
// Returns true if GV (looking through at most one alias) refers to read-only
// data: a constant GlobalVariable or a Function.
// NOTE(review): the signature line (original 3624, `static bool
// isReadOnly(const GlobalValue *GV) {`) is missing from this listing.
3625 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
// An alias with no resolvable aliasee object is treated as not read-only.
3626 if (!(GV = GA->getAliaseeObject()))
3627 return false;
3628 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3629 return V->isConstant();
// Function code is always read-only.
3630 return isa<Function>(GV);
3631}
3632
3633SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3634 SelectionDAG &DAG) const {
3635 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3636 default: llvm_unreachable("unknown object format");
3637 case Triple::COFF:
3638 return LowerGlobalAddressWindows(Op, DAG);
3639 case Triple::ELF:
3640 return LowerGlobalAddressELF(Op, DAG);
3641 case Triple::MachO:
3642 return LowerGlobalAddressDarwin(Op, DAG);
3643 }
3644}
3645
// ELF global-address lowering.  Tries constant-pool promotion first, then
// picks an addressing strategy: GOT-indirect for PIC, pc-relative for
// ROPI read-only data, SB-relative for RWPI writable data, movw/movt when
// available, and a literal-pool load as the fallback.
// NOTE(review): several original lines are missing from this listing
// (3659, 3665, 3681, 3686, 3709 — mostly MachinePointerInfo arguments and
// one getTargetGlobalAddress/Create call).
3646SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3647 SelectionDAG &DAG) const {
3648 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3649 SDLoc dl(Op);
3650 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3651 bool IsRO = isReadOnly(GV);
3652
3653 // promoteToConstantPool only if not generating XO text section
3654 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3655 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3656 return V;
3657
3658 if (isPositionIndependent()) {
3660 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3661 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
// Non-dso-local globals need an extra load through the GOT entry.
3662 if (!GV->isDSOLocal())
3663 Result =
3664 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3666 return Result;
3667 } else if (Subtarget->isROPI() && IsRO) {
3668 // PC-relative.
3669 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3670 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3671 return Result;
3672 } else if (Subtarget->isRWPI() && !IsRO) {
3673 // SB-relative.
3674 SDValue RelAddr;
3675 if (Subtarget->useMovt()) {
3676 ++NumMovwMovt;
3677 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3678 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3679 } else { // use literal pool for address constant
3680 ARMConstantPoolValue *CPV =
3682 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3683 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3684 RelAddr = DAG.getLoad(
3685 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3687 }
// R9 is the static base (SB) register under RWPI.
3688 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3689 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3690 return Result;
3691 }
3692
3693 // If we have T2 ops, we can materialize the address directly via movt/movw
3694 // pair. This is always cheaper. If need to generate Execute Only code, and we
3695 // only have Thumb1 available, we can't use a constant pool and are forced to
3696 // use immediate relocations.
3697 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3698 if (Subtarget->useMovt())
3699 ++NumMovwMovt;
3700 // FIXME: Once remat is capable of dealing with instructions with register
3701 // operands, expand this into two nodes.
3702 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3703 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3704 } else {
// Fallback: load the address from a literal pool entry.
3705 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3706 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3707 return DAG.getLoad(
3708 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3710 }
3711}
3712
// Darwin (Mach-O) global-address lowering: wrap the non-lazy symbol and add
// an extra load when the global is accessed through an indirect symbol stub.
// NOTE(review): the MachinePointerInfo argument of the load (original line
// 3734) is missing from this listing.
3713SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3714 SelectionDAG &DAG) const {
3715 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3716 "ROPI/RWPI not currently supported for Darwin");
3717 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3718 SDLoc dl(Op);
3719 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3720
3721 if (Subtarget->useMovt())
3722 ++NumMovwMovt;
3723
3724 // FIXME: Once remat is capable of dealing with instructions with register
3725 // operands, expand this into multiple nodes
3726 unsigned Wrapper =
3727 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3728
3729 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3730 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3731
// Indirect symbols require loading the real address from the stub.
3732 if (Subtarget->isGVIndirectSymbol(GV))
3733 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3735 return Result;
3736}
3737
// Windows (COFF) global-address lowering: always movw/movt, with an extra
// load through the import table (__imp_) or a COFF stub for non-dso-local
// globals.
// NOTE(review): this listing is missing the Result declaration (3755) and
// the MachinePointerInfo argument of the load (3767).
3738SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3739 SelectionDAG &DAG) const {
3740 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
3741 "non-Windows COFF is not supported");
3742 assert(Subtarget->useMovt() &&
3743 "Windows on ARM expects to use movw/movt");
3744 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3745 "ROPI/RWPI not currently supported for Windows");
3746
3747 const TargetMachine &TM = getTargetMachine();
3748 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
// dllimport takes priority over the COFF-stub path.
3749 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3750 if (GV->hasDLLImportStorageClass())
3751 TargetFlags = ARMII::MO_DLLIMPORT;
3752 else if (!TM.shouldAssumeDSOLocal(GV))
3753 TargetFlags = ARMII::MO_COFFSTUB;
3754 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3756 SDLoc DL(Op);
3757
3758 ++NumMovwMovt;
3759
3760 // FIXME: Once remat is capable of dealing with instructions with register
3761 // operands, expand this into two nodes.
3762 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3763 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3764 TargetFlags));
// Both import-table and stub accesses are one level of indirection.
3765 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3766 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3768 return Result;
3769}
3770
3771SDValue
3772ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3773 SDLoc dl(Op);
3774 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3775 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3776 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3777 Op.getOperand(1), Val);
3778}
3779
3780SDValue
3781ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3782 SDLoc dl(Op);
3783 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3784 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3785}
3786
3787SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3788 SelectionDAG &DAG) const {
3789 SDLoc dl(Op);
3790 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3791 Op.getOperand(0));
3792}
3793
// Custom lowering for void intrinsics.  Currently only handles
// arm_gnu_eabi_mcount, which is lowered to a call of "\01__gnu_mcount_nc"
// with LR passed explicitly (pushed by the callee-profiling stub).
// NOTE(review): this listing is missing the getCallPreservedMask call
// (3809) and the RegisterMask declaration (3818).
3794SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3795 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
// Operand 0 is the chain when the node has one; index past it if so.
3796 unsigned IntNo =
3797 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
3798 switch (IntNo) {
3799 default:
3800 return SDValue(); // Don't custom lower most intrinsics.
3801 case Intrinsic::arm_gnu_eabi_mcount: {
3802 MachineFunction &MF = DAG.getMachineFunction();
3803 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3804 SDLoc dl(Op);
3805 SDValue Chain = Op.getOperand(0);
3806 // call "\01__gnu_mcount_nc"
3807 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3808 const uint32_t *Mask =
3810 assert(Mask && "Missing call preserved mask for calling convention");
3811 // Mark LR an implicit live-in.
3812 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3813 SDValue ReturnAddress =
3814 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3815 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3816 SDValue Callee =
3817 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
// Thumb needs the predicated tBL_PUSHLR form; ARM uses BL_PUSHLR.
3819 if (Subtarget->isThumb())
3820 return SDValue(
3821 DAG.getMachineNode(
3822 ARM::tBL_PUSHLR, dl, ResultTys,
3823 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3824 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3825 0);
3826 return SDValue(
3827 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3828 {ReturnAddress, Callee, RegisterMask, Chain}),
3829 0);
3830 }
3831 }
3832}
3833
// Custom lowering for chainless intrinsics: maps ARM/NEON/MVE intrinsics
// onto generic ISD opcodes or ARMISD target nodes where a direct
// correspondence exists.
// NOTE(review): the MachinePointerInfo argument of the eh_sjlj_lsda load
// (original line 3881) is missing from this listing.
3834SDValue
3835ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3836 const ARMSubtarget *Subtarget) const {
3837 unsigned IntNo = Op.getConstantOperandVal(0);
3838 SDLoc dl(Op);
3839 switch (IntNo) {
3840 default: return SDValue(); // Don't custom lower most intrinsics.
3841 case Intrinsic::thread_pointer: {
3842 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3843 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3844 }
3845 case Intrinsic::arm_cls: {
3846 // Note: arm_cls and arm_cls64 intrinsics are expanded directly here
3847 // in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS
3848 // instruction.
3849 const SDValue &Operand = Op.getOperand(1);
3850 const EVT VTy = Op.getValueType();
3851 return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
3852 }
3853 case Intrinsic::arm_cls64: {
3854 // arm_cls64 returns i32 but takes i64 input.
3855 // Use ISD::CTLS for i64 and truncate the result.
3856 SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Op.getOperand(1));
3857 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
3858 }
3859 case Intrinsic::arm_neon_vcls:
3860 case Intrinsic::arm_mve_vcls: {
3861 // Lower vector CLS intrinsics to ISD::CTLS.
3862 // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
3863 const EVT VTy = Op.getValueType();
3864 return DAG.getNode(ISD::CTLS, dl, VTy, Op.getOperand(1));
3865 }
3866 case Intrinsic::eh_sjlj_lsda: {
// Materialize the LSDA address via a constant-pool load, with a PIC_ADD
// under position-independent code.
3867 MachineFunction &MF = DAG.getMachineFunction();
3868 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3869 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3870 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3871 SDValue CPAddr;
3872 bool IsPositionIndependent = isPositionIndependent();
3873 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3874 ARMConstantPoolValue *CPV =
3875 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3876 ARMCP::CPLSDA, PCAdj);
3877 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3878 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3879 SDValue Result = DAG.getLoad(
3880 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3882
3883 if (IsPositionIndependent) {
3884 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3885 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3886 }
3887 return Result;
3888 }
3889 case Intrinsic::arm_neon_vabs:
3890 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3891 Op.getOperand(1));
// vabds is overloaded: only the integer form maps to ISD::ABDS.
3892 case Intrinsic::arm_neon_vabds:
3893 if (Op.getValueType().isInteger())
3894 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
3895 Op.getOperand(1), Op.getOperand(2));
3896 return SDValue();
3897 case Intrinsic::arm_neon_vabdu:
3898 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
3899 Op.getOperand(1), Op.getOperand(2));
3900 case Intrinsic::arm_neon_vmulls:
3901 case Intrinsic::arm_neon_vmullu: {
3902 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3903 ? ARMISD::VMULLs : ARMISD::VMULLu;
3904 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3905 Op.getOperand(1), Op.getOperand(2));
3906 }
3907 case Intrinsic::arm_neon_vminnm:
3908 case Intrinsic::arm_neon_vmaxnm: {
3909 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3910 ? ISD::FMINNUM : ISD::FMAXNUM;
3911 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3912 Op.getOperand(1), Op.getOperand(2));
3913 }
3914 case Intrinsic::arm_neon_vminu:
3915 case Intrinsic::arm_neon_vmaxu: {
3916 if (Op.getValueType().isFloatingPoint())
3917 return SDValue();
3918 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3919 ? ISD::UMIN : ISD::UMAX;
3920 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3921 Op.getOperand(1), Op.getOperand(2));
3922 }
3923 case Intrinsic::arm_neon_vmins:
3924 case Intrinsic::arm_neon_vmaxs: {
3925 // v{min,max}s is overloaded between signed integers and floats.
3926 if (!Op.getValueType().isFloatingPoint()) {
3927 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3928 ? ISD::SMIN : ISD::SMAX;
3929 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3930 Op.getOperand(1), Op.getOperand(2));
3931 }
3932 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3933 ? ISD::FMINIMUM : ISD::FMAXIMUM;
3934 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3935 Op.getOperand(1), Op.getOperand(2));
3936 }
3937 case Intrinsic::arm_neon_vtbl1:
3938 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
3939 Op.getOperand(1), Op.getOperand(2));
3940 case Intrinsic::arm_neon_vtbl2:
3941 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
3942 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3943 case Intrinsic::arm_mve_pred_i2v:
3944 case Intrinsic::arm_mve_pred_v2i:
3945 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
3946 Op.getOperand(1));
3947 case Intrinsic::arm_mve_vreinterpretq:
3948 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
3949 Op.getOperand(1));
3950 case Intrinsic::arm_mve_lsll:
3951 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
3952 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3953 case Intrinsic::arm_mve_asrl:
3954 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
3955 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3956 case Intrinsic::arm_mve_vsli:
3957 return DAG.getNode(ARMISD::VSLIIMM, SDLoc(Op), Op->getVTList(),
3958 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3959 case Intrinsic::arm_mve_vsri:
3960 return DAG.getNode(ARMISD::VSRIIMM, SDLoc(Op), Op->getVTList(),
3961 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3962 }
3963}
3964
// Lower ISD::ATOMIC_FENCE: single-thread fences are no-ops; pre-DMB cores
// use the CP15 MCR barrier; everything else becomes a DMB with an
// appropriate domain.
// NOTE(review): this listing is missing the function signature (original
// line 3965) and the Domain variable lines (3984, 3987, 3993).
3966 const ARMSubtarget *Subtarget) {
3967 SDLoc dl(Op);
// Single-thread fences need no hardware barrier; return the node unchanged.
3968 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
3969 if (SSID == SyncScope::SingleThread)
3970 return Op;
3971
3972 if (!Subtarget->hasDataBarrier()) {
3973 // Some ARMv6 cpus can support data barriers with an mcr instruction.
3974 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3975 // here.
3976 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3977 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3978 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3979 DAG.getConstant(0, dl, MVT::i32));
3980 }
3981
3982 AtomicOrdering Ord =
3983 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
3985 if (Subtarget->isMClass()) {
3986 // Only a full system barrier exists in the M-class architectures.
3988 } else if (Subtarget->preferISHSTBarriers() &&
3989 Ord == AtomicOrdering::Release) {
3990 // Swift happens to implement ISHST barriers in a way that's compatible with
3991 // Release semantics but weaker than ISH so we'd be fools not to use
3992 // it. Beware: other processors probably don't!
3994 }
3995
// Emit the DMB as an intrinsic with the selected barrier domain.
3996 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
3997 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
3998 DAG.getConstant(Domain, dl, MVT::i32));
3999}
4000
// Lower ISD::PREFETCH to ARMISD::PRELOAD (PLD/PLDW/PLI), or drop it on
// subtargets without preload instructions.
// NOTE(review): the function signature (original line 4001) is missing from
// this listing.
4002 const ARMSubtarget *Subtarget) {
4003 // ARM pre v5TE and Thumb1 does not have preload instructions.
4004 if (!(Subtarget->isThumb2() ||
4005 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4006 // Just preserve the chain.
4007 return Op.getOperand(0);
4008
4009 SDLoc dl(Op);
// Operand 2 is the rw flag (1 = write); isRead is its inversion.
4010 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4011 if (!isRead &&
4012 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4013 // ARMv7 with MP extension has PLDW.
4014 return Op.getOperand(0);
4015
4016 unsigned isData = Op.getConstantOperandVal(4);
4017 if (Subtarget->isThumb()) {
4018 // Invert the bits.
4019 isRead = ~isRead & 1;
4020 isData = ~isData & 1;
4021 }
4022
4023 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4024 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4025 DAG.getConstant(isData, dl, MVT::i32));
4026}
4027
// Lower ISD::VASTART by storing the address of the var-args frame-index
// slot to the va_list pointer operand.
// NOTE(review): this listing is missing the function signature and the
// MachineFunction lookup (original lines 4028-4029) and the PtrVT
// computation (4035).
4030 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4031
4032 // vastart just stores the address of the VarArgsFrameIndex slot into the
4033 // memory location argument.
4034 SDLoc dl(Op);
4036 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// Operand 2 carries the source Value of the va_list pointer for alias info.
4037 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4038 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4039 MachinePointerInfo(SV));
4040}
4041
// Reassemble an f64 formal argument from two i32 halves: the first half is
// always in a register; the second comes either from the next register or
// from a fixed stack slot.  Returns a VMOVDRR of the two halves.
// NOTE(review): the MachinePointerInfo argument of the stack load (original
// line 4069) is missing from this listing.
4042SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4043 CCValAssign &NextVA,
4044 SDValue &Root,
4045 SelectionDAG &DAG,
4046 const SDLoc &dl) const {
4047 MachineFunction &MF = DAG.getMachineFunction();
4048 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4049
// Thumb1 can only use the low (tGPR) registers for this copy.
4050 const TargetRegisterClass *RC;
4051 if (AFI->isThumb1OnlyFunction())
4052 RC = &ARM::tGPRRegClass;
4053 else
4054 RC = &ARM::GPRRegClass;
4055
4056 // Transform the arguments stored in physical registers into virtual ones.
4057 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4058 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4059
4060 SDValue ArgValue2;
4061 if (NextVA.isMemLoc()) {
4062 MachineFrameInfo &MFI = MF.getFrameInfo();
4063 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4064
4065 // Create load node to retrieve arguments from the stack.
4066 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4067 ArgValue2 = DAG.getLoad(
4068 MVT::i32, dl, Root, FIN,
4070 } else {
4071 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4072 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4073 }
// Big-endian targets swap the halves before combining into an f64.
4074 if (!Subtarget->isLittle())
4075 std::swap (ArgValue, ArgValue2);
4076 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4077}
4078
4079// The remaining GPRs hold either the beginning of variable-argument
4080// data, or the beginning of an aggregate passed by value (usually
4081// byval). Either way, we allocate stack slots adjacent to the data
4082// provided by our caller, and store the unallocated registers there.
4083// If this is a variadic function, the va_list pointer will begin with
4084// these values; otherwise, this reassembles a (byval) structure that
4085// was split between registers and memory.
4086// Return: The frame index registers were stored into.
4087int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4088 const SDLoc &dl, SDValue &Chain,
4089 const Value *OrigArg,
4090 unsigned InRegsParamRecordIdx,
4091 int ArgOffset, unsigned ArgSize) const {
4092 // Currently, two use-cases possible:
4093 // Case #1. Non-var-args function, and we meet first byval parameter.
4094 // Setup first unallocated register as first byval register;
4095 // eat all remained registers
4096 // (these two actions are performed by HandleByVal method).
4097 // Then, here, we initialize stack frame with
4098 // "store-reg" instructions.
4099 // Case #2. Var-args function, that doesn't contain byval parameters.
4100 // The same: eat all remained unallocated registers,
4101 // initialize stack frame.
4102
4103 MachineFunction &MF = DAG.getMachineFunction();
4104 MachineFrameInfo &MFI = MF.getFrameInfo();
4105 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4106 unsigned RBegin, REnd;
4107 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4108 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4109 } else {
4110 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4111 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4112 REnd = ARM::R4;
4113 }
4114
4115 if (REnd != RBegin)
4116 ArgOffset = -4 * (ARM::R4 - RBegin);
4117
4118 auto PtrVT = getPointerTy(DAG.getDataLayout());
4119 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4120 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4121
4123 const TargetRegisterClass *RC =
4124 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4125
4126 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4127 Register VReg = MF.addLiveIn(Reg, RC);
4128 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4129 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4130 MachinePointerInfo(OrigArg, 4 * i));
4131 MemOps.push_back(Store);
4132 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4133 }
4134
4135 if (!MemOps.empty())
4136 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4137 return FrameIndex;
4138}
4139
4140// Setup stack frame, the va_list pointer will start from.
4141void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4142 const SDLoc &dl, SDValue &Chain,
4143 unsigned ArgOffset,
4144 unsigned TotalArgRegsSaveSize,
4145 bool ForceMutable) const {
4146 MachineFunction &MF = DAG.getMachineFunction();
4147 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4148
4149 // Try to store any remaining integer argument regs
4150 // to their spots on the stack so that they may be loaded by dereferencing
4151 // the result of va_next.
4152 // If there is no regs to be stored, just point address after last
4153 // argument passed via stack.
4154 int FrameIndex = StoreByValRegs(
4155 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4156 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4157 AFI->setVarArgsFrameIndex(FrameIndex);
4158}
4159
4160bool ARMTargetLowering::splitValueIntoRegisterParts(
4161 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4162 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4163 EVT ValueVT = Val.getValueType();
4164 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4165 unsigned ValueBits = ValueVT.getSizeInBits();
4166 unsigned PartBits = PartVT.getSizeInBits();
4167 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4168 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4169 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4170 Parts[0] = Val;
4171 return true;
4172 }
4173 return false;
4174}
4175
4176SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4177 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4178 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4179 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4180 unsigned ValueBits = ValueVT.getSizeInBits();
4181 unsigned PartBits = PartVT.getSizeInBits();
4182 SDValue Val = Parts[0];
4183
4184 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4185 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4186 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4187 return Val;
4188 }
4189 return SDValue();
4190}
4191
4192SDValue ARMTargetLowering::LowerFormalArguments(
4193 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4194 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4195 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4196 MachineFunction &MF = DAG.getMachineFunction();
4197 MachineFrameInfo &MFI = MF.getFrameInfo();
4198
4199 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4200
4201 // Assign locations to all of the incoming arguments.
4203 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4204 *DAG.getContext());
4205 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4206
4208 unsigned CurArgIdx = 0;
4209
4210 // Initially ArgRegsSaveSize is zero.
4211 // Then we increase this value each time we meet byval parameter.
4212 // We also increase this value in case of varargs function.
4213 AFI->setArgRegsSaveSize(0);
4214
4215 // Calculate the amount of stack space that we need to allocate to store
4216 // byval and variadic arguments that are passed in registers.
4217 // We need to know this before we allocate the first byval or variadic
4218 // argument, as they will be allocated a stack slot below the CFA (Canonical
4219 // Frame Address, the stack pointer at entry to the function).
4220 unsigned ArgRegBegin = ARM::R4;
4221 for (const CCValAssign &VA : ArgLocs) {
4222 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4223 break;
4224
4225 unsigned Index = VA.getValNo();
4226 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4227 if (!Flags.isByVal())
4228 continue;
4229
4230 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4231 unsigned RBegin, REnd;
4232 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4233 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4234
4235 CCInfo.nextInRegsParam();
4236 }
4237 CCInfo.rewindByValRegsInfo();
4238
4239 int lastInsIndex = -1;
4240 if (isVarArg && MFI.hasVAStart()) {
4241 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4242 if (RegIdx != std::size(GPRArgRegs))
4243 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4244 }
4245
4246 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4247 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4248 auto PtrVT = getPointerTy(DAG.getDataLayout());
4249
4250 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4251 CCValAssign &VA = ArgLocs[i];
4252 if (Ins[VA.getValNo()].isOrigArg()) {
4253 std::advance(CurOrigArg,
4254 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4255 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4256 }
4257 // Arguments stored in registers.
4258 if (VA.isRegLoc()) {
4259 EVT RegVT = VA.getLocVT();
4260 SDValue ArgValue;
4261
4262 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4263 // f64 and vector types are split up into multiple registers or
4264 // combinations of registers and stack slots.
4265 SDValue ArgValue1 =
4266 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4267 VA = ArgLocs[++i]; // skip ahead to next loc
4268 SDValue ArgValue2;
4269 if (VA.isMemLoc()) {
4270 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4271 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4272 ArgValue2 = DAG.getLoad(
4273 MVT::f64, dl, Chain, FIN,
4275 } else {
4276 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4277 }
4278 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4279 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4280 ArgValue1, DAG.getIntPtrConstant(0, dl));
4281 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4282 ArgValue2, DAG.getIntPtrConstant(1, dl));
4283 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4284 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4285 } else {
4286 const TargetRegisterClass *RC;
4287
4288 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4289 RC = &ARM::HPRRegClass;
4290 else if (RegVT == MVT::f32)
4291 RC = &ARM::SPRRegClass;
4292 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4293 RegVT == MVT::v4bf16)
4294 RC = &ARM::DPRRegClass;
4295 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4296 RegVT == MVT::v8bf16)
4297 RC = &ARM::QPRRegClass;
4298 else if (RegVT == MVT::i32)
4299 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4300 : &ARM::GPRRegClass;
4301 else
4302 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4303
4304 // Transform the arguments in physical registers into virtual ones.
4305 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4306 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4307
4308 // If this value is passed in r0 and has the returned attribute (e.g.
4309 // C++ 'structors), record this fact for later use.
4310 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4311 AFI->setPreservesR0();
4312 }
4313 }
4314
4315 // If this is an 8 or 16-bit value, it is really passed promoted
4316 // to 32 bits. Insert an assert[sz]ext to capture this, then
4317 // truncate to the right size.
4318 switch (VA.getLocInfo()) {
4319 default: llvm_unreachable("Unknown loc info!");
4320 case CCValAssign::Full: break;
4321 case CCValAssign::BCvt:
4322 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4323 break;
4324 }
4325
4326 // f16 arguments have their size extended to 4 bytes and passed as if they
4327 // had been copied to the LSBs of a 32-bit register.
4328 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4329 if (VA.needsCustom() &&
4330 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4331 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4332
4333 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4334 // less than 32 bits must be sign- or zero-extended in the callee for
4335 // security reasons. Although the ABI mandates an extension done by the
4336 // caller, the latter cannot be trusted to follow the rules of the ABI.
4337 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4338 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4339 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4340 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4341
4342 InVals.push_back(ArgValue);
4343 } else { // VA.isRegLoc()
4344 // Only arguments passed on the stack should make it here.
4345 assert(VA.isMemLoc());
4346 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4347
4348 int index = VA.getValNo();
4349
4350 // Some Ins[] entries become multiple ArgLoc[] entries.
4351 // Process them only once.
4352 if (index != lastInsIndex)
4353 {
4354 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4355 // FIXME: For now, all byval parameter objects are marked mutable.
4356 // This can be changed with more analysis.
4357 // In case of tail call optimization mark all arguments mutable.
4358 // Since they could be overwritten by lowering of arguments in case of
4359 // a tail call.
4360 if (Flags.isByVal()) {
4361 assert(Ins[index].isOrigArg() &&
4362 "Byval arguments cannot be implicit");
4363 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4364
4365 int FrameIndex = StoreByValRegs(
4366 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4367 VA.getLocMemOffset(), Flags.getByValSize());
4368 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4369 CCInfo.nextInRegsParam();
4370 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4371 VA.getValVT() == MVT::bf16)) {
4372 // f16 and bf16 values are passed in the least-significant half of
4373 // a 4 byte stack slot. This is done as-if the extension was done
4374 // in a 32-bit register, so the actual bytes used for the value
4375 // differ between little and big endian.
4376 assert(VA.getLocVT().getSizeInBits() == 32);
4377 unsigned FIOffset = VA.getLocMemOffset();
4378 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4379 FIOffset, true);
4380
4381 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4382 if (DAG.getDataLayout().isBigEndian())
4383 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4384
4385 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4387 DAG.getMachineFunction(), FI)));
4388
4389 } else {
4390 unsigned FIOffset = VA.getLocMemOffset();
4391 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4392 FIOffset, true);
4393
4394 // Create load nodes to retrieve arguments from the stack.
4395 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4396 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4398 DAG.getMachineFunction(), FI)));
4399 }
4400 lastInsIndex = index;
4401 }
4402 }
4403 }
4404
4405 // varargs
4406 if (isVarArg && MFI.hasVAStart()) {
4407 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4408 TotalArgRegsSaveSize);
4409 if (AFI->isCmseNSEntryFunction()) {
4410 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4412 "secure entry function must not be variadic", dl.getDebugLoc()));
4413 }
4414 }
4415
4416 unsigned StackArgSize = CCInfo.getStackSize();
4417 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4418 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4419 // The only way to guarantee a tail call is if the callee restores its
4420 // argument area, but it must also keep the stack aligned when doing so.
4421 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4422 assert(StackAlign && "data layout string is missing stack alignment");
4423 StackArgSize = alignTo(StackArgSize, *StackAlign);
4424
4425 AFI->setArgumentStackToRestore(StackArgSize);
4426 }
4427 AFI->setArgumentStackSize(StackArgSize);
4428
4429 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4430 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4432 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4433 }
4434
4435 return Chain;
4436}
4437
4438/// isFloatingPointZero - Return true if this is +0.0.
4441 return CFP->getValueAPF().isPosZero();
4442 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4443 // Maybe this has already been legalized into the constant pool?
4444 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4445 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4447 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4448 return CFP->getValueAPF().isPosZero();
4449 }
4450 } else if (Op->getOpcode() == ISD::BITCAST &&
4451 Op->getValueType(0) == MVT::f64) {
4452 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4453 // created by LowerConstantFP().
4454 SDValue BitcastOp = Op->getOperand(0);
4455 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4456 isNullConstant(BitcastOp->getOperand(0)))
4457 return true;
4458 }
4459 return false;
4460}
4461
4463 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
4464 if (Op->getFlags().hasNoSignedWrap())
4465 return true;
4466
4467 // We can still figure out if the second operand is safe to use
4468 // in a CMN instruction by checking if it is known to be not the minimum
4469 // signed value. If it is not, then we can safely use CMN.
4470 // Note: We can eventually remove this check and simply rely on
4471 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
4472 // consistently sets them appropriately when making said nodes.
4473
4474 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
4475 return !KnownSrc.getSignedMinValue().isMinSignedValue();
4476}
4477
4479 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
4480 (isIntEqualitySetCC(CC) ||
4481 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
4482 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
4483}
4484
4485/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4486/// the given operands.
4487SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4488 SDValue &ARMcc, SelectionDAG &DAG,
4489 const SDLoc &dl) const {
4490 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4491 unsigned C = RHSC->getZExtValue();
4492 if (!isLegalICmpImmediate((int32_t)C)) {
4493 // Constant does not fit, try adjusting it by one.
4494 switch (CC) {
4495 default: break;
4496 case ISD::SETLT:
4497 case ISD::SETGE:
4498 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4499 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4500 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4501 }
4502 break;
4503 case ISD::SETULT:
4504 case ISD::SETUGE:
4505 if (C != 0 && isLegalICmpImmediate(C-1)) {
4506 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4507 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4508 }
4509 break;
4510 case ISD::SETLE:
4511 case ISD::SETGT:
4512 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4513 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4514 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4515 }
4516 break;
4517 case ISD::SETULE:
4518 case ISD::SETUGT:
4519 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4520 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4521 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4522 }
4523 break;
4524 }
4525 }
4526 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4528 // In ARM and Thumb-2, the compare instructions can shift their second
4529 // operand.
4531 std::swap(LHS, RHS);
4532 }
4533
4534 // Thumb1 has very limited immediate modes, so turning an "and" into a
4535 // shift can save multiple instructions.
4536 //
4537 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4538 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4539 // own. If it's the operand to an unsigned comparison with an immediate,
4540 // we can eliminate one of the shifts: we transform
4541 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4542 //
4543 // We avoid transforming cases which aren't profitable due to encoding
4544 // details:
4545 //
4546 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4547 // would not; in that case, we're essentially trading one immediate load for
4548 // another.
4549 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4550 // 3. C2 is zero; we have other code for this special case.
4551 //
4552 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4553 // instruction, since the AND is always one instruction anyway, but we could
4554 // use narrow instructions in some cases.
4555 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4556 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4557 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4558 !isSignedIntSetCC(CC)) {
4559 unsigned Mask = LHS.getConstantOperandVal(1);
4560 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4561 uint64_t RHSV = RHSC->getZExtValue();
4562 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4563 unsigned ShiftBits = llvm::countl_zero(Mask);
4564 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4565 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4566 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4567 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4568 }
4569 }
4570 }
4571
4572 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4573 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4574 // way a cmp would.
4575 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4576 // some tweaks to the heuristics for the previous and->shift transform.
4577 // FIXME: Optimize cases where the LHS isn't a shift.
4578 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4579 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4580 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4581 LHS.getConstantOperandVal(1) < 31) {
4582 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4583 SDValue Shift =
4584 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4585 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4586 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4587 return Shift.getValue(1);
4588 }
4589
4591
4592 // If the RHS is a constant zero then the V (overflow) flag will never be
4593 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4594 // simpler for other passes (like the peephole optimiser) to deal with.
4595 if (isNullConstant(RHS)) {
4596 switch (CondCode) {
4597 default: break;
4598 case ARMCC::GE:
4600 break;
4601 case ARMCC::LT:
4603 break;
4604 }
4605 }
4606
4607 unsigned CompareType;
4608 switch (CondCode) {
4609 default:
4610 CompareType = ARMISD::CMP;
4611 break;
4612 case ARMCC::EQ:
4613 case ARMCC::NE:
4614 // Uses only Z Flag
4615 CompareType = ARMISD::CMPZ;
4616 break;
4617 }
4618
4619 // TODO: Remove CMPZ check once we generalize and remove the CMPZ enum from
4620 // the codebase.
4621
4622 // TODO: When we have a solution to the vselect predicate not allowing pl/mi
4623 // all the time, allow those cases to be cmn too no matter what.
4624 if (CompareType != ARMISD::CMPZ && isCMN(RHS, CC, DAG)) {
4625 CompareType = ARMISD::CMN;
4626 RHS = RHS.getOperand(1);
4627 } else if (CompareType != ARMISD::CMPZ && isCMN(LHS, CC, DAG)) {
4628 CompareType = ARMISD::CMN;
4629 LHS = LHS.getOperand(1);
4631 }
4632
4633 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4634 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4635}
4636
4637/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4638SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4639 SelectionDAG &DAG, const SDLoc &dl,
4640 bool Signaling) const {
4641 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4642 SDValue Flags;
4644 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4645 LHS, RHS);
4646 else
4647 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4648 FlagsVT, LHS);
4649 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4650}
4651
4652// This function returns three things: the arithmetic computation itself
4653// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4654// comparison and the condition code define the case in which the arithmetic
4655// computation *does not* overflow.
4656std::pair<SDValue, SDValue>
4657ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4658 SDValue &ARMcc) const {
4659 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4660
4661 SDValue Value, OverflowCmp;
4662 SDValue LHS = Op.getOperand(0);
4663 SDValue RHS = Op.getOperand(1);
4664 SDLoc dl(Op);
4665
4666 // FIXME: We are currently always generating CMPs because we don't support
4667 // generating CMN through the backend. This is not as good as the natural
4668 // CMP case because it causes a register dependency and cannot be folded
4669 // later.
4670
4671 switch (Op.getOpcode()) {
4672 default:
4673 llvm_unreachable("Unknown overflow instruction!");
4674 case ISD::SADDO:
4675 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4676 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4677 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4678 break;
4679 case ISD::UADDO:
4680 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4681 // We use ADDC here to correspond to its use in LowerALUO.
4682 // We do not use it in the USUBO case as Value may not be used.
4683 Value = DAG.getNode(ARMISD::ADDC, dl,
4684 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4685 .getValue(0);
4686 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4687 break;
4688 case ISD::SSUBO:
4689 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4690 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4691 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4692 break;
4693 case ISD::USUBO:
4694 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4695 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4696 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4697 break;
4698 case ISD::UMULO:
4699 // We generate a UMUL_LOHI and then check if the high word is 0.
4700 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4701 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4702 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4703 LHS, RHS);
4704 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4705 DAG.getConstant(0, dl, MVT::i32));
4706 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4707 break;
4708 case ISD::SMULO:
4709 // We generate a SMUL_LOHI and then check if all the bits of the high word
4710 // are the same as the sign bit of the low word.
4711 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4712 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4713 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4714 LHS, RHS);
4715 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4716 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4717 Value.getValue(0),
4718 DAG.getConstant(31, dl, MVT::i32)));
4719 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4720 break;
4721 } // switch (...)
4722
4723 return std::make_pair(Value, OverflowCmp);
4724}
4725
  // Materialize a 0/1 boolean value into the hardware carry flag by computing
  // Value - 1 with SUBC; with ARM subtraction semantics the carry is set when
  // no borrow occurs, i.e. when Value >= 1. When Invert is set, (1 - Value)
  // is used instead, flipping the flag for a 0/1 input.
  // NOTE(review): the signature line of this static helper was lost in
  // extraction — name and exact parameter list need to be confirmed upstream.
  SDLoc DL(Value);
  EVT VT = Value.getValueType();

  if (Invert)
    Value = DAG.getNode(ISD::SUB, DL, MVT::i32,
                        DAG.getConstant(1, DL, MVT::i32), Value);

  // SUBC yields the difference as result 0 and the carry-out flag as result
  // 1; only the flag result is returned.
  SDValue Cmp = DAG.getNode(ARMISD::SUBC, DL, DAG.getVTList(VT, MVT::i32),
                            Value, DAG.getConstant(1, DL, VT));
  return Cmp.getValue(1);
}
4738
4740 bool Invert) {
4741 SDLoc DL(Flags);
4742
4743 if (Invert) {
4744 // Convert flags to boolean with ADDE 0,0,Carry then compute 1 - bool.
4745 SDValue BoolCarry = DAG.getNode(
4746 ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4747 DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, VT), Flags);
4748 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(1, DL, VT), BoolCarry);
4749 }
4750
4751 // Now convert the carry flag into a boolean carry. We do this
4752 // using ARMISD::ADDE 0, 0, Carry
4753 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4754 DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, VT),
4755 Flags);
4756}
4757
// Produce a value of type VT that is 1 if the 'V' (overflow) bit in Flags is
// set, else 0: a CMOV selects the constant 1 over 0 under the VS condition.
// NOTE(review): the signature line of this static helper was lost in
// extraction — name and exact parameter list need to be confirmed upstream.
  SDLoc DL(Flags);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue ARMcc = DAG.getConstant(ARMCC::VS, DL, MVT::i32);
  // CMOV operand order is (FalseVal, TrueVal, CC, Flags), so One is chosen
  // when VS ("overflow set") holds.
  return DAG.getNode(ARMISD::CMOV, DL, VT, Zero, One, ARMcc, Flags);
}
4766
4767SDValue ARMTargetLowering::LowerALUO(SDValue Op, SelectionDAG &DAG) const {
4768 // Let legalize expand this if it isn't a legal type yet.
4769 if (!isTypeLegal(Op.getValueType()))
4770 return SDValue();
4771
4772 SDValue LHS = Op.getOperand(0);
4773 SDValue RHS = Op.getOperand(1);
4774 SDLoc dl(Op);
4775
4776 EVT VT = Op.getValueType();
4777 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4778 SDValue Value;
4779 SDValue Overflow;
4780 switch (Op.getOpcode()) {
4781 case ISD::UADDO:
4782 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4783 // Convert the carry flag into a boolean value.
4784 Overflow = carryFlagToValue(Value.getValue(1), VT, DAG, false);
4785 break;
4786 case ISD::USUBO:
4787 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4788 // Convert the carry flag into a boolean value.
4789 Overflow = carryFlagToValue(Value.getValue(1), VT, DAG, true);
4790 break;
4791 default: {
4792 // Handle other operations with getARMXALUOOp
4793 SDValue OverflowCmp, ARMcc;
4794 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4795 // We use 0 and 1 as false and true values.
4796 // ARMcc represents the "no overflow" condition (e.g., VC for signed ops).
4797 // CMOV operand order is (FalseVal, TrueVal), so we put 1 in FalseVal
4798 // position to get Overflow=1 when the "no overflow" condition is false.
4799 Overflow =
4800 DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
4801 DAG.getConstant(1, dl, MVT::i32), // FalseVal: overflow
4802 DAG.getConstant(0, dl, MVT::i32), // TrueVal: no overflow
4803 ARMcc, OverflowCmp);
4804 break;
4805 }
4806 }
4807
4808 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4809}
4810
4812 const ARMSubtarget *Subtarget) {
4813 EVT VT = Op.getValueType();
4814 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4815 return SDValue();
4816 if (!VT.isSimple())
4817 return SDValue();
4818
4819 unsigned NewOpcode;
4820 switch (VT.getSimpleVT().SimpleTy) {
4821 default:
4822 return SDValue();
4823 case MVT::i8:
4824 switch (Op->getOpcode()) {
4825 case ISD::UADDSAT:
4826 NewOpcode = ARMISD::UQADD8b;
4827 break;
4828 case ISD::SADDSAT:
4829 NewOpcode = ARMISD::QADD8b;
4830 break;
4831 case ISD::USUBSAT:
4832 NewOpcode = ARMISD::UQSUB8b;
4833 break;
4834 case ISD::SSUBSAT:
4835 NewOpcode = ARMISD::QSUB8b;
4836 break;
4837 }
4838 break;
4839 case MVT::i16:
4840 switch (Op->getOpcode()) {
4841 case ISD::UADDSAT:
4842 NewOpcode = ARMISD::UQADD16b;
4843 break;
4844 case ISD::SADDSAT:
4845 NewOpcode = ARMISD::QADD16b;
4846 break;
4847 case ISD::USUBSAT:
4848 NewOpcode = ARMISD::UQSUB16b;
4849 break;
4850 case ISD::SSUBSAT:
4851 NewOpcode = ARMISD::QSUB16b;
4852 break;
4853 }
4854 break;
4855 }
4856
4857 SDLoc dl(Op);
4858 SDValue Add =
4859 DAG.getNode(NewOpcode, dl, MVT::i32,
4860 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4861 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4862 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4863}
4864
4865SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4866 SDValue Cond = Op.getOperand(0);
4867 SDValue SelectTrue = Op.getOperand(1);
4868 SDValue SelectFalse = Op.getOperand(2);
4869 SDLoc dl(Op);
4870 unsigned Opc = Cond.getOpcode();
4871
4872 if (Cond.getResNo() == 1 &&
4873 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4874 Opc == ISD::USUBO)) {
4875 if (!isTypeLegal(Cond->getValueType(0)))
4876 return SDValue();
4877
4878 SDValue Value, OverflowCmp;
4879 SDValue ARMcc;
4880 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4881 EVT VT = Op.getValueType();
4882
4883 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
4884 }
4885
4886 // Convert:
4887 //
4888 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4889 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4890 //
4891 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4892 const ConstantSDNode *CMOVTrue =
4893 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4894 const ConstantSDNode *CMOVFalse =
4895 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4896
4897 if (CMOVTrue && CMOVFalse) {
4898 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4899 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4900
4901 SDValue True;
4902 SDValue False;
4903 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4904 True = SelectTrue;
4905 False = SelectFalse;
4906 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4907 True = SelectFalse;
4908 False = SelectTrue;
4909 }
4910
4911 if (True.getNode() && False.getNode())
4912 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
4913 Cond.getOperand(3), DAG);
4914 }
4915 }
4916
4917 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4918 // undefined bits before doing a full-word comparison with zero.
4919 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4920 DAG.getConstant(1, dl, Cond.getValueType()));
4921
4922 return DAG.getSelectCC(dl, Cond,
4923 DAG.getConstant(0, dl, Cond.getValueType()),
4924 SelectTrue, SelectFalse, ISD::SETNE);
4925}
4926
                                 bool &swpCmpOps, bool &swpVselOps) {
  // Map an ISD condition code onto the restricted set of condition codes the
  // VSEL instruction supports (GE, GT, VS, EQ), recording whether the compare
  // operands and/or the VSEL operands must be swapped to compensate.

  // Start by selecting the GE condition code for opcodes that return true for
  // 'equality'
  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
      CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
    CondCode = ARMCC::GE;

  // and GT for opcodes that return false for 'equality'.
  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
           CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
    CondCode = ARMCC::GT;

  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  // to swap the compare operands.
  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
      CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
    swpCmpOps = true;

  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  // If we have an unordered opcode, we need to swap the operands to the VSEL
  // instruction (effectively negating the condition).
  //
  // This also has the effect of swapping which one of 'less' or 'greater'
  // returns true, so we also swap the compare operands. It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
      CC == ISD::SETUGT) {
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands. Also do this if we don't care about the
  // unordered case.
  if (CC == ISD::SETUNE || CC == ISD::SETNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}
4976
4977SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4978 SDValue TrueVal, SDValue ARMcc,
4979 SDValue Flags, SelectionDAG &DAG) const {
4980 if (!Subtarget->hasFP64() && VT == MVT::f64) {
4981 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4982 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4983 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4984 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4985
4986 SDValue TrueLow = TrueVal.getValue(0);
4987 SDValue TrueHigh = TrueVal.getValue(1);
4988 SDValue FalseLow = FalseVal.getValue(0);
4989 SDValue FalseHigh = FalseVal.getValue(1);
4990
4991 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4992 ARMcc, Flags);
4993 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4994 ARMcc, Flags);
4995
4996 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4997 }
4998 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
4999}
5000
5001static bool isGTorGE(ISD::CondCode CC) {
5002 return CC == ISD::SETGT || CC == ISD::SETGE;
5003}
5004
5005static bool isLTorLE(ISD::CondCode CC) {
5006 return CC == ISD::SETLT || CC == ISD::SETLE;
5007}
5008
5009// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5010// All of these conditions (and their <= and >= counterparts) will do:
5011// x < k ? k : x
5012// x > k ? x : k
5013// k < x ? x : k
5014// k > x ? k : x
5015static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5016 const SDValue TrueVal, const SDValue FalseVal,
5017 const ISD::CondCode CC, const SDValue K) {
5018 return (isGTorGE(CC) &&
5019 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5020 (isLTorLE(CC) &&
5021 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5022}
5023
5024// Check if two chained conditionals could be converted into SSAT or USAT.
5025//
5026// SSAT can replace a set of two conditional selectors that bound a number to an
5027// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5028//
5029// x < -k ? -k : (x > k ? k : x)
5030// x < -k ? -k : (x < k ? x : k)
5031// x > -k ? (x > k ? k : x) : -k
5032// x < k ? (x < -k ? -k : x) : k
5033// etc.
5034//
5035// LLVM canonicalizes these to either a min(max()) or a max(min())
5036// pattern. This function tries to match one of these and will return a SSAT
5037// node if successful.
5038//
5039// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
5040// is a power of 2.
  EVT VT = Op.getValueType();
  // Outer SELECT_CC: compares V1 against the constant bound K1.
  SDValue V1 = Op.getOperand(0);
  SDValue K1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // The inner select is whichever operand of the outer select is not the
  // constant bound; it must itself be a SELECT_CC for the pattern to match.
  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return SDValue();

  // Inner SELECT_CC: compares V2 against the other bound K2.
  SDValue V2 = Op2.getOperand(0);
  SDValue K2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  SDValue V1Tmp = V1;
  SDValue V2Tmp = V2;

  // Check that the registers and the constants match a max(min()) or min(max())
  // pattern
  if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
      K2 != FalseVal2 ||
      !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
    return SDValue();

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
    return SDValue();

  int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  // The bound that applies on the 'less' side must be the smaller one, and
  // the positive bound + 1 must be a power of two for SSAT/USAT encoding.
  if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
      !isPowerOf2_64(PosVal + 1))
    return SDValue();

  // Handle the difference between USAT (unsigned) and SSAT (signed)
  // saturation
  // At this point, PosVal is guaranteed to be positive
  uint64_t K = PosVal;
  SDLoc dl(Op);
  // Signed case: bounds are [-(k+1), k] with Val1 == ~Val2 -> SSAT.
  if (Val1 == ~Val2)
    return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
                       DAG.getConstant(llvm::countr_one(K), dl, VT));
  // Unsigned case: bounds are [0, k] -> USAT.
  if (NegVal == 0)
    return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
                       DAG.getConstant(llvm::countr_one(K), dl, VT));

  return SDValue();
}
5098
5099// Check if a condition of the type x < k ? k : x can be converted into a
5100// bit operation instead of conditional moves.
5101// Currently this is allowed given:
5102// - The conditions and values match up
5103// - k is 0 or -1 (all ones)
5104// This function will not check the last condition, thats up to the caller
5105// It returns true if the transformation can be made, and in such case
5106// returns x in V, and k in SatK.
                                         SDValue &SatK)
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  // K points at whichever side of the comparison is the constant bound.
      ? &RHS
      : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  // KTmp/VTmp are the constant and variable as seen by the select operands;
  // they must agree with the constant and variable used in the comparison.
  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  // Finally check that the condition/operand arrangement is one of the
  // recognised lower-saturating shapes (x < k ? k : x etc.).
  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}
5140
5141bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5142 if (VT == MVT::f32)
5143 return !Subtarget->hasVFP2Base();
5144 if (VT == MVT::f64)
5145 return !Subtarget->hasFP64();
5146 if (VT == MVT::f16)
5147 return !Subtarget->hasFullFP16();
5148 return false;
5149}
5150
5151static SDValue matchCSET(unsigned &Opcode, bool &InvertCond, SDValue TrueVal,
5152 SDValue FalseVal, const ARMSubtarget *Subtarget) {
5153 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5154 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5155 if (!CFVal || !CTVal || !Subtarget->hasV8_1MMainlineOps())
5156 return SDValue();
5157
5158 unsigned TVal = CTVal->getZExtValue();
5159 unsigned FVal = CFVal->getZExtValue();
5160
5161 Opcode = 0;
5162 InvertCond = false;
5163 if (TVal == ~FVal) {
5164 Opcode = ARMISD::CSINV;
5165 } else if (TVal == ~FVal + 1) {
5166 Opcode = ARMISD::CSNEG;
5167 } else if (TVal + 1 == FVal) {
5168 Opcode = ARMISD::CSINC;
5169 } else if (TVal == FVal + 1) {
5170 Opcode = ARMISD::CSINC;
5171 std::swap(TrueVal, FalseVal);
5172 std::swap(TVal, FVal);
5173 InvertCond = !InvertCond;
5174 } else {
5175 return SDValue();
5176 }
5177
5178 // If one of the constants is cheaper than another, materialise the
5179 // cheaper one and let the csel generate the other.
5180 if (Opcode != ARMISD::CSINC &&
5181 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5182 std::swap(TrueVal, FalseVal);
5183 std::swap(TVal, FVal);
5184 InvertCond = !InvertCond;
5185 }
5186
5187 // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5188 // to get there. CSINC not is invertable like the other two (~(~a) == a,
5189 // -(-a) == a, but (a+1)+1 != a).
5190 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5191 std::swap(TrueVal, FalseVal);
5192 std::swap(TVal, FVal);
5193 InvertCond = !InvertCond;
5194 }
5195
5196 return TrueVal;
5197}
5198
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
    if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
      return SatValue;

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  SDValue SatValue;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // Broadcast the sign bit of SatValue across the whole word.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      // k == 0: clear the value whenever its sign bit is set.
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      // k == -1: force all bits on whenever the sign bit is set.
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
  if (Op.getValueType().isInteger()) {

    // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
    // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
    // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
    // Both require less instructions than compare and conditional select.
    if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
        RHSC->isZero() && CFVal && CFVal->isZero() &&
        LHS.getValueType() == RHS.getValueType()) {
      EVT VT = LHS.getValueType();
      SDValue Shift =
          DAG.getNode(ISD::SRA, dl, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));

      if (CC == ISD::SETGT)
        Shift = DAG.getNOT(dl, Shift, VT);

      return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
    }

    // (SELECT_CC setlt, x, 0, 1, 0) -> SRL(x, bw-1)
    if (CC == ISD::SETLT && isNullConstant(RHS) && isOneConstant(TrueVal) &&
        isNullConstant(FalseVal) && LHS.getValueType() == VT)
      return DAG.getNode(ISD::SRL, dl, VT, LHS,
                         DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try ARMv8.1-M conditional-select (CSINC/CSINV/CSNEG) on constant
    // true/false values.
    unsigned Opcode;
    bool InvertCond;
    if (SDValue Op =
            matchCSET(Opcode, InvertCond, TrueVal, FalseVal, Subtarget)) {
      if (InvertCond)
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      EVT VT = Op.getValueType();
      return DAG.getNode(Opcode, dl, VT, Op, Op, ARMcc, Cmp);
    }
  }

  // FP types with no hardware support are softened into libcall compares.
  if (isUnsupportedFloatingType(LHS.getValueType())) {
    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support
    if (ARMcc->getAsZExtVal() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
  }

  // Some IEEE predicates need two ARM condition checks; CondCode2 is AL when
  // a single check suffices.
  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
  if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
  // Chain a second CMOV on the same compare when the predicate needs two
  // condition checks.
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
  }
  return Result;
}
5352
5353/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5354/// to morph to an integer compare sequence.
5355static bool canChangeToInt(SDValue Op, bool &SeenZero,
5356 const ARMSubtarget *Subtarget) {
5357 SDNode *N = Op.getNode();
5358 if (!N->hasOneUse())
5359 // Otherwise it requires moving the value from fp to integer registers.
5360 return false;
5361 if (!N->getNumValues())
5362 return false;
5363 EVT VT = Op.getValueType();
5364 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5365 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5366 // vmrs are very slow, e.g. cortex-a8.
5367 return false;
5368
5369 if (isFloatingPointZero(Op)) {
5370 SeenZero = true;
5371 return true;
5372 }
5373 return ISD::isNormalLoad(N);
5374}
5375
    // Floating-point zero bitcasts to the integer constant zero.
    return DAG.getConstant(0, SDLoc(Op), MVT::i32);

    // For a load, re-load the same memory location directly as an i32,
    // avoiding an fp-to-integer register transfer.
    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
                       Ld->getPointerInfo(), Ld->getAlign(),
                       Ld->getMemOperand()->getFlags());

  llvm_unreachable("Unknown VFP cmp argument!");
}
5387
                           SDValue &RetVal1, SDValue &RetVal2) {
  // Expand an f64 VFP compare operand into its two i32 halves (RetVal1 low,
  // RetVal2 high) without going through the FP register file.
  SDLoc dl(Op);

  // Floating-point zero expands to a pair of zero words.
  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  // For a load, re-load each 4-byte half directly as an i32.
  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlign(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4),
                          commonAlignment(Ld->getAlign(), 4),
                          Ld->getMemOperand()->getFlags());
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}
5416
/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Both operands must be convertible to integers and at least one side must
  // be a floating-point zero.
  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // Clear the sign bit so that +0.0 and -0.0 compare equal as integers.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
                         Cmp);
    }

    // f64: compare the two i32 halves with a BCC_i64 node; the sign bit is
    // masked off the high halves only.
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
  }

  return SDValue();
}
5467
5468// Generate CMP + CMOV for integer abs.
5469SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5470 SDLoc DL(Op);
5471
5472 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5473
5474 // Generate CMP & CMOV.
5475 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5476 DAG.getConstant(0, DL, MVT::i32));
5477 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5478 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5479}
5480
  ARMCC::CondCodes CondCode =
      (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
  // Flip the ARM condition (e.g. EQ<->NE) and return it as a fresh i32
  // constant node.
  CondCode = ARMCC::getOppositeCondition(CondCode);
  return DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
}
5487
5488SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5489 SDValue Chain = Op.getOperand(0);
5490 SDValue Cond = Op.getOperand(1);
5491 SDValue Dest = Op.getOperand(2);
5492 SDLoc dl(Op);
5493
5494 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5495 // instruction.
5496 unsigned Opc = Cond.getOpcode();
5497 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5498 !Subtarget->isThumb1Only();
5499 if (Cond.getResNo() == 1 &&
5500 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5501 Opc == ISD::USUBO || OptimizeMul)) {
5502 // Only lower legal XALUO ops.
5503 if (!isTypeLegal(Cond->getValueType(0)))
5504 return SDValue();
5505
5506 // The actual operation with overflow check.
5507 SDValue Value, OverflowCmp;
5508 SDValue ARMcc;
5509 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5510
5511 // Reverse the condition code.
5512 ARMcc = getInvertedARMCondCode(ARMcc, DAG);
5513
5514 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5515 OverflowCmp);
5516 }
5517
5518 return SDValue();
5519}
5520
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Comparisons of FP types without hardware support are softened to a
  // libcall-based integer comparison first.
  if (isUnsupportedFloatingType(LHS.getValueType())) {
    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
      // Reverse the condition code.
      ARMcc = getInvertedARMCondCode(ARMcc, DAG);
    }

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
                       OverflowCmp);
  }

  // Integer compare-and-branch lowers directly to CMP + conditional branch.
  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
  }

  // With no NaNs and IEEE denormal handling, some FP equality compares can be
  // rewritten as integer compares; see OptimizeVFPBrcond.
  SDNodeFlags Flags = Op->getFlags();
  if (Flags.hasNoNaNs() &&
      DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
      DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
       CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  // FP compare-and-branch; some IEEE predicates need two ARM condition
  // checks, in which case a second conditional branch is chained on the same
  // compare.
  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
  if (CondCode2 != ARMCC::AL) {
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
    Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
  }
  return Res;
}
5597
SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  // Each jump-table entry is 4 bytes wide, so scale the index accordingly.
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
    // which does another jump to the destination. This also makes it easier
    // to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    // PIC/ROPI tables hold relative entries: load the entry and add it to the
    // table base before branching.
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    // Non-PIC tables hold absolute addresses: load the entry and branch to it.
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}
5633
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // i32 element results: f32 sources are handled natively; anything else is
  // scalarised.
  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();

  // Pick the integer vector type matching the source element count/width.
  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  // Convert in the wider integer type, then truncate down to the requested
  // element width.
  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}
5663
SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  // For strict FP opcodes operand 0 is the chain; the FP value follows it.
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  // Source FP types without hardware support are converted via a libcall.
  if (isUnsupportedFloatingType(SrcVal.getValueType())) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT ||
        Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
                              Op.getValueType());
    SDLoc Loc(Op);
    MakeLibCallOptions CallOptions;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
                                          CallOptions, Loc, Chain);
    return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  }

  // FIXME: Remove this when we have strict fp instruction selection patterns
  if (IsStrict) {
    SDLoc Loc(Op);
    SDValue Result =
        Loc, Op.getValueType(), SrcVal);
    return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  }

  return Op;
}
5702
                              const ARMSubtarget *Subtarget) {
  EVT VT = Op.getValueType();
  EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  EVT FromVT = Op.getOperand(0).getValueType();

  // Combinations the subtarget can lower directly as saturating conversions.
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
    return Op;
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
      Subtarget->hasFP64())
    return Op;
  if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
      Subtarget->hasFullFP16())
    return Op;
  if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
      Subtarget->hasMVEFloatOps())
    return Op;
  if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
      Subtarget->hasMVEFloatOps())
    return Op;

  if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
    return SDValue();

  // Otherwise emit a full-width conversion and clamp the result to the
  // saturation bounds with SMIN/SMAX (signed) or UMIN (unsigned).
  SDLoc DL(Op);
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
  unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
  SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                            DAG.getValueType(VT.getScalarType()));
  SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
                            DAG.getConstant((1 << BW) - 1, DL, VT));
  if (IsSigned)
    Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
                      DAG.getSignedConstant(-(1 << BW), DL, VT));
  return Max;
}
5739
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // i32 element sources: conversion to f32 is handled natively; anything
  // else is scalarised.
  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();

  // Pick the integer vector type to widen the source into before converting.
  EVT DestVecType;
  if (VT == MVT::v4f32)
    DestVecType = MVT::v4i32;
  else if (VT == MVT::v4f16 && HasFullFP16)
    DestVecType = MVT::v4i16;
  else if (VT == MVT::v8f16 && HasFullFP16)
    DestVecType = MVT::v8i16;
  else
    return DAG.UnrollVectorOp(Op.getNode());

  // Choose sign- vs zero-extension to match the signedness of the conversion.
  unsigned CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    CastOpc = ISD::SIGN_EXTEND;
    break;
  case ISD::UINT_TO_FP:
    CastOpc = ISD::ZERO_EXTEND;
    break;
  }

  // Extend the source, then convert the widened vector.
  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}
5783
5784SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5785 EVT VT = Op.getValueType();
5786 if (VT.isVector())
5787 return LowerVectorINT_TO_FP(Op, DAG);
5788 if (isUnsupportedFloatingType(VT)) {
5789 RTLIB::Libcall LC;
5790 if (Op.getOpcode() == ISD::SINT_TO_FP)
5791 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5792 Op.getValueType());
5793 else
5794 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5795 Op.getValueType());
5796 MakeLibCallOptions CallOptions;
5797 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5798 CallOptions, SDLoc(Op)).first;
5799 }
5800
5801 return Op;
5802}
5803
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign with a fabs and a conditional fneg.
  // Tmp0 supplies the magnitude, Tmp1 supplies the sign.
  SDValue Tmp0 = Op.getOperand(0);
  SDValue Tmp1 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  // If the magnitude operand already came from GPRs (a bitcast or VMOVDRR),
  // prefer the integer bit-twiddling path below over NEON.
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
    Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    // Build a per-lane sign-bit mask via a VMOV modified-immediate.
    unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    if (VT == MVT::f64)
      // For f64, shift the mask up so it covers bit 63 of the 64-bit lane.
      Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    // Move the sign operand's sign bit into the same bit position as the
    // destination's sign bit (shifting between bit 31 and bit 63 as needed).
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    // NOTE(review): the declaration of AllOnes is missing from this view of
    // the file (presumably an all-ones constant used to invert the mask);
    // confirm against the original source.
                                  dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    // Bitwise select: Res = (Tmp1 & Mask) | (Tmp0 & ~Mask).
    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    if (VT == MVT::f32) {
      // Pull the scalar f32 result back out of lane 0.
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Integer (GPR) path.
  // Bitcast operand 1 to i32.
  if (SrcVT == MVT::f64)
    // Only the high word of an f64 carries the sign bit; take result 1.
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}
5885
5886SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5887 MachineFunction &MF = DAG.getMachineFunction();
5888 MachineFrameInfo &MFI = MF.getFrameInfo();
5889 MFI.setReturnAddressIsTaken(true);
5890
5891 EVT VT = Op.getValueType();
5892 SDLoc dl(Op);
5893 unsigned Depth = Op.getConstantOperandVal(0);
5894 if (Depth) {
5895 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5896 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5897 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5898 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5899 MachinePointerInfo());
5900 }
5901
5902 // Return LR, which contains the return address. Mark it an implicit live-in.
5903 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5904 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5905}
5906
5907SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5908 const ARMBaseRegisterInfo &ARI =
5909 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5910 MachineFunction &MF = DAG.getMachineFunction();
5911 MachineFrameInfo &MFI = MF.getFrameInfo();
5912 MFI.setFrameAddressIsTaken(true);
5913
5914 EVT VT = Op.getValueType();
5915 SDLoc dl(Op); // FIXME probably not meaningful
5916 unsigned Depth = Op.getConstantOperandVal(0);
5917 Register FrameReg = ARI.getFrameRegister(MF);
5918 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5919 while (Depth--)
5920 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5921 MachinePointerInfo());
5922 return FrameAddr;
5923}
5924
5925// FIXME? Maybe this could be a TableGen attribute on some registers and
5926// this table could be generated automatically from RegInfo.
5927Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5928 const MachineFunction &MF) const {
5929 return StringSwitch<Register>(RegName)
5930 .Case("sp", ARM::SP)
5931 .Default(Register());
5932}
5933
// Result is 64 bit value so split into two 32 bit values and return as a
// pair of values.
                               SelectionDAG &DAG) {
  SDLoc DL(N);

  // This function is only supposed to be called for i64 type destination.
  assert(N->getValueType(0) == MVT::i64
          && "ExpandREAD_REGISTER called for non-i64 type result.");

  // NOTE(review): the statement that creates Read (a node producing two i32
  // values plus a chain from the register-read operands below) is missing
  // from this view of the file; confirm against the original source.
                          DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                          N->getOperand(0),
                          N->getOperand(1));

  // Reassemble the two i32 halves into the i64 result, then append the chain.
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
                                Read.getValue(1)));
  Results.push_back(Read.getValue(2)); // Chain
}
5953
/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
/// When \p DstVT, the destination type of \p BC, is on the vector
/// register bank and the source of bitcast, \p Op, operates on the same bank,
/// it might be possible to combine them, such that everything stays on the
/// vector register bank.
/// \return The node that would replace \p BC, if the combine
/// is possible.
                                           SelectionDAG &DAG) {
  SDValue Op = BC->getOperand(0);
  EVT DstVT = BC->getValueType(0);

  // The only vector instruction that can produce a scalar (remember,
  // since the bitcast was about to be turned into VMOVDRR, the source
  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
  // Moreover, we can do this combine only if there is one use.
  // Finally, if the destination type is not a vector, there is not
  // much point on forcing everything on the vector bank.
  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !Op.hasOneUse())
    return SDValue();

  // If the index is not constant, we will introduce an additional
  // multiply that will stick.
  // Give up in that case.
  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Index)
    return SDValue();
  unsigned DstNumElt = DstVT.getVectorNumElements();

  // Compute the new index: each i64 element of the source covers DstNumElt
  // elements of the destination type, so scale the old index by that factor.
  const APInt &APIntIndex = Index->getAPIntValue();
  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
  NewIndex *= APIntIndex;
  // Check if the new constant index fits into i32.
  if (NewIndex.getBitWidth() > 32)
    return SDValue();

  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
  SDLoc dl(Op);
  SDValue ExtractSrc = Op.getOperand(0);
  EVT VecVT = EVT::getVectorVT(
      *DAG.getContext(), DstVT.getScalarType(),
      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
}
6003
/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                                         const ARMSubtarget *Subtarget) const {
  SDLoc dl(N);
  SDValue Op = N->getOperand(0);

  // This function is only supposed to be called for i16 and i64 types, either
  // as the source or destination of the bit convert.
  EVT SrcVT = Op.getValueType();
  EVT DstVT = N->getValueType(0);

  // i16/i32 -> f16/bf16: zero-extend to i32 and move into a half-precision
  // register.
  if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
      (DstVT == MVT::f16 || DstVT == MVT::bf16))
    return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
                     DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));

  // f16/bf16 -> i16/i32: move out of the half-precision register and
  // truncate to the requested width.
  if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
      (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
    if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
      Op = DAG.getBitcast(MVT::f16, Op);
    return DAG.getNode(
        ISD::TRUNCATE, SDLoc(N), DstVT,
        MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
  }

  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
    return SDValue();

  // Turn i64->f64 into VMOVDRR.
  if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
    // if we can combine the bitcast with its source.
    // NOTE(review): the guarding `if` that computes Val (presumably via
    // CombineVMOVDRRCandidateWithVecOp) is missing from this view of the
    // file; confirm against the original source.
      return Val;
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
    return DAG.getNode(ISD::BITCAST, dl, DstVT,
                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
  }

  // Turn f64->i64 into VMOVRRD.
  if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
    SDValue Cvt;
    // On big-endian targets, reverse the 64-bit lanes of a multi-element
    // vector source before moving to GPRs.
    if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
        SrcVT.getVectorNumElements() > 1)
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32),
                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
    else
      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                        DAG.getVTList(MVT::i32, MVT::i32), Op);
    // Merge the pieces into a single i64 value.
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
  }

  return SDValue();
}
6065
6066/// getZeroVector - Returns a vector of specified type with all zero elements.
6067/// Zero vectors are used to represent vector negation and in those cases
6068/// will be implemented with the NEON VNEG instruction. However, VNEG does
6069/// not support i64 elements, so sometimes the zero vectors will need to be
6070/// explicitly constructed. Regardless, use a canonical VMOV to create the
6071/// zero vector.
6072static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6073 assert(VT.isVector() && "Expected a vector type");
6074 // The canonical modified immediate encoding of a zero vector is....0!
6075 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6076 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6077 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6078 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6079}
6080
6081/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6082/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6083SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6084 SelectionDAG &DAG) const {
6085 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6086 EVT VT = Op.getValueType();
6087 unsigned VTBits = VT.getSizeInBits();
6088 SDLoc dl(Op);
6089 SDValue ShOpLo = Op.getOperand(0);
6090 SDValue ShOpHi = Op.getOperand(1);
6091 SDValue ShAmt = Op.getOperand(2);
6092 SDValue ARMcc;
6093 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6094
6095 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6096
6097 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6098 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6099 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6100 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6101 DAG.getConstant(VTBits, dl, MVT::i32));
6102 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6103 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6104 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6105 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6106 ISD::SETGE, ARMcc, DAG, dl);
6107 SDValue Lo =
6108 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6109
6110 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6111 SDValue HiBigShift = Opc == ISD::SRA
6112 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6113 DAG.getConstant(VTBits - 1, dl, VT))
6114 : DAG.getConstant(0, dl, VT);
6115 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6116 ISD::SETGE, ARMcc, DAG, dl);
6117 SDValue Hi =
6118 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6119
6120 SDValue Ops[2] = { Lo, Hi };
6121 return DAG.getMergeValues(Ops, dl);
6122}
6123
6124/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6125/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6126SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6127 SelectionDAG &DAG) const {
6128 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6129 EVT VT = Op.getValueType();
6130 unsigned VTBits = VT.getSizeInBits();
6131 SDLoc dl(Op);
6132 SDValue ShOpLo = Op.getOperand(0);
6133 SDValue ShOpHi = Op.getOperand(1);
6134 SDValue ShAmt = Op.getOperand(2);
6135 SDValue ARMcc;
6136
6137 assert(Op.getOpcode() == ISD::SHL_PARTS);
6138 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6139 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6140 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6141 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6142 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6143
6144 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6145 DAG.getConstant(VTBits, dl, MVT::i32));
6146 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6147 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6148 ISD::SETGE, ARMcc, DAG, dl);
6149 SDValue Hi =
6150 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6151
6152 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6153 ISD::SETGE, ARMcc, DAG, dl);
6154 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6155 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6156 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6157
6158 SDValue Ops[2] = { Lo, Hi };
6159 return DAG.getMergeValues(Ops, dl);
6160}
6161
6162SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6163 SelectionDAG &DAG) const {
6164 // The rounding mode is in bits 23:22 of the FPSCR.
6165 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6166 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
6167 // so that the shift + and get folded into a bitfield extract.
6168 SDLoc dl(Op);
6169 SDValue Chain = Op.getOperand(0);
6170 SDValue Ops[] = {Chain,
6171 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6172
6173 SDValue FPSCR =
6174 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6175 Chain = FPSCR.getValue(1);
6176 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6177 DAG.getConstant(1U << 22, dl, MVT::i32));
6178 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6179 DAG.getConstant(22, dl, MVT::i32));
6180 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6181 DAG.getConstant(3, dl, MVT::i32));
6182 return DAG.getMergeValues({And, Chain}, dl);
6183}
6184
6185SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6186 SelectionDAG &DAG) const {
6187 SDLoc DL(Op);
6188 SDValue Chain = Op->getOperand(0);
6189 SDValue RMValue = Op->getOperand(1);
6190
6191 // The rounding mode is in bits 23:22 of the FPSCR.
6192 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6193 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6194 // ((arg - 1) & 3) << 22).
6195 //
6196 // It is expected that the argument of llvm.set.rounding is within the
6197 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6198 // responsibility of the code generated llvm.set.rounding to ensure this
6199 // condition.
6200
6201 // Calculate new value of FPSCR[23:22].
6202 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6203 DAG.getConstant(1, DL, MVT::i32));
6204 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6205 DAG.getConstant(0x3, DL, MVT::i32));
6206 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6207 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6208
6209 // Get current value of FPSCR.
6210 SDValue Ops[] = {Chain,
6211 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6212 SDValue FPSCR =
6213 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6214 Chain = FPSCR.getValue(1);
6215 FPSCR = FPSCR.getValue(0);
6216
6217 // Put new rounding mode into FPSCR[23:22].
6218 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6219 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6220 DAG.getConstant(RMMask, DL, MVT::i32));
6221 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6222 SDValue Ops2[] = {
6223 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6224 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6225}
6226
6227SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6228 SelectionDAG &DAG) const {
6229 SDLoc DL(Op);
6230 SDValue Chain = Op->getOperand(0);
6231 SDValue Mode = Op->getOperand(1);
6232
6233 // Generate nodes to build:
6234 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6235 SDValue Ops[] = {Chain,
6236 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6237 SDValue FPSCR =
6238 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6239 Chain = FPSCR.getValue(1);
6240 FPSCR = FPSCR.getValue(0);
6241
6242 SDValue FPSCRMasked =
6243 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6244 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6245 SDValue InputMasked =
6246 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6247 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6248 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6249
6250 SDValue Ops2[] = {
6251 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6252 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6253}
6254
SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);

  // To get the default FP mode all control bits are cleared:
  // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)

  // Read the current FPSCR value.
  SDValue Ops[] = {Chain,
                   DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
  SDValue FPSCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
  Chain = FPSCR.getValue(1);
  FPSCR = FPSCR.getValue(0);

  // NOTE(review): the mask-constant operand of this AND (presumably
  // ARM::FPStatusBits | ARM::FPReservedBits, per the comment above) is
  // missing from this view of the file; confirm against the original source.
  SDValue FPSCRMasked = DAG.getNode(
      ISD::AND, DL, MVT::i32, FPSCR,
  // Write the masked value back to FPSCR.
  SDValue Ops2[] = {Chain,
                    DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
                    FPSCRMasked};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}
6277
                          const ARMSubtarget *ST) {
  // Custom lowering for CTTZ / CTTZ_ZERO_UNDEF.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  if (VT.isVector() && ST->hasNEON()) {

    // Compute the least significant set bit: LSB = X & -X
    SDValue X = N->getOperand(0);
    SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);

    EVT ElemTy = VT.getVectorElementType();

    if (ElemTy == MVT::i8) {
      // Compute with: cttz(x) = ctpop(lsb - 1)
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
      return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
    }

    if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
        (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
      // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
      unsigned NumBits = ElemTy.getSizeInBits();
      SDValue WidthMinus1 =
          DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                      DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
      SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
    }

    // Compute with: cttz(x) = ctpop(lsb - 1)

    // Compute LSB - 1.
    SDValue Bits;
    if (ElemTy == MVT::i64) {
      // Load constant 0xffff'ffff'ffff'ffff to register.
      // (0x1eff is presumably the VMOV modified-immediate encoding of that
      // all-ones value — confirm against the VMOV immediate encoding rules.)
      SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                               DAG.getTargetConstant(0x1eff, dl, MVT::i32));
      // Adding all-ones is equivalent to subtracting one.
      Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
    } else {
      SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(1, dl, ElemTy));
      Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
    }
    return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
  }

  if (!ST->hasV6T2Ops())
    return SDValue();

  // Scalar path: cttz(x) = ctlz(bitreverse(x)).
  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
}
6333
                          const ARMSubtarget *ST) {
  // Custom CTPOP lowering: count bits at i8 granularity, then widen the
  // result to the requested element size with pairwise add-long steps.
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
  Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);

  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  unsigned EltSize = 8;
  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    // NOTE(review): the declaration of Ops (presumably a small vector of
    // SDValue operands for the intrinsic node) is missing from this view of
    // the file; confirm against the original source.
    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
                                  TLI.getPointerTy(DAG.getDataLayout())));
    Ops.push_back(Res);

    // Each vpaddlu step doubles the element size and halves the count.
    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
  }

  return Res;
}
6366
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  // NOTE(review): the declaration of BVN (presumably a dyn_cast of
  // Op.getNode() to BuildVectorSDNode) is missing from this view of the
  // file; confirm against the original source.
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  // Require a constant splat no wider than the element size; the splat
  // value (sign-extended) becomes the shift count.
  if (!BVN ||
      !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                            ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}
6386
6387/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6388/// operand of a vector shift left operation. That value must be in the range:
6389/// 0 <= Value < ElementBits for a left shift; or
6390/// 0 <= Value <= ElementBits for a long left shift.
6391static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6392 assert(VT.isVector() && "vector shift count is not a vector type");
6393 int64_t ElementBits = VT.getScalarSizeInBits();
6394 if (!getVShiftImm(Op, ElementBits, Cnt))
6395 return false;
6396 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6397}
6398
6399/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6400/// operand of a vector shift right operation. For a shift opcode, the value
6401/// is positive, but for an intrinsic the value count must be negative. The
6402/// absolute value must be in the range:
6403/// 1 <= |Value| <= ElementBits for a right shift; or
6404/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6405static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6406 int64_t &Cnt) {
6407 assert(VT.isVector() && "vector shift count is not a vector type");
6408 int64_t ElementBits = VT.getScalarSizeInBits();
6409 if (!getVShiftImm(Op, ElementBits, Cnt))
6410 return false;
6411 if (!isIntrinsic)
6412 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6413 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6414 Cnt = -Cnt;
6415 return true;
6416 }
6417 return false;
6418}
6419
                        const ARMSubtarget *ST) {
  // Custom lowering for vector SHL/SRA/SRL.
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  int64_t Cnt;

  if (!VT.isVector())
    return SDValue();

  // We essentially have two forms here. Shift by an immediate and shift by a
  // vector register (there are also shift by a gpr, but that is just handled
  // with a tablegen pattern). We cannot easily match shift by an immediate in
  // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
  // For shifting by a vector, we don't have VSHR, only VSHL (which can be
  // signed or unsigned, and a negative shift indicates a shift right).
  if (N->getOpcode() == ISD::SHL) {
    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
      return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
                         DAG.getConstant(Cnt, dl, MVT::i32));
    return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
                       N->getOperand(1));
  }

  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
         "unexpected vector shift opcode");

  // Immediate right shift: pick the signed or unsigned form.
  if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
    unsigned VShiftOpc =
        (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
    return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  // Other right shifts we don't have operations for (we use a shift left by a
  // negative number).
  EVT ShiftVT = N->getOperand(1).getValueType();
  SDValue NegatedCount = DAG.getNode(
      ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
  unsigned VShiftOpc =
      (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
  return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
}
6462
                             const ARMSubtarget *ST) {
  // Custom expansion for 64-bit shifts.
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // We can get here for a node like i32 = ISD::SHL i32, i64
  if (VT != MVT::i64)
    return SDValue();

  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SHL) &&
         "Unknown shift to lower!");

  unsigned ShOpc = N->getOpcode();
  if (ST->hasMVEIntegerOps()) {
    SDValue ShAmt = N->getOperand(1);
    unsigned ShPartsOpc = ARMISD::LSLL;
    // NOTE(review): the declaration of Con (presumably
    // dyn_cast<ConstantSDNode>(ShAmt)) is missing from this view of the
    // file; confirm against the original source.

    // If the shift amount is greater than 32 or has a greater bitwidth than 64
    // then do the default optimisation
    if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
        (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
      return SDValue();

    // Extract the lower 32 bits of the shift amount if it's not an i32
    if (ShAmt->getValueType(0) != MVT::i32)
      ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);

    if (ShOpc == ISD::SRL) {
      if (!Con)
        // There is no t2LSRLr instruction so negate and perform an lsll if the
        // shift amount is in a register, emulating a right shift.
        ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
                            DAG.getConstant(0, dl, MVT::i32), ShAmt);
      else
        // Else generate an lsrl on the immediate shift amount
        ShPartsOpc = ARMISD::LSRL;
    } else if (ShOpc == ISD::SRA)
      ShPartsOpc = ARMISD::ASRL;

    // Split Lower/Upper 32 bits of the destination/source
    SDValue Lo, Hi;
    std::tie(Lo, Hi) =
        DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
    // Generate the shift operation as computed above
    Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
                     ShAmt);
    // The upper 32 bits come from the second return value of lsll
    Hi = SDValue(Lo.getNode(), 1);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
  }

  // We only lower SRA, SRL of 1 here, all others use generic lowering.
  if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
    return SDValue();

  // If we are in thumb mode, we don't have RRX.
  if (ST->isThumb1Only())
    return SDValue();

  // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);

  // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
  // captures the shifted out bit into a carry flag.
  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);

  // The low part is an ARMISD::RRX operand, which shifts the carry in.
  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));

  // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
}
6539
6541 const ARMSubtarget *ST) {
6542 bool Invert = false;
6543 bool Swap = false;
6544 unsigned Opc = ARMCC::AL;
6545
6546 SDValue Op0 = Op.getOperand(0);
6547 SDValue Op1 = Op.getOperand(1);
6548 SDValue CC = Op.getOperand(2);
6549 EVT VT = Op.getValueType();
6550 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6551 SDLoc dl(Op);
6552
6553 EVT CmpVT;
6554 if (ST->hasNEON())
6556 else {
6557 assert(ST->hasMVEIntegerOps() &&
6558 "No hardware support for integer vector comparison!");
6559
6560 if (Op.getValueType().getVectorElementType() != MVT::i1)
6561 return SDValue();
6562
6563 // Make sure we expand floating point setcc to scalar if we do not have
6564 // mve.fp, so that we can handle them from there.
6565 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6566 return SDValue();
6567
6568 CmpVT = VT;
6569 }
6570
6571 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6572 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6573 // Special-case integer 64-bit equality comparisons. They aren't legal,
6574 // but they can be lowered with a few vector instructions.
6575 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6576 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6577 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6578 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6579 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6580 DAG.getCondCode(ISD::SETEQ));
6581 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6582 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6583 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6584 if (SetCCOpcode == ISD::SETNE)
6585 Merged = DAG.getNOT(dl, Merged, CmpVT);
6586 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6587 return Merged;
6588 }
6589
6590 if (CmpVT.getVectorElementType() == MVT::i64)
6591 // 64-bit comparisons are not legal in general.
6592 return SDValue();
6593
6594 if (Op1.getValueType().isFloatingPoint()) {
6595 switch (SetCCOpcode) {
6596 default: llvm_unreachable("Illegal FP comparison");
6597 case ISD::SETUNE:
6598 case ISD::SETNE:
6599 if (ST->hasMVEFloatOps()) {
6600 Opc = ARMCC::NE; break;
6601 } else {
6602 Invert = true; [[fallthrough]];
6603 }
6604 case ISD::SETOEQ:
6605 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6606 case ISD::SETOLT:
6607 case ISD::SETLT: Swap = true; [[fallthrough]];
6608 case ISD::SETOGT:
6609 case ISD::SETGT: Opc = ARMCC::GT; break;
6610 case ISD::SETOLE:
6611 case ISD::SETLE: Swap = true; [[fallthrough]];
6612 case ISD::SETOGE:
6613 case ISD::SETGE: Opc = ARMCC::GE; break;
6614 case ISD::SETUGE: Swap = true; [[fallthrough]];
6615 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6616 case ISD::SETUGT: Swap = true; [[fallthrough]];
6617 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6618 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6619 case ISD::SETONE: {
6620 // Expand this to (OLT | OGT).
6621 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6622 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6623 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6624 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6625 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6626 if (Invert)
6627 Result = DAG.getNOT(dl, Result, VT);
6628 return Result;
6629 }
6630 case ISD::SETUO: Invert = true; [[fallthrough]];
6631 case ISD::SETO: {
6632 // Expand this to (OLT | OGE).
6633 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6634 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6635 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6636 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6637 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6638 if (Invert)
6639 Result = DAG.getNOT(dl, Result, VT);
6640 return Result;
6641 }
6642 }
6643 } else {
6644 // Integer comparisons.
6645 switch (SetCCOpcode) {
6646 default: llvm_unreachable("Illegal integer comparison");
6647 case ISD::SETNE:
6648 if (ST->hasMVEIntegerOps()) {
6649 Opc = ARMCC::NE; break;
6650 } else {
6651 Invert = true; [[fallthrough]];
6652 }
6653 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6654 case ISD::SETLT: Swap = true; [[fallthrough]];
6655 case ISD::SETGT: Opc = ARMCC::GT; break;
6656 case ISD::SETLE: Swap = true; [[fallthrough]];
6657 case ISD::SETGE: Opc = ARMCC::GE; break;
6658 case ISD::SETULT: Swap = true; [[fallthrough]];
6659 case ISD::SETUGT: Opc = ARMCC::HI; break;
6660 case ISD::SETULE: Swap = true; [[fallthrough]];
6661 case ISD::SETUGE: Opc = ARMCC::HS; break;
6662 }
6663
6664 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6665 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6666 SDValue AndOp;
6668 AndOp = Op0;
6669 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6670 AndOp = Op1;
6671
6672 // Ignore bitconvert.
6673 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6674 AndOp = AndOp.getOperand(0);
6675
6676 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6677 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6678 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6679 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6680 if (!Invert)
6681 Result = DAG.getNOT(dl, Result, VT);
6682 return Result;
6683 }
6684 }
6685 }
6686
6687 if (Swap)
6688 std::swap(Op0, Op1);
6689
6690 // If one of the operands is a constant vector zero, attempt to fold the
6691 // comparison to a specialized compare-against-zero form.
6693 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6694 Opc == ARMCC::NE)) {
6695 if (Opc == ARMCC::GE)
6696 Opc = ARMCC::LE;
6697 else if (Opc == ARMCC::GT)
6698 Opc = ARMCC::LT;
6699 std::swap(Op0, Op1);
6700 }
6701
6702 SDValue Result;
6704 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6705 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6706 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6707 DAG.getConstant(Opc, dl, MVT::i32));
6708 else
6709 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6710 DAG.getConstant(Opc, dl, MVT::i32));
6711
6712 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6713
6714 if (Invert)
6715 Result = DAG.getNOT(dl, Result, VT);
6716
6717 return Result;
6718}
6719
6721 SDValue LHS = Op.getOperand(0);
6722 SDValue RHS = Op.getOperand(1);
6723
6724 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6725
6726 SDValue Carry = Op.getOperand(2);
6727 SDValue Cond = Op.getOperand(3);
6728 SDLoc DL(Op);
6729
6730 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6731 // have to invert the carry first.
6732 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
6733
6734 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6735 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, InvCarry);
6736
6737 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6738 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6739 SDValue ARMcc = DAG.getConstant(
6740 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6741 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6742 Cmp.getValue(1));
6743}
6744
/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
/// valid vector constant for a NEON or MVE instruction with a "modified
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
///
/// \param SplatBits     the splatted constant, held in the low \p SplatBitSize
///                      bits.
/// \param SplatUndef    bits of the splat that come from UNDEF lanes; they may
///                      be treated as either 0 or 1 when matching an encoding.
/// \param SplatBitSize  size of the splatted element: 8, 16, 32 or 64 bits.
/// \param VT [out]      on success, set to the vector type the encoded
///                      immediate applies to (chosen from element size and
///                      whether \p VectorVT is 64- or 128-bit).
/// \param VectorVT      the overall vector type being materialized.
/// \param type          which instruction family the immediate is for; some
///                      cmode encodings are only valid for plain VMOV.
/// \return the encoded Op/Cmode+Imm value as a target constant (MVT::i32), or
///         an empty SDValue if no modified-immediate encoding exists.
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                 unsigned SplatBitSize, SelectionDAG &DAG,
                                 const SDLoc &dl, EVT &VT, EVT VectorVT,
                                 VMOVModImmType type) {
  unsigned OpCmode, Imm;
  bool is128Bits = VectorVT.is128BitVector();

  // SplatBitSize is set to the smallest size that splats the vector, so a
  // zero vector will always have SplatBitSize == 8. However, NEON modified
  // immediate instructions others than VMOV do not support the 8-bit encoding
  // of a zero vector, and the default encoding of zero is supposed to be the
  // 32-bit version.
  if (SplatBits == 0)
    SplatBitSize = 32;

  switch (SplatBitSize) {
  case 8:
    // 8-bit immediates only exist in the plain VMOV encoding space.
    if (type != VMOVModImm)
      return SDValue();
    // Any 1-byte value is OK. Op=0, Cmode=1110.
    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
    OpCmode = 0xe;
    Imm = SplatBits;
    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
    break;

  case 16:
    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x00nn: Op=x, Cmode=100x.
      OpCmode = 0x8;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0xnn00: Op=x, Cmode=101x.
      OpCmode = 0xa;
      Imm = SplatBits >> 8;
      break;
    }
    return SDValue();

  case 32:
    // NEON's 32-bit VMOV supports splat values where:
    // * only one byte is nonzero, or
    // * the least significant byte is 0xff and the second byte is nonzero, or
    // * the least significant 2 bytes are 0xff and the third is nonzero.
    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
    if ((SplatBits & ~0xff) == 0) {
      // Value = 0x000000nn: Op=x, Cmode=000x.
      OpCmode = 0;
      Imm = SplatBits;
      break;
    }
    if ((SplatBits & ~0xff00) == 0) {
      // Value = 0x0000nn00: Op=x, Cmode=001x.
      OpCmode = 0x2;
      Imm = SplatBits >> 8;
      break;
    }
    if ((SplatBits & ~0xff0000) == 0) {
      // Value = 0x00nn0000: Op=x, Cmode=010x.
      OpCmode = 0x4;
      Imm = SplatBits >> 16;
      break;
    }
    if ((SplatBits & ~0xff000000) == 0) {
      // Value = 0xnn000000: Op=x, Cmode=011x.
      OpCmode = 0x6;
      Imm = SplatBits >> 24;
      break;
    }

    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
    if (type == OtherModImm) return SDValue();

    // "Shifted ones" forms: UNDEF bits may be assumed to be 0xff to match.
    if ((SplatBits & ~0xffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
      // Value = 0x0000nnff: Op=x, Cmode=1100.
      OpCmode = 0xc;
      Imm = SplatBits >> 8;
      break;
    }

    // cmode == 0b1101 is not supported for MVE VMVN
    if (type == MVEVMVNModImm)
      return SDValue();

    if ((SplatBits & ~0xffffff) == 0 &&
        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
      // Value = 0x00nnffff: Op=x, Cmode=1101.
      OpCmode = 0xd;
      Imm = SplatBits >> 16;
      break;
    }

    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
    // VMOV.I32. A (very) minor optimization would be to replicate the value
    // and fall through here to test for a valid 64-bit splat. But, then the
    // caller would also need to check and handle the change in size.
    return SDValue();

  case 64: {
    if (type != VMOVModImm)
      return SDValue();
    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
    // Each immediate bit selects the value (0x00 or 0xff) of one byte; a
    // byte that is neither (ignoring UNDEF bits) cannot be encoded.
    uint64_t BitMask = 0xff;
    unsigned ImmMask = 1;
    Imm = 0;
    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
        Imm |= ImmMask;
      } else if ((SplatBits & BitMask) != 0) {
        return SDValue();
      }
      BitMask <<= 8;
      ImmMask <<= 1;
    }

    // Op=1, Cmode=1110.
    OpCmode = 0x1e;
    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
    break;
  }

  default:
    llvm_unreachable("unexpected size for isVMOVModifiedImm");
  }

  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
}
6882
6883SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6884 const ARMSubtarget *ST) const {
6885 EVT VT = Op.getValueType();
6886 bool IsDouble = (VT == MVT::f64);
6887 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6888 const APFloat &FPVal = CFP->getValueAPF();
6889
6890 // Prevent floating-point constants from using literal loads
6891 // when execute-only is enabled.
6892 if (ST->genExecuteOnly()) {
6893 // We shouldn't trigger this for v6m execute-only
6894 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
6895 "Unexpected architecture");
6896
6897 // If we can represent the constant as an immediate, don't lower it
6898 if (isFPImmLegal(FPVal, VT))
6899 return Op;
6900 // Otherwise, construct as integer, and move to float register
6901 APInt INTVal = FPVal.bitcastToAPInt();
6902 SDLoc DL(CFP);
6903 switch (VT.getSimpleVT().SimpleTy) {
6904 default:
6905 llvm_unreachable("Unknown floating point type!");
6906 break;
6907 case MVT::f64: {
6908 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6909 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6910 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6911 }
6912 case MVT::f32:
6913 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6914 DAG.getConstant(INTVal, DL, MVT::i32));
6915 }
6916 }
6917
6918 if (!ST->hasVFP3Base())
6919 return SDValue();
6920
6921 // Use the default (constant pool) lowering for double constants when we have
6922 // an SP-only FPU
6923 if (IsDouble && !Subtarget->hasFP64())
6924 return SDValue();
6925
6926 // Try splatting with a VMOV.f32...
6927 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6928
6929 if (ImmVal != -1) {
6930 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6931 // We have code in place to select a valid ConstantFP already, no need to
6932 // do any mangling.
6933 return Op;
6934 }
6935
6936 // It's a float and we are trying to use NEON operations where
6937 // possible. Lower it to a splat followed by an extract.
6938 SDLoc DL(Op);
6939 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6940 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6941 NewVal);
6942 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6943 DAG.getConstant(0, DL, MVT::i32));
6944 }
6945
6946 // The rest of our options are NEON only, make sure that's allowed before
6947 // proceeding..
6948 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6949 return SDValue();
6950
6951 EVT VMovVT;
6952 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6953
6954 // It wouldn't really be worth bothering for doubles except for one very
6955 // important value, which does happen to match: 0.0. So make sure we don't do
6956 // anything stupid.
6957 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6958 return SDValue();
6959
6960 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6961 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6962 VMovVT, VT, VMOVModImm);
6963 if (NewVal != SDValue()) {
6964 SDLoc DL(Op);
6965 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6966 NewVal);
6967 if (IsDouble)
6968 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6969
6970 // It's a float: cast and extract a vector element.
6971 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6972 VecConstant);
6973 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6974 DAG.getConstant(0, DL, MVT::i32));
6975 }
6976
6977 // Finally, try a VMVN.i32
6978 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6979 VT, VMVNModImm);
6980 if (NewVal != SDValue()) {
6981 SDLoc DL(Op);
6982 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6983
6984 if (IsDouble)
6985 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6986
6987 // It's a float: cast and extract a vector element.
6988 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6989 VecConstant);
6990 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6991 DAG.getConstant(0, DL, MVT::i32));
6992 }
6993
6994 return SDValue();
6995}
6996
6997// check if an VEXT instruction can handle the shuffle mask when the
6998// vector sources of the shuffle are the same.
6999static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7000 unsigned NumElts = VT.getVectorNumElements();
7001
7002 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7003 if (M[0] < 0)
7004 return false;
7005
7006 Imm = M[0];
7007
7008 // If this is a VEXT shuffle, the immediate value is the index of the first
7009 // element. The other shuffle indices must be the successive elements after
7010 // the first one.
7011 unsigned ExpectedElt = Imm;
7012 for (unsigned i = 1; i < NumElts; ++i) {
7013 // Increment the expected index. If it wraps around, just follow it
7014 // back to index zero and keep going.
7015 ++ExpectedElt;
7016 if (ExpectedElt == NumElts)
7017 ExpectedElt = 0;
7018
7019 if (M[i] < 0) continue; // ignore UNDEF indices
7020 if (ExpectedElt != static_cast<unsigned>(M[i]))
7021 return false;
7022 }
7023
7024 return true;
7025}
7026
7027static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7028 bool &ReverseVEXT, unsigned &Imm) {
7029 unsigned NumElts = VT.getVectorNumElements();
7030 ReverseVEXT = false;
7031
7032 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7033 if (M[0] < 0)
7034 return false;
7035
7036 Imm = M[0];
7037
7038 // If this is a VEXT shuffle, the immediate value is the index of the first
7039 // element. The other shuffle indices must be the successive elements after
7040 // the first one.
7041 unsigned ExpectedElt = Imm;
7042 for (unsigned i = 1; i < NumElts; ++i) {
7043 // Increment the expected index. If it wraps around, it may still be
7044 // a VEXT but the source vectors must be swapped.
7045 ExpectedElt += 1;
7046 if (ExpectedElt == NumElts * 2) {
7047 ExpectedElt = 0;
7048 ReverseVEXT = true;
7049 }
7050
7051 if (M[i] < 0) continue; // ignore UNDEF indices
7052 if (ExpectedElt != static_cast<unsigned>(M[i]))
7053 return false;
7054 }
7055
7056 // Adjust the index value if the source operands will be swapped.
7057 if (ReverseVEXT)
7058 Imm -= NumElts;
7059
7060 return true;
7061}
7062
7063static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7064 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7065 // range, then 0 is placed into the resulting vector. So pretty much any mask
7066 // of 8 elements can work here.
7067 return VT == MVT::v8i8 && M.size() == 8;
7068}
7069
7070static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7071 unsigned Index) {
7072 if (Mask.size() == Elements * 2)
7073 return Index / Elements;
7074 return Mask[Index] == 0 ? 0 : 1;
7075}
7076
7077// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7078// checking that pairs of elements in the shuffle mask represent the same index
7079// in each vector, incrementing the expected index by 2 at each step.
7080// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7081// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7082// v2={e,f,g,h}
7083// WhichResult gives the offset for each element in the mask based on which
7084// of the two results it belongs to.
7085//
7086// The transpose can be represented either as:
7087// result1 = shufflevector v1, v2, result1_shuffle_mask
7088// result2 = shufflevector v1, v2, result2_shuffle_mask
7089// where v1/v2 and the shuffle masks have the same number of elements
7090// (here WhichResult (see below) indicates which result is being checked)
7091//
7092// or as:
7093// results = shufflevector v1, v2, shuffle_mask
7094// where both results are returned in one vector and the shuffle mask has twice
7095// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7096// want to check the low half and high half of the shuffle mask as if it were
7097// the other case
7098static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7099 unsigned EltSz = VT.getScalarSizeInBits();
7100 if (EltSz == 64)
7101 return false;
7102
7103 unsigned NumElts = VT.getVectorNumElements();
7104 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7105 return false;
7106
7107 // If the mask is twice as long as the input vector then we need to check the
7108 // upper and lower parts of the mask with a matching value for WhichResult
7109 // FIXME: A mask with only even values will be rejected in case the first
7110 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7111 // M[0] is used to determine WhichResult
7112 for (unsigned i = 0; i < M.size(); i += NumElts) {
7113 WhichResult = SelectPairHalf(NumElts, M, i);
7114 for (unsigned j = 0; j < NumElts; j += 2) {
7115 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7116 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7117 return false;
7118 }
7119 }
7120
7121 if (M.size() == NumElts*2)
7122 WhichResult = 0;
7123
7124 return true;
7125}
7126
7127/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7128/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7129/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7130static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7131 unsigned EltSz = VT.getScalarSizeInBits();
7132 if (EltSz == 64)
7133 return false;
7134
7135 unsigned NumElts = VT.getVectorNumElements();
7136 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7137 return false;
7138
7139 for (unsigned i = 0; i < M.size(); i += NumElts) {
7140 WhichResult = SelectPairHalf(NumElts, M, i);
7141 for (unsigned j = 0; j < NumElts; j += 2) {
7142 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7143 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7144 return false;
7145 }
7146 }
7147
7148 if (M.size() == NumElts*2)
7149 WhichResult = 0;
7150
7151 return true;
7152}
7153
7154// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7155// that the mask elements are either all even and in steps of size 2 or all odd
7156// and in steps of size 2.
7157// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7158// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7159// v2={e,f,g,h}
7160// Requires similar checks to that of isVTRNMask with
7161// respect the how results are returned.
7162static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7163 unsigned EltSz = VT.getScalarSizeInBits();
7164 if (EltSz == 64)
7165 return false;
7166
7167 unsigned NumElts = VT.getVectorNumElements();
7168 if (M.size() != NumElts && M.size() != NumElts*2)
7169 return false;
7170
7171 for (unsigned i = 0; i < M.size(); i += NumElts) {
7172 WhichResult = SelectPairHalf(NumElts, M, i);
7173 for (unsigned j = 0; j < NumElts; ++j) {
7174 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7175 return false;
7176 }
7177 }
7178
7179 if (M.size() == NumElts*2)
7180 WhichResult = 0;
7181
7182 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7183 if (VT.is64BitVector() && EltSz == 32)
7184 return false;
7185
7186 return true;
7187}
7188
7189/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7190/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7191/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7192static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7193 unsigned EltSz = VT.getScalarSizeInBits();
7194 if (EltSz == 64)
7195 return false;
7196
7197 unsigned NumElts = VT.getVectorNumElements();
7198 if (M.size() != NumElts && M.size() != NumElts*2)
7199 return false;
7200
7201 unsigned Half = NumElts / 2;
7202 for (unsigned i = 0; i < M.size(); i += NumElts) {
7203 WhichResult = SelectPairHalf(NumElts, M, i);
7204 for (unsigned j = 0; j < NumElts; j += Half) {
7205 unsigned Idx = WhichResult;
7206 for (unsigned k = 0; k < Half; ++k) {
7207 int MIdx = M[i + j + k];
7208 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7209 return false;
7210 Idx += 2;
7211 }
7212 }
7213 }
7214
7215 if (M.size() == NumElts*2)
7216 WhichResult = 0;
7217
7218 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7219 if (VT.is64BitVector() && EltSz == 32)
7220 return false;
7221
7222 return true;
7223}
7224
7225// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7226// that pairs of elements of the shufflemask represent the same index in each
7227// vector incrementing sequentially through the vectors.
7228// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7229// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7230// v2={e,f,g,h}
7231// Requires similar checks to that of isVTRNMask with respect the how results
7232// are returned.
7233static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7234 unsigned EltSz = VT.getScalarSizeInBits();
7235 if (EltSz == 64)
7236 return false;
7237
7238 unsigned NumElts = VT.getVectorNumElements();
7239 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7240 return false;
7241
7242 for (unsigned i = 0; i < M.size(); i += NumElts) {
7243 WhichResult = SelectPairHalf(NumElts, M, i);
7244 unsigned Idx = WhichResult * NumElts / 2;
7245 for (unsigned j = 0; j < NumElts; j += 2) {
7246 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7247 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7248 return false;
7249 Idx += 1;
7250 }
7251 }
7252
7253 if (M.size() == NumElts*2)
7254 WhichResult = 0;
7255
7256 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7257 if (VT.is64BitVector() && EltSz == 32)
7258 return false;
7259
7260 return true;
7261}
7262
7263/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7264/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7265/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7266static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7267 unsigned EltSz = VT.getScalarSizeInBits();
7268 if (EltSz == 64)
7269 return false;
7270
7271 unsigned NumElts = VT.getVectorNumElements();
7272 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7273 return false;
7274
7275 for (unsigned i = 0; i < M.size(); i += NumElts) {
7276 WhichResult = SelectPairHalf(NumElts, M, i);
7277 unsigned Idx = WhichResult * NumElts / 2;
7278 for (unsigned j = 0; j < NumElts; j += 2) {
7279 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7280 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7281 return false;
7282 Idx += 1;
7283 }
7284 }
7285
7286 if (M.size() == NumElts*2)
7287 WhichResult = 0;
7288
7289 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7290 if (VT.is64BitVector() && EltSz == 32)
7291 return false;
7292
7293 return true;
7294}
7295
7296/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7297/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7298static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7299 unsigned &WhichResult,
7300 bool &isV_UNDEF) {
7301 isV_UNDEF = false;
7302 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7303 return ARMISD::VTRN;
7304 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7305 return ARMISD::VUZP;
7306 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7307 return ARMISD::VZIP;
7308
7309 isV_UNDEF = true;
7310 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7311 return ARMISD::VTRN;
7312 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7313 return ARMISD::VUZP;
7314 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7315 return ARMISD::VZIP;
7316
7317 return 0;
7318}
7319
7320/// \return true if this is a reverse operation on an vector.
7321static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7322 unsigned NumElts = VT.getVectorNumElements();
7323 // Make sure the mask has the right size.
7324 if (NumElts != M.size())
7325 return false;
7326
7327 // Look for <15, ..., 3, -1, 1, 0>.
7328 for (unsigned i = 0; i != NumElts; ++i)
7329 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7330 return false;
7331
7332 return true;
7333}
7334
7335static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7336 unsigned NumElts = VT.getVectorNumElements();
7337 // Make sure the mask has the right size.
7338 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7339 return false;
7340
7341 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7342 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7343 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7344 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7345 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7346 int Ofs = Top ? 1 : 0;
7347 int Upper = SingleSource ? 0 : NumElts;
7348 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7349 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7350 return false;
7351 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7352 return false;
7353 }
7354 return true;
7355}
7356
7357static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7358 unsigned NumElts = VT.getVectorNumElements();
7359 // Make sure the mask has the right size.
7360 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7361 return false;
7362
7363 // If Top
7364 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7365 // This inserts Input2 into Input1
7366 // else if not Top
7367 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7368 // This inserts Input1 into Input2
7369 unsigned Offset = Top ? 0 : 1;
7370 unsigned N = SingleSource ? 0 : NumElts;
7371 for (unsigned i = 0; i < NumElts; i += 2) {
7372 if (M[i] >= 0 && M[i] != (int)i)
7373 return false;
7374 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7375 return false;
7376 }
7377
7378 return true;
7379}
7380
7381static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7382 unsigned NumElts = ToVT.getVectorNumElements();
7383 if (NumElts != M.size())
7384 return false;
7385
7386 // Test if the Trunc can be convertible to a VMOVN with this shuffle. We are
7387 // looking for patterns of:
7388 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7389 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7390
7391 unsigned Off0 = rev ? NumElts / 2 : 0;
7392 unsigned Off1 = rev ? 0 : NumElts / 2;
7393 for (unsigned i = 0; i < NumElts; i += 2) {
7394 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7395 return false;
7396 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7397 return false;
7398 }
7399
7400 return true;
7401}
7402
7403// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7404// from a pair of inputs. For example:
7405// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7406// FP_ROUND(EXTRACT_ELT(Y, 0),
7407// FP_ROUND(EXTRACT_ELT(X, 1),
7408// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7410 const ARMSubtarget *ST) {
7411 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7412 if (!ST->hasMVEFloatOps())
7413 return SDValue();
7414
7415 SDLoc dl(BV);
7416 EVT VT = BV.getValueType();
7417 if (VT != MVT::v8f16)
7418 return SDValue();
7419
7420 // We are looking for a buildvector of fptrunc elements, where all the
7421 // elements are interleavingly extracted from two sources. Check the first two
7422 // items are valid enough and extract some info from them (they are checked
7423 // properly in the loop below).
7424 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7427 return SDValue();
7428 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7431 return SDValue();
7432 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7433 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7434 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7435 return SDValue();
7436
7437 // Check all the values in the BuildVector line up with our expectations.
7438 for (unsigned i = 1; i < 4; i++) {
7439 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7440 return Trunc.getOpcode() == ISD::FP_ROUND &&
7442 Trunc.getOperand(0).getOperand(0) == Op &&
7443 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7444 };
7445 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7446 return SDValue();
7447 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7448 return SDValue();
7449 }
7450
7451 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7452 DAG.getConstant(0, dl, MVT::i32));
7453 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7454 DAG.getConstant(1, dl, MVT::i32));
7455}
7456
7457// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7458// from a single input on alternating lanes. For example:
7459// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7460// FP_ROUND(EXTRACT_ELT(X, 2),
7461// FP_ROUND(EXTRACT_ELT(X, 4), ...)
// NOTE(review): original line 7462 (the function's opening signature line) is
// missing from this listing.
7463 const ARMSubtarget *ST) {
7464 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
// ARMISD::VCVTL (widening convert) is MVE-only.
7465 if (!ST->hasMVEFloatOps())
7466 return SDValue();
7467
7468 SDLoc dl(BV);
7469 EVT VT = BV.getValueType();
// Only the v4f32 result (one half of a v8f16 source widened) is handled.
7470 if (VT != MVT::v4f32)
7471 return SDValue();
7472
7473 // We are looking for a buildvector of fptext elements, where all the
7474 // elements are alternating lanes from a single source. For example <0,2,4,6>
7475 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7476 // info from them (they are checked properly in the loop below).
7477 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
// NOTE(review): original line 7478 (remainder of this condition) is missing
// from this listing.
7479 return SDValue();
7480 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
// NOTE(review): original line 7481 is missing from this listing -- presumably
// it computes 'Offset' from the first element's extract index; confirm
// against upstream.
7482 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7483 return SDValue();
7484
7485 // Check all the values in the BuildVector line up with our expectations.
7486 for (unsigned i = 1; i < 4; i++) {
// The lambda verifies that element Trunc is FP_EXTEND(EXTRACT_ELT(Op, Idx)).
7487 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7488 return Trunc.getOpcode() == ISD::FP_EXTEND &&
// NOTE(review): original line 7489 (one conjunct of this predicate) is
// missing from this listing.
7490 Trunc.getOperand(0).getOperand(0) == Op &&
7491 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7492 };
// Element i must extract lane 2*i + Offset of the single v8f16 source.
7493 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7494 return SDValue();
7495 }
7496
// The Offset immediate selects the bottom (0) or top (1) alternating lanes.
7497 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7498 DAG.getConstant(Offset, dl, MVT::i32));
7499}
7500
7501// If N is an integer constant that can be moved into a register in one
7502// instruction, return an SDValue of such a constant (will become a MOV
7503// instruction). Otherwise return null.
// NOTE(review): original line 7504 (the function's opening signature line) is
// missing from this listing.
7505 const ARMSubtarget *ST, const SDLoc &dl) {
7506 uint64_t Val;
7507 if (!isa<ConstantSDNode>(N))
7508 return SDValue();
7509 Val = N->getAsZExtVal();
7510
7511 if (ST->isThumb1Only()) {
// Thumb1: an 8-bit immediate, or one whose bitwise complement fits in 8 bits
// (i.e. materializable with a single MVN), takes one instruction.
7512 if (Val <= 255 || ~Val <= 255)
7513 return DAG.getConstant(Val, dl, MVT::i32);
7514 } else {
// ARM/Thumb2: check the shifter-operand immediate encoding for both the
// value and its complement.
7515 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7516 return DAG.getConstant(Val, dl, MVT::i32);
7517 }
7518 return SDValue();
7519}
7520
// Lowers a BUILD_VECTOR of i1 elements (an MVE predicate vector) by packing
// the known-constant lanes into a 32-bit immediate and PREDICATE_CASTing it,
// then inserting any non-constant lanes individually.
// NOTE(review): original line 7521 (the function's opening signature line) is
// missing from this listing.
7522 const ARMSubtarget *ST) {
7523 SDLoc dl(Op);
7524 EVT VT = Op.getValueType();
7525
7526 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7527
7528 unsigned NumElts = VT.getVectorNumElements();
7529 unsigned BoolMask;
7530 unsigned BitsPerBool;
// Each boolean lane occupies 16/NumElts bits of the packed predicate, so a
// set lane is represented by BoolMask repeated at stride BitsPerBool.
7531 if (NumElts == 2) {
7532 BitsPerBool = 8;
7533 BoolMask = 0xff;
7534 } else if (NumElts == 4) {
7535 BitsPerBool = 4;
7536 BoolMask = 0xf;
7537 } else if (NumElts == 8) {
7538 BitsPerBool = 2;
7539 BoolMask = 0x3;
7540 } else if (NumElts == 16) {
7541 BitsPerBool = 1;
7542 BoolMask = 0x1;
7543 } else
7544 return SDValue();
7545
7546 // If this is a single value copied into all lanes (a splat), we can just sign
7547 // extend that single value
7548 SDValue FirstOp = Op.getOperand(0);
7549 if (!isa<ConstantSDNode>(FirstOp) &&
7550 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7551 return U.get().isUndef() || U.get() == FirstOp;
7552 })) {
// Sign-extending the i1 yields all-ones/all-zeroes in i32, which is then
// reinterpreted as the predicate register contents.
7553 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7554 DAG.getValueType(MVT::i1));
7555 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7556 }
7557
7558 // First create base with bits set where known
7559 unsigned Bits32 = 0;
7560 for (unsigned i = 0; i < NumElts; ++i) {
7561 SDValue V = Op.getOperand(i);
// Skip non-constant lanes here; they are inserted in the loop below.
7562 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7563 continue;
// Undef lanes are treated as false.
7564 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7565 if (BitSet)
7566 Bits32 |= BoolMask << (i * BitsPerBool);
7567 }
7568
7569 // Add in unknown nodes
7570 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7571 DAG.getConstant(Bits32, dl, MVT::i32));
7572 for (unsigned i = 0; i < NumElts; ++i) {
7573 SDValue V = Op.getOperand(i);
7574 if (isa<ConstantSDNode>(V) || V.isUndef())
7575 continue;
7576 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7577 DAG.getConstant(i, dl, MVT::i32));
7578 }
7579
7580 return Base;
7581}
7582
// Matches BUILD_VECTOR(X, X+N, X+2N, ...) and lowers it to the MVE VIDUP
// (incrementing dup) node. Returns SDValue() if the pattern does not match.
// NOTE(review): original line 7583 (the function's opening signature line) is
// missing from this listing.
7584 const ARMSubtarget *ST) {
7585 if (!ST->hasMVEIntegerOps())
7586 return SDValue();
7587
7588 // We are looking for a buildvector where each element is Op[0] + i*N
7589 EVT VT = Op.getValueType();
7590 SDValue Op0 = Op.getOperand(0);
7591 unsigned NumElts = VT.getVectorNumElements();
7592
7593 // Get the increment value from operand 1
7594 SDValue Op1 = Op.getOperand(1);
7595 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
// NOTE(review): original line 7596 (remainder of this condition) is missing
// from this listing.
7597 return SDValue();
7598 unsigned N = Op1.getConstantOperandVal(1);
// Only increments of 1, 2, 4 or 8 are accepted.
7599 if (N != 1 && N != 2 && N != 4 && N != 8)
7600 return SDValue();
7601
7602 // Check that each other operand matches
7603 for (unsigned I = 2; I < NumElts; I++) {
7604 SDValue OpI = Op.getOperand(I);
7605 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
// NOTE(review): original line 7606 (one conjunct of this condition) is
// missing from this listing.
7607 OpI.getConstantOperandVal(1) != I * N)
7608 return SDValue();
7609 }
7610
7611 SDLoc DL(Op);
// Per the VTList, VIDUP also produces a second i32 result (the written-back
// scalar); only the vector result is returned here.
7612 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7613 DAG.getConstant(N, DL, MVT::i32));
7614}
7615
7616// Returns true if the operation N can be treated as qr instruction variant at
7617// operand Op.
7618static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7619 switch (N->getOpcode()) {
// Commutative operations: Op may sit in either operand position.
7620 case ISD::ADD:
7621 case ISD::MUL:
7622 case ISD::SADDSAT:
7623 case ISD::UADDSAT:
7624 case ISD::AVGFLOORS:
7625 case ISD::AVGFLOORU:
7626 return true;
// Non-commutative operations: only the right-hand operand has a qr form.
7627 case ISD::SUB:
7628 case ISD::SSUBSAT:
7629 case ISD::USUBSAT:
7630 return N->getOperand(1).getNode() == Op;
// NOTE(review): original line 7631 is missing from this listing -- given the
// getConstantOperandVal(0) dispatch below, it is presumably the
// ISD::INTRINSIC_WO_CHAIN case label; confirm against upstream.
7632 switch (N->getConstantOperandVal(0)) {
// Commutative MVE intrinsics.
7633 case Intrinsic::arm_mve_add_predicated:
7634 case Intrinsic::arm_mve_mul_predicated:
7635 case Intrinsic::arm_mve_qadd_predicated:
7636 case Intrinsic::arm_mve_vhadd:
7637 case Intrinsic::arm_mve_hadd_predicated:
7638 case Intrinsic::arm_mve_vqdmulh:
7639 case Intrinsic::arm_mve_qdmulh_predicated:
7640 case Intrinsic::arm_mve_vqrdmulh:
7641 case Intrinsic::arm_mve_qrdmulh_predicated:
7642 case Intrinsic::arm_mve_vqdmull:
7643 case Intrinsic::arm_mve_vqdmull_predicated:
7644 return true;
// Non-commutative MVE intrinsics: Op must be the second data operand
// (operand 2, after the intrinsic ID at operand 0).
7645 case Intrinsic::arm_mve_sub_predicated:
7646 case Intrinsic::arm_mve_qsub_predicated:
7647 case Intrinsic::arm_mve_vhsub:
7648 case Intrinsic::arm_mve_hsub_predicated:
7649 return N->getOperand(2).getNode() == Op;
7650 default:
7651 return false;
7652 }
7653 default:
7654 return false;
7655 }
7656}
7657
7658// If this is a case we can't handle, return null and let the default
7659// expansion code take care of it.
// Strategy, in order: i1 predicate vectors -> LowerBUILD_VECTOR_i1; VIDUP
// patterns; constant splats via VDUP/VMOV/VMVN immediates; dominant-value
// splat + lane inserts; shuffle reconstruction; VCVT reconstruction; 128-bit
// split; ARMISD::BUILD_VECTOR for wide elements; INSERT_VECTOR_ELT chain.
7660SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7661 const ARMSubtarget *ST) const {
7662 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7663 SDLoc dl(Op);
7664 EVT VT = Op.getValueType();
7665
7666 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7667 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7668
7669 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7670 return R;
7671
7672 APInt SplatBits, SplatUndef;
7673 unsigned SplatBitSize;
7674 bool HasAnyUndefs;
7675 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
// Every lane undef: the whole vector is undef.
7676 if (SplatUndef.isAllOnes())
7677 return DAG.getUNDEF(VT);
7678
7679 // If all the users of this constant splat are qr instruction variants,
7680 // generate a vdup of the constant.
7681 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7682 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7683 all_of(BVN->users(),
7684 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7685 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7686 : SplatBitSize == 16 ? MVT::v8i16
7687 : MVT::v16i8;
7688 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7689 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7690 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7691 }
7692
7693 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7694 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7695 // Check if an immediate VMOV works.
7696 EVT VmovVT;
7697 SDValue Val =
7698 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7699 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7700
7701 if (Val.getNode()) {
7702 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7703 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7704 }
7705
7706 // Try an immediate VMVN.
7707 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7708 Val = isVMOVModifiedImm(
7709 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7710 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7711 if (Val.getNode()) {
7712 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7713 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7714 }
7715
7716 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7717 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7718 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7719 if (ImmVal != -1) {
7720 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7721 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7722 }
7723 }
7724
7725 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7726 // type.
7727 if (ST->hasMVEIntegerOps() &&
7728 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7729 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7730 : SplatBitSize == 16 ? MVT::v8i16
7731 : MVT::v16i8;
7732 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7733 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7734 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7735 }
7736 }
7737 }
7738
7739 // Scan through the operands to see if only one value is used.
7740 //
7741 // As an optimisation, even if more than one value is used it may be more
7742 // profitable to splat with one value then change some lanes.
7743 //
7744 // Heuristically we decide to do this if the vector has a "dominant" value,
7745 // defined as splatted to more than half of the lanes.
7746 unsigned NumElts = VT.getVectorNumElements();
7747 bool isOnlyLowElement = true;
7748 bool usesOnlyOneValue = true;
7749 bool hasDominantValue = false;
7750 bool isConstant = true;
7751
7752 // Map of the number of times a particular SDValue appears in the
7753 // element list.
7754 DenseMap<SDValue, unsigned> ValueCounts;
7755 SDValue Value;
7756 for (unsigned i = 0; i < NumElts; ++i) {
7757 SDValue V = Op.getOperand(i);
7758 if (V.isUndef())
7759 continue;
7760 if (i > 0)
7761 isOnlyLowElement = false;
// NOTE(review): original line 7762 (the condition guarding isConstant,
// presumably a non-constant check on V) is missing from this listing.
7763 isConstant = false;
7764
7765 unsigned &Count = ValueCounts[V];
7766
7767 // Is this value dominant? (takes up more than half of the lanes)
7768 if (++Count > (NumElts / 2)) {
7769 hasDominantValue = true;
7770 Value = V;
7771 }
7772 }
7773 if (ValueCounts.size() != 1)
7774 usesOnlyOneValue = false;
// Without a dominant value, fall back to any value seen for later checks.
7775 if (!Value.getNode() && !ValueCounts.empty())
7776 Value = ValueCounts.begin()->first;
7777
7778 if (ValueCounts.empty())
7779 return DAG.getUNDEF(VT);
7780
7781 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7782 // Keep going if we are hitting this case.
7783 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7784 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7785
7786 unsigned EltSize = VT.getScalarSizeInBits();
7787
7788 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7789 // i32 and try again.
7790 if (hasDominantValue && EltSize <= 32) {
7791 if (!isConstant) {
7792 SDValue N;
7793
7794 // If we are VDUPing a value that comes directly from a vector, that will
7795 // cause an unnecessary move to and from a GPR, where instead we could
7796 // just use VDUPLANE. We can only do this if the lane being extracted
7797 // is at a constant index, as the VDUP from lane instructions only have
7798 // constant-index forms.
7799 ConstantSDNode *constIndex;
7800 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7801 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7802 // We need to create a new undef vector to use for the VDUPLANE if the
7803 // size of the vector from which we get the value is different than the
7804 // size of the vector that we need to create. We will insert the element
7805 // such that the register coalescer will remove unnecessary copies.
7806 if (VT != Value->getOperand(0).getValueType()) {
7807 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
// NOTE(review): original line 7808 (the modulus operand, presumably the
// element count of VT) is missing from this listing.
7809 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7810 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7811 Value, DAG.getConstant(index, dl, MVT::i32)),
7812 DAG.getConstant(index, dl, MVT::i32));
7813 } else
7814 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7815 Value->getOperand(0), Value->getOperand(1));
7816 } else
7817 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7818
7819 if (!usesOnlyOneValue) {
7820 // The dominant value was splatted as 'N', but we now have to insert
7821 // all differing elements.
7822 for (unsigned I = 0; I < NumElts; ++I) {
7823 if (Op.getOperand(I) == Value)
7824 continue;
// NOTE(review): original line 7825 (presumably the declaration of the
// operand vector 'Ops') is missing from this listing.
7826 Ops.push_back(N);
7827 Ops.push_back(Op.getOperand(I));
7828 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7829 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7830 }
7831 }
7832 return N;
7833 }
// NOTE(review): original lines 7834-7835 are missing from this listing --
// presumably the floating-point-element guard and a declaration of 'Ops'
// for the i32/i16 retry path below.
7836 MVT FVT = VT.getVectorElementType().getSimpleVT();
7837 assert(FVT == MVT::f32 || FVT == MVT::f16);
7838 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
// Bitcast each FP element to the same-width integer and recurse, so the
// integer splat/immediate paths above get a second chance.
7839 for (unsigned i = 0; i < NumElts; ++i)
7840 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7841 Op.getOperand(i)));
7842 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7843 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7844 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7845 if (Val.getNode())
7846 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7847 }
7848 if (usesOnlyOneValue) {
7849 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7850 if (isConstant && Val.getNode())
7851 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7852 }
7853 }
7854
7855 // If all elements are constants and the case above didn't get hit, fall back
7856 // to the default expansion, which will generate a load from the constant
7857 // pool.
7858 if (isConstant)
7859 return SDValue();
7860
7861 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7862 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7863 // length <= 2.
7864 if (NumElts >= 4)
7865 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7866 return shuffle;
7867
7868 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7869 // VCVT's
7870 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7871 return VCVT;
7872 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7873 return VCVT;
7874
7875 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7876 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7877 // into two 64-bit vectors; we might discover a better way to lower it.
7878 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7879 EVT ExtVT = VT.getVectorElementType();
7880 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7881 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
// getBuildVector may constant-fold; only recurse on a real BUILD_VECTOR.
7882 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7883 Lower = LowerBUILD_VECTOR(Lower, DAG, ST)
7884 SDValue Upper =
7885 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
7886 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7887 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7888 if (Lower && Upper)
7889 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7890 }
7891
7892 // Vectors with 32- or 64-bit elements can be built by directly assigning
7893 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7894 // will be legalized.
7895 if (EltSize >= 32) {
7896 // Do the expansion with floating-point types, since that is what the VFP
7897 // registers are defined to use, and since i64 is not legal.
7898 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7899 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
// NOTE(review): original line 7900 (presumably the declaration of 'Ops') is
// missing from this listing.
7901 for (unsigned i = 0; i < NumElts; ++i)
7902 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7903 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7904 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7905 }
7906
7907 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7908 // know the default expansion would otherwise fall back on something even
7909 // worse. For a vector with one or two non-undef values, that's
7910 // scalar_to_vector for the elements followed by a shuffle (provided the
7911 // shuffle is valid for the target) and materialization element by element
7912 // on the stack followed by a load for everything else.
7913 if (!isConstant && !usesOnlyOneValue) {
7914 SDValue Vec = DAG.getUNDEF(VT);
7915 for (unsigned i = 0 ; i < NumElts; ++i) {
7916 SDValue V = Op.getOperand(i);
7917 if (V.isUndef())
7918 continue;
7919 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7920 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7921 }
7922 return Vec;
7923 }
7924
7925 return SDValue();
7926}
7927
7928// Gather data to see if the operation can be modelled as a
7929// shuffle in combination with VEXTs.
// Returns a VECTOR_SHUFFLE (via buildLegalVectorShuffle) equivalent to this
// BUILD_VECTOR of extractelts, or SDValue() when no legal shuffle exists.
7930SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7931 SelectionDAG &DAG) const {
7932 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7933 SDLoc dl(Op);
7934 EVT VT = Op.getValueType();
7935 unsigned NumElts = VT.getVectorNumElements();
7936
// Per-source bookkeeping: the original vector, the lane range used from it,
// and the (possibly widened/narrowed/shifted) vector actually fed to the
// final shuffle.
7937 struct ShuffleSourceInfo {
7938 SDValue Vec;
7939 unsigned MinElt = std::numeric_limits<unsigned>::max();
7940 unsigned MaxElt = 0;
7941
7942 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7943 // be compatible with the shuffle we intend to construct. As a result
7944 // ShuffleVec will be some sliding window into the original Vec.
7945 SDValue ShuffleVec;
7946
7947 // Code should guarantee that element i in Vec starts at element "WindowBase
7948 // + i * WindowScale in ShuffleVec".
7949 int WindowBase = 0;
7950 int WindowScale = 1;
7951
7952 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7953
7954 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7955 };
7956
7957 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7958 // node.
// NOTE(review): original line 7959 (presumably the declaration of the
// 'Sources' collection used throughout below) is missing from this listing.
7960 for (unsigned i = 0; i < NumElts; ++i) {
7961 SDValue V = Op.getOperand(i);
7962 if (V.isUndef())
7963 continue;
7964 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7965 // A shuffle can only come from building a vector from various
7966 // elements of other vectors.
7967 return SDValue();
7968 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7969 // Furthermore, shuffles require a constant mask, whereas extractelts
7970 // accept variable indices.
7971 return SDValue();
7972 }
7973
7974 // Add this element source to the list if it's not already there.
7975 SDValue SourceVec = V.getOperand(0);
7976 auto Source = llvm::find(Sources, SourceVec);
7977 if (Source == Sources.end())
7978 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7979
7980 // Update the minimum and maximum lane number seen.
7981 unsigned EltNo = V.getConstantOperandVal(1);
7982 Source->MinElt = std::min(Source->MinElt, EltNo);
7983 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7984 }
7985
7986 // Currently only do something sane when at most two source vectors
7987 // are involved.
7988 if (Sources.size() > 2)
7989 return SDValue();
7990
7991 // Find out the smallest element size among result and two sources, and use
7992 // it as element size to build the shuffle_vector.
7993 EVT SmallestEltTy = VT.getVectorElementType();
7994 for (auto &Source : Sources) {
7995 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7996 if (SrcEltTy.bitsLT(SmallestEltTy))
7997 SmallestEltTy = SrcEltTy;
7998 }
// ResMultiplier = how many shuffle lanes one result element spans.
7999 unsigned ResMultiplier =
8000 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
// From here on, NumElts is the lane count of the shuffle type, not of VT.
8001 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8002 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8003
8004 // If the source vector is too wide or too narrow, we may nevertheless be able
8005 // to construct a compatible shuffle either by concatenating it with UNDEF or
8006 // extracting a suitable range of elements.
8007 for (auto &Src : Sources) {
8008 EVT SrcVT = Src.ShuffleVec.getValueType();
8009
8010 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8011 uint64_t VTSize = VT.getFixedSizeInBits();
8012 if (SrcVTSize == VTSize)
8013 continue;
8014
8015 // This stage of the search produces a source with the same element type as
8016 // the original, but with a total width matching the BUILD_VECTOR output.
8017 EVT EltVT = SrcVT.getVectorElementType();
8018 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8019 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8020
8021 if (SrcVTSize < VTSize) {
// Only exactly-half-width sources can be padded with UNDEF.
8022 if (2 * SrcVTSize != VTSize)
8023 return SDValue();
8024 // We can pad out the smaller vector for free, so if it's part of a
8025 // shuffle...
8026 Src.ShuffleVec =
8027 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8028 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8029 continue;
8030 }
8031
// Likewise, only exactly-double-width sources can be narrowed.
8032 if (SrcVTSize != 2 * VTSize)
8033 return SDValue();
8034
8035 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8036 // Span too large for a VEXT to cope
8037 return SDValue();
8038 }
8039
8040 if (Src.MinElt >= NumSrcElts) {
8041 // The extraction can just take the second half
8042 Src.ShuffleVec =
8043 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8044 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8045 Src.WindowBase = -NumSrcElts;
8046 } else if (Src.MaxElt < NumSrcElts) {
8047 // The extraction can just take the first half
8048 Src.ShuffleVec =
8049 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8050 DAG.getConstant(0, dl, MVT::i32));
8051 } else {
8052 // An actual VEXT is needed
8053 SDValue VEXTSrc1 =
8054 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8055 DAG.getConstant(0, dl, MVT::i32));
8056 SDValue VEXTSrc2 =
8057 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8058 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8059
8060 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8061 VEXTSrc2,
8062 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8063 Src.WindowBase = -Src.MinElt;
8064 }
8065 }
8066
8067 // Another possible incompatibility occurs from the vector element types. We
8068 // can fix this by bitcasting the source vectors to the same type we intend
8069 // for the shuffle.
8070 for (auto &Src : Sources) {
8071 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8072 if (SrcEltTy == SmallestEltTy)
8073 continue;
8074 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8075 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8076 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8077 Src.WindowBase *= Src.WindowScale;
8078 }
8079
8080 // Final check before we try to actually produce a shuffle.
8081 LLVM_DEBUG({
8082 for (auto Src : Sources)
8083 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8084 });
8085
8086 // The stars all align, our next step is to produce the mask for the shuffle.
8087 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8088 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8089 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8090 SDValue Entry = Op.getOperand(i);
8091 if (Entry.isUndef())
8092 continue;
8093
8094 auto Src = llvm::find(Sources, Entry.getOperand(0));
8095 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8096
8097 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8098 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8099 // segment.
8100 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8101 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8102 VT.getScalarSizeInBits());
8103 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8104
8105 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8106 // starting at the appropriate offset.
8107 int *LaneMask = &Mask[i * ResMultiplier];
8108
// Second source's lanes are numbered after the first's in the mask.
8109 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8110 ExtractBase += NumElts * (Src - Sources.begin());
8111 for (int j = 0; j < LanesDefined; ++j)
8112 LaneMask[j] = ExtractBase + j;
8113 }
8114
8115
8116 // We can't handle more than two sources. This should have already
8117 // been checked before this point.
8118 assert(Sources.size() <= 2 && "Too many sources!");
8119
8120 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8121 for (unsigned i = 0; i < Sources.size(); ++i)
8122 ShuffleOps[i] = Sources[i].ShuffleVec;
8123
8124 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8125 ShuffleOps[1], Mask, DAG);
8126 if (!Shuffle)
8127 return SDValue();
8128 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8129}
8130
// Opcodes stored in perfect-shuffle table entries (bits [29:26]).
// NOTE(review): the enum's opening line (original 8131) and enumerators on
// original lines 8133-8140 are missing from this listing -- from their uses
// below they presumably include OP_VREV, OP_VDUP0..OP_VDUP3 and
// OP_VEXT1..OP_VEXT3; confirm against upstream.
8132 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8141 OP_VUZPL, // VUZP, left result
8142 OP_VUZPR, // VUZP, right result
8143 OP_VZIPL, // VZIP, left result
8144 OP_VZIPR, // VZIP, right result
8145 OP_VTRNL, // VTRN, left result
8146 OP_VTRNR // VTRN, right result
8147};
8148
8149static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8150 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8151 switch (OpNum) {
8152 case OP_COPY:
8153 case OP_VREV:
8154 case OP_VDUP0:
8155 case OP_VDUP1:
8156 case OP_VDUP2:
8157 case OP_VDUP3:
8158 return true;
8159 }
8160 return false;
8161}
8162
8163/// isShuffleMaskLegal - Targets can use this to indicate that they only
8164/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8165/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8166/// are assumed to be legal.
// NOTE(review): original line 8167 (the function's opening signature line) is
// missing from this listing.
8168 if (VT.getVectorNumElements() == 4 &&
8169 (VT.is128BitVector() || VT.is64BitVector())) {
// 4-element shuffles: consult the perfect-shuffle table. Index 8 encodes an
// undef lane (table is base-9 per lane).
8170 unsigned PFIndexes[4];
8171 for (unsigned i = 0; i != 4; ++i) {
8172 if (M[i] < 0)
8173 PFIndexes[i] = 8;
8174 else
8175 PFIndexes[i] = M[i];
8176 }
8177
8178 // Compute the index in the perfect shuffle table.
8179 unsigned PFTableIndex =
8180 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8181 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
// Cost is stored in the top two bits of the entry.
8182 unsigned Cost = (PFEntry >> 30);
8183
8184 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8185 return true;
8186 }
8187
// Out-parameters for the mask classifiers below; values are unused here.
8188 bool ReverseVEXT, isV_UNDEF;
8189 unsigned Imm, WhichResult;
8190
8191 unsigned EltSize = VT.getScalarSizeInBits();
8192 if (EltSize >= 32 ||
// NOTE(review): original line 8193 (one disjunct of this condition) is
// missing from this listing.
8194 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8195 isVREVMask(M, VT, 64) ||
8196 isVREVMask(M, VT, 32) ||
8197 isVREVMask(M, VT, 16))
8198 return true;
8199 else if (Subtarget->hasNEON() &&
8200 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8201 isVTBLMask(M, VT) ||
8202 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8203 return true;
8204 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8205 isReverseMask(M, VT))
8206 return true;
8207 else if (Subtarget->hasMVEIntegerOps() &&
8208 (isVMOVNMask(M, VT, true, false) ||
8209 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8210 return true;
8211 else if (Subtarget->hasMVEIntegerOps() &&
8212 (isTruncMask(M, VT, false, false) ||
8213 isTruncMask(M, VT, false, true) ||
8214 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8215 return true;
8216 else
8217 return false;
8218}
8219
8220/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8221/// the specified operations to build the shuffle.
// NOTE(review): original line 8222 (the function's opening signature line,
// carrying the PFEntry and LHS parameters) is missing from this listing.
8223 SDValue RHS, SelectionDAG &DAG,
8224 const SDLoc &dl) {
// Entry layout: op in bits [29:26], left child id in [25:13], right child id
// in [12:0]; child ids recursively index back into the table.
8225 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8226 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8227 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8228
8229 if (OpNum == OP_COPY) {
// Id <0,1,2,3> means "take LHS as-is"; <4,5,6,7> means "take RHS as-is".
8230 if (LHSID == (1*9+2)*9+3) return LHS;
8231 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8232 return RHS;
8233 }
8234
// Recursively materialize both children before combining them.
8235 SDValue OpLHS, OpRHS;
8236 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8237 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8238 EVT VT = OpLHS.getValueType();
8239
8240 switch (OpNum) {
8241 default: llvm_unreachable("Unknown shuffle opcode!");
8242 case OP_VREV:
8243 // VREV divides the vector in half and swaps within the half.
8244 if (VT.getScalarSizeInBits() == 32)
8245 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8246 // vrev <4 x i16> -> VREV32
8247 if (VT.getScalarSizeInBits() == 16)
8248 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8249 // vrev <4 x i8> -> VREV16
8250 assert(VT.getScalarSizeInBits() == 8);
8251 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8252 case OP_VDUP0:
8253 case OP_VDUP1:
8254 case OP_VDUP2:
8255 case OP_VDUP3:
// Lane number is encoded as the offset from OP_VDUP0.
8256 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8257 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8258 case OP_VEXT1:
8259 case OP_VEXT2:
8260 case OP_VEXT3:
// Extract amount is 1-based relative to OP_VEXT1.
8261 return DAG.getNode(ARMISD::VEXT, dl, VT,
8262 OpLHS, OpRHS,
8263 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8264 case OP_VUZPL:
8265 case OP_VUZPR:
// The two-result nodes return left/right as result 0/1.
8266 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8267 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8268 case OP_VZIPL:
8269 case OP_VZIPR:
8270 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8271 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8272 case OP_VTRNL:
8273 case OP_VTRNR:
8274 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8275 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8276 }
8277}
8278
// Lowers a v8i8 shuffle to VTBL1 (one source) or VTBL2 (two sources), using
// the shuffle mask itself as the table-lookup index vector.
// NOTE(review): original line 8279 (the function's opening signature line) is
// missing from this listing.
8280 ArrayRef<int> ShuffleMask,
8281 SelectionDAG &DAG) {
8282 // Check to see if we can use the VTBL instruction.
8283 SDValue V1 = Op.getOperand(0);
8284 SDValue V2 = Op.getOperand(1);
8285 SDLoc DL(Op);
8286
// Signed constants so that -1 (undef lane) survives into the index vector.
8287 SmallVector<SDValue, 8> VTBLMask;
8288 for (int I : ShuffleMask)
8289 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8290
8291 if (V2.getNode()->isUndef())
8292 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8293 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8294
8295 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8296 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8297}
8298
// Lowers a full-reverse shuffle of v8i16/v8f16/v16i8 as VREV64 (reverses
// within each doubleword) followed by a shuffle that swaps the two halves.
// NOTE(review): original line 8299 (the function's opening signature line) is
// missing from this listing.
8300 SDLoc DL(Op);
8301 EVT VT = Op.getValueType();
8302
8303 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8304 "Expect an v8i16/v16i8 type");
8305 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8306 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8307 // extract the first 8 bytes into the top double word and the last 8 bytes
8308 // into the bottom double word, through a new vector shuffle that will be
8309 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8310 std::vector<int> NewMask;
// Mask <N/2, ..., N-1, 0, ..., N/2-1>: second half first, then first half.
8311 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8312 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8313 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8314 NewMask.push_back(i);
8315 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8316}
8317
// Maps an MVE predicate vector type (vNi1) to the equally-laned full-width
// vector type it predicates.
// NOTE(review): original line 8318 (the function's opening signature line) is
// missing from this listing.
8319 switch (VT.getSimpleVT().SimpleTy) {
8320 case MVT::v2i1:
8321 return MVT::v2f64;
8322 case MVT::v4i1:
8323 return MVT::v4i32;
8324 case MVT::v8i1:
8325 return MVT::v8i16;
8326 case MVT::v16i1:
8327 return MVT::v16i8;
8328 default:
8329 llvm_unreachable("Unexpected vector predicate type");
8330 }
8331}
8332
// Promotes an MVE predicate (vNi1) to a full-width integer vector by
// VSELECTing between all-ones and all-zeroes byte vectors under the
// predicate, then bitcasting to the matching full vector type.
// NOTE(review): original line 8333 (the function's opening signature line) is
// missing from this listing.
8334 SelectionDAG &DAG) {
8335 // Converting from boolean predicates to integers involves creating a vector
8336 // of all ones or all zeroes and selecting the lanes based upon the real
8337 // predicate.
// NOTE(review): original line 8338 is missing from this listing -- presumably
// the declaration "SDValue AllOnes =" completing the statement below.
8339 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8340 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8341
8342 SDValue AllZeroes =
8343 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8344 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8345
8346 // Get full vector type from predicate type
// NOTE(review): original line 8347 is missing from this listing -- presumably
// it defines 'NewVT' (used in the final BITCAST), likely via
// getVectorTyFromPredicateVector(VT); confirm against upstream.
8348
8349 SDValue RecastV1;
8350 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8351 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8352 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8353 // since we know in hardware the sizes are really the same.
8354 if (VT != MVT::v16i1)
8355 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8356 else
8357 RecastV1 = Pred;
8358
8359 // Select either all ones or zeroes depending upon the real predicate bits.
8360 SDValue PredAsVector =
8361 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8362
8363 // Recast our new predicate-as-integer v16i8 vector into something
8364 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8365 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8366}
8367
// Lower a VECTOR_SHUFFLE of i1 predicate vectors for MVE by promoting the
// predicates to byte vectors, shuffling those, and comparing against zero to
// reconstitute a predicate.
// NOTE(review): the first signature line (8368) and line 8371 (presumably
// the cast of Op.getNode() to ShuffleVectorSDNode, giving SVN) are missing
// from this extracted view.
8369 const ARMSubtarget *ST) {
8370 EVT VT = Op.getValueType();
8372 ArrayRef<int> ShuffleMask = SVN->getMask();
8373
8374 assert(ST->hasMVEIntegerOps() &&
8375 "No support for vector shuffle of boolean predicates");
8376
8377 SDValue V1 = Op.getOperand(0);
8378 SDValue V2 = Op.getOperand(1);
8379 SDLoc dl(Op);
// Fast path: a full reverse of the predicate can be done on the backing i32
// with a BITREVERSE and a shift, entirely in scalar registers.
8380 if (isReverseMask(ShuffleMask, VT)) {
8381 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8382 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8383 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8384 DAG.getConstant(16, dl, MVT::i32));
8385 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8386 }
8387
8388 // Until we can come up with optimised cases for every single vector
8389 // shuffle in existence we have chosen the least painful strategy. This is
8390 // to essentially promote the boolean predicate to a 8-bit integer, where
8391 // each predicate represents a byte. Then we fall back on a normal integer
8392 // vector shuffle and convert the result back into a predicate vector. In
8393 // many cases the generated code might be even better than scalar code
8394 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8395 // fields in a register into 8 other arbitrary 2-bit fields!
8396 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8397 EVT NewVT = PredAsVector1.getValueType();
8398 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8399 : PromoteMVEPredVector(dl, V2, VT, DAG);
8400 assert(PredAsVector2.getValueType() == NewVT &&
8401 "Expected identical vector type in expanded i1 shuffle!");
8402
8403 // Do the shuffle!
8404 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8405 PredAsVector2, ShuffleMask);
8406
8407 // Now return the result of comparing the shuffled vector with zero,
8408 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8409 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8410 if (VT == MVT::v2i1) {
8411 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8412 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8413 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8414 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8415 }
8416 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8417 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8418}
8419
// Lower a sub-32-bit-element shuffle as up to four whole-register (32-bit
// lane) moves, by viewing the vector as four f32 lanes and rebuilding it
// with an ARMISD::BUILD_VECTOR.
// NOTE(review): the first signature line (8420) is missing from this
// extracted view.
8421 ArrayRef<int> ShuffleMask,
8422 SelectionDAG &DAG) {
8423 // Attempt to lower the vector shuffle using as many whole register movs as
8424 // possible. This is useful for types smaller than 32bits, which would
8425 // often otherwise become a series of GPR movs.
8426 SDLoc dl(Op);
8427 EVT VT = Op.getValueType();
8428 if (VT.getScalarSizeInBits() >= 32)
8429 return SDValue();
8430
8431 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8432 "Unexpected vector type");
8433 int NumElts = VT.getVectorNumElements();
8434 int QuarterSize = NumElts / 4;
8435 // The four final parts of the vector, as i32's
8436 SDValue Parts[4];
8437
8438 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8439 // <u,u,u,u>), returning the vmov lane index
8440 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8441 // Detect which mov lane this would be from the first non-undef element.
8442 int MovIdx = -1;
8443 for (int i = 0; i < Length; i++) {
8444 if (ShuffleMask[Start + i] >= 0) {
8445 if (ShuffleMask[Start + i] % Length != i)
8446 return -1;
8447 MovIdx = ShuffleMask[Start + i] / Length;
8448 break;
8449 }
8450 }
8451 // If all items are undef, leave this for other combines
8452 if (MovIdx == -1)
8453 return -1;
8454 // Check the remaining values are the correct part of the same mov
8455 for (int i = 1; i < Length; i++) {
8456 if (ShuffleMask[Start + i] >= 0 &&
8457 (ShuffleMask[Start + i] / Length != MovIdx ||
8458 ShuffleMask[Start + i] % Length != i))
8459 return -1;
8460 }
8461 return MovIdx;
8462 };
8463
8464 for (int Part = 0; Part < 4; ++Part) {
8465 // Does this part look like a mov
8466 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8467 if (Elt != -1) {
// Lane indices 0-3 come from operand 0, 4-7 from operand 1.
8468 SDValue Input = Op->getOperand(0);
8469 if (Elt >= 4) {
8470 Input = Op->getOperand(1);
8471 Elt -= 4;
8472 }
8473 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8474 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8475 DAG.getConstant(Elt, dl, MVT::i32));
8476 }
8477 }
8478
8479 // Nothing interesting found, just return
8480 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8481 return SDValue();
8482
8483 // The other parts need to be built with the old shuffle vector, cast to a
8484 // v4i32 and extract_vector_elts
8485 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
// Re-shuffle only the lanes not already covered by a whole-register mov;
// covered lanes become undef (-1) in the new mask.
8486 SmallVector<int, 16> NewShuffleMask;
8487 for (int Part = 0; Part < 4; ++Part)
8488 for (int i = 0; i < QuarterSize; i++)
8489 NewShuffleMask.push_back(
8490 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8491 SDValue NewShuffle = DAG.getVectorShuffle(
8492 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8493 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8494
8495 for (int Part = 0; Part < 4; ++Part)
8496 if (!Parts[Part])
8497 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8498 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8499 }
8500 // Build a vector out of the various parts and bitcast it back to the original
8501 // type.
8502 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8503 return DAG.getBitcast(VT, NewVec);
8504}
8505
// Lower a shuffle that is an identity mask from one source with exactly one
// out-of-place element, as a single EXTRACT_VECTOR_ELT + INSERT_VECTOR_ELT
// pair.
// NOTE(review): the first signature line (8506) is missing from this
// extracted view.
8507 ArrayRef<int> ShuffleMask,
8508 SelectionDAG &DAG) {
8509 SDValue V1 = Op.getOperand(0);
8510 SDValue V2 = Op.getOperand(1);
8511 EVT VT = Op.getValueType();
8512 unsigned NumElts = VT.getVectorNumElements();
8513
8514 // A one-off identity mask is one that is mostly an identity mask from a
8515 // single source but contains a single element out-of-place, either from a
8516 // different vector or from another position in the same vector. As opposed to
8517 // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
8518 // pair directly.
8519 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8520 int &OffElement) {
8521 OffElement = -1;
8522 int NonUndef = 0;
8523 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8524 if (Mask[i] == -1)
8525 continue;
8526 NonUndef++;
8527 if (Mask[i] != i + BaseOffset) {
// Remember the first out-of-place element; a second one disqualifies.
8528 if (OffElement == -1)
8529 OffElement = i;
8530 else
8531 return false;
8532 }
8533 }
// Require enough defined lanes for the transform to be worthwhile, and at
// least one off element (otherwise this is a plain identity).
8534 return NonUndef > 2 && OffElement != -1;
8535 };
8536 int OffElement;
8537 SDValue VInput;
// BaseOffset 0 tests an identity from V1; NumElts tests one from V2.
8538 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8539 VInput = V1;
8540 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8541 VInput = V2;
8542 else
8543 return SDValue();
8544
8545 SDLoc dl(Op);
// i8/i16 element extracts go via i32, matching how lane moves work on ARM.
8546 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8547 ? MVT::i32
8548 : VT.getScalarType();
8549 SDValue Elt = DAG.getNode(
8550 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8551 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8552 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8553 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8554 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8555}
8556
// Main VECTOR_SHUFFLE lowering for ARM/NEON/MVE: tries, in order, i1
// predicate shuffles, splats/VDUP, VEXT, VREV, two-result shuffles
// (VZIP/VUZP/VTRN), VMOVN, truncating shuffles, the perfect-shuffle table,
// BUILD_VECTOR expansion for wide elements, reverses, VTBL, and whole
// register moves.
// NOTE(review): several lines are missing from this extracted view — the
// first signature line (8557), line 8563 (presumably the cast producing
// SVN), line 8591 (part of the BUILD_VECTOR-splat condition) and line 8756
// (presumably the declaration of the Ops SmallVector).
8558 const ARMSubtarget *ST) {
8559 SDValue V1 = Op.getOperand(0);
8560 SDValue V2 = Op.getOperand(1);
8561 SDLoc dl(Op);
8562 EVT VT = Op.getValueType();
8564 unsigned EltSize = VT.getScalarSizeInBits();
8565
// Boolean predicate shuffles take a completely separate path on MVE.
8566 if (ST->hasMVEIntegerOps() && EltSize == 1)
8567 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8568
8569 // Convert shuffles that are directly supported on NEON to target-specific
8570 // DAG nodes, instead of keeping them as shuffles and matching them again
8571 // during code selection. This is more efficient and avoids the possibility
8572 // of inconsistencies between legalization and selection.
8573 // FIXME: floating-point vectors should be canonicalized to integer vectors
8574 // of the same type so that they get CSEd properly.
8575 ArrayRef<int> ShuffleMask = SVN->getMask();
8576
8577 if (EltSize <= 32) {
8578 if (SVN->isSplat()) {
8579 int Lane = SVN->getSplatIndex();
8580 // If this is undef splat, generate it via "just" vdup, if possible.
8581 if (Lane == -1) Lane = 0;
8582
8583 // Test if V1 is a SCALAR_TO_VECTOR.
8584 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8585 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8586 }
8587 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8588 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8589 // reaches it).
8590 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8592 bool IsScalarToVector = true;
8593 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8594 if (!V1.getOperand(i).isUndef()) {
8595 IsScalarToVector = false;
8596 break;
8597 }
8598 if (IsScalarToVector)
8599 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8600 }
8601 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8602 DAG.getConstant(Lane, dl, MVT::i32));
8603 }
8604
8605 bool ReverseVEXT = false;
8606 unsigned Imm = 0;
8607 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8608 if (ReverseVEXT)
8609 std::swap(V1, V2);
8610 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8611 DAG.getConstant(Imm, dl, MVT::i32));
8612 }
8613
8614 if (isVREVMask(ShuffleMask, VT, 64))
8615 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8616 if (isVREVMask(ShuffleMask, VT, 32))
8617 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8618 if (isVREVMask(ShuffleMask, VT, 16))
8619 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8620
8621 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8622 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8623 DAG.getConstant(Imm, dl, MVT::i32));
8624 }
8625
8626 // Check for Neon shuffles that modify both input vectors in place.
8627 // If both results are used, i.e., if there are two shuffles with the same
8628 // source operands and with masks corresponding to both results of one of
8629 // these operations, DAG memoization will ensure that a single node is
8630 // used for both shuffles.
8631 unsigned WhichResult = 0;
8632 bool isV_UNDEF = false;
8633 if (ST->hasNEON()) {
8634 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8635 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8636 if (isV_UNDEF)
8637 V2 = V1;
8638 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8639 .getValue(WhichResult);
8640 }
8641 }
// MVE VMOVN: a narrowing move that interleaves lanes of the two sources
// into the top or bottom half-lanes of the result.
8642 if (ST->hasMVEIntegerOps()) {
8643 if (isVMOVNMask(ShuffleMask, VT, false, false))
8644 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8645 DAG.getConstant(0, dl, MVT::i32));
8646 if (isVMOVNMask(ShuffleMask, VT, true, false))
8647 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8648 DAG.getConstant(1, dl, MVT::i32));
8649 if (isVMOVNMask(ShuffleMask, VT, true, true))
8650 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8651 DAG.getConstant(1, dl, MVT::i32));
8652 }
8653
8654 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8655 // shuffles that produce a result larger than their operands with:
8656 // shuffle(concat(v1, undef), concat(v2, undef))
8657 // ->
8658 // shuffle(concat(v1, v2), undef)
8659 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8660 //
8661 // This is useful in the general case, but there are special cases where
8662 // native shuffles produce larger results: the two-result ops.
8663 //
8664 // Look through the concat when lowering them:
8665 // shuffle(concat(v1, v2), undef)
8666 // ->
8667 // concat(VZIP(v1, v2):0, :1)
8668 //
8669 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8670 SDValue SubV1 = V1->getOperand(0);
8671 SDValue SubV2 = V1->getOperand(1);
8672 EVT SubVT = SubV1.getValueType();
8673
8674 // We expect these to have been canonicalized to -1.
8675 assert(llvm::all_of(ShuffleMask, [&](int i) {
8676 return i < (int)VT.getVectorNumElements();
8677 }) && "Unexpected shuffle index into UNDEF operand!");
8678
8679 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8680 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8681 if (isV_UNDEF)
8682 SubV2 = SubV1;
8683 assert((WhichResult == 0) &&
8684 "In-place shuffle of concat can only have one result!");
8685 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8686 SubV1, SubV2);
8687 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8688 Res.getValue(1));
8689 }
8690 }
8691 }
8692
8693 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8694 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8695 return V;
8696
// Recognise truncating shuffles and lower them via MVETRUNC, optionally
// shifting first to select the top half of each wider lane.
8697 for (bool Top : {false, true}) {
8698 for (bool SingleSource : {false, true}) {
8699 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8700 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8701 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8702 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8703 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8704 SingleSource ? V1 : V2);
8705 if (Top) {
8706 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8707 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8708 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8709 }
8710 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8711 }
8712 }
8713 }
8714 }
8715
8716 // If the shuffle is not directly supported and it has 4 elements, use
8717 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8718 unsigned NumElts = VT.getVectorNumElements();
8719 if (NumElts == 4) {
8720 unsigned PFIndexes[4];
8721 for (unsigned i = 0; i != 4; ++i) {
// 8 encodes an undef lane in the perfect-shuffle table's base-9 index.
8722 if (ShuffleMask[i] < 0)
8723 PFIndexes[i] = 8;
8724 else
8725 PFIndexes[i] = ShuffleMask[i];
8726 }
8727
8728 // Compute the index in the perfect shuffle table.
8729 unsigned PFTableIndex =
8730 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8731 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8732 unsigned Cost = (PFEntry >> 30);
8733
8734 if (Cost <= 4) {
8735 if (ST->hasNEON())
8736 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8737 else if (isLegalMVEShuffleOp(PFEntry)) {
// On MVE, only use the table entry if both sub-shuffles it expands to
// are themselves legal MVE operations.
8738 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8739 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8740 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8741 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8742 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8743 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8744 }
8745 }
8746 }
8747
8748 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8749 if (EltSize >= 32) {
8750 // Do the expansion with floating-point types, since that is what the VFP
8751 // registers are defined to use, and since i64 is not legal.
8752 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8753 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8754 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8755 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8757 for (unsigned i = 0; i < NumElts; ++i) {
8758 if (ShuffleMask[i] < 0)
8759 Ops.push_back(DAG.getUNDEF(EltVT));
8760 else
8761 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8762 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8763 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8764 dl, MVT::i32)));
8765 }
8766 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8767 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8768 }
8769
8770 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8771 isReverseMask(ShuffleMask, VT))
8772 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8773
8774 if (ST->hasNEON() && VT == MVT::v8i8)
8775 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8776 return NewOp;
8777
8778 if (ST->hasMVEIntegerOps())
8779 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8780 return NewOp;
8781
8782 return SDValue();
8783}
8784
// Lower INSERT_VECTOR_ELT into an MVE i1 predicate vector by viewing the
// predicate as an i32, sign-extending the inserted bit, and BFI-ing it into
// the right lane position.
// NOTE(review): the first signature line (8785) and line 8797 (presumably
// computing LaneWidth from the predicate's lane count) are missing from this
// extracted view.
8786 const ARMSubtarget *ST) {
8787 EVT VecVT = Op.getOperand(0).getValueType();
8788 SDLoc dl(Op);
8789
8790 assert(ST->hasMVEIntegerOps() &&
8791 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8792
8793 SDValue Conv =
8794 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8795 unsigned Lane = Op.getConstantOperandVal(2);
8796 unsigned LaneWidth =
8798 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
// Sign-extend the inserted i1 so it becomes all-ones/all-zeroes bits, then
// insert those bits into the predicate word with BFI.
8799 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8800 Op.getOperand(1), DAG.getValueType(MVT::i1));
8801 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8802 DAG.getConstant(~Mask, dl, MVT::i32));
8803 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8804}
8805
// Lower INSERT_VECTOR_ELT: i1 predicates go to the MVE-specific helper,
// f16-like elements that would be promoted are reinterpreted as integer
// inserts, and everything else is already legal.
// NOTE(review): lines 8821 and 8831 (the TypePromoteInteger halves of the
// getTypeAction comparisons) are missing from this extracted view.
8806SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8807 SelectionDAG &DAG) const {
8808 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8809 SDValue Lane = Op.getOperand(2);
8810 if (!isa<ConstantSDNode>(Lane))
8811 return SDValue();
8812
8813 SDValue Elt = Op.getOperand(1);
8814 EVT EltVT = Elt.getValueType();
8815
8816 if (Subtarget->hasMVEIntegerOps() &&
8817 Op.getValueType().getScalarSizeInBits() == 1)
8818 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8819
8820 if (getTypeAction(*DAG.getContext(), EltVT) ==
8822 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8823 // but the type system will try to do that if we don't intervene.
8824 // Reinterpret any such vector-element insertion as one with the
8825 // corresponding integer types.
8826
8827 SDLoc dl(Op);
8828
8829 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8830 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8832
8833 SDValue VecIn = Op.getOperand(0);
8834 EVT VecVT = VecIn.getValueType();
8835 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8836 VecVT.getVectorNumElements());
8837
// Bitcast element and vector to integer types, insert, and cast back.
8838 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8839 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8840 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8841 IVecIn, IElt, Lane);
8842 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8843 }
8844
8845 return Op;
8846}
8847
// Lower EXTRACT_VECTOR_ELT from an MVE i1 predicate vector by viewing the
// predicate as an i32 and shifting the wanted lane's bits down to bit 0.
// NOTE(review): the first signature line (8848) and line 8860 (presumably
// computing LaneWidth from the predicate's lane count) are missing from this
// extracted view. The assert message below still says "LowerINSERT_..." —
// likely a copy/paste slip in the upstream source.
8849 const ARMSubtarget *ST) {
8850 EVT VecVT = Op.getOperand(0).getValueType();
8851 SDLoc dl(Op);
8852
8853 assert(ST->hasMVEIntegerOps() &&
8854 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8855
8856 SDValue Conv =
8857 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8858 unsigned Lane = Op.getConstantOperandVal(1);
8859 unsigned LaneWidth =
8861 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8862 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8863 return Shift;
8864}
8865
// Lower EXTRACT_VECTOR_ELT: i1 predicates go to the MVE helper; sub-32-bit
// element extracts to i32 use VGETLANEu; the rest is already legal.
// NOTE(review): the first signature line (8866) is missing from this
// extracted view.
8867 const ARMSubtarget *ST) {
8868 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8869 SDValue Lane = Op.getOperand(1);
8870 if (!isa<ConstantSDNode>(Lane))
8871 return SDValue();
8872
8873 SDValue Vec = Op.getOperand(0);
8874 EVT VT = Vec.getValueType();
8875
8876 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8877 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8878
// Narrow lanes extracted to i32: use the unsigned get-lane node so the
// upper bits are zeroed rather than left undefined.
8879 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8880 SDLoc dl(Op);
8881 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8882 }
8883
8884 return Op;
8885}
8886
// Lower CONCAT_VECTORS of MVE i1 predicate vectors: promote each operand
// pair to integer vectors, merge them (via MVETRUNC or per-element
// extract/insert), and compare against zero to recreate a predicate.
// Operands are combined pairwise until a single vector remains.
// NOTE(review): the first signature line (8887) and line 8911 (presumably
// computing ElType from the promoted vector's element type) are missing
// from this extracted view.
8888 const ARMSubtarget *ST) {
8889 SDLoc dl(Op);
8890 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8891 "Unexpected custom CONCAT_VECTORS lowering");
8892 assert(isPowerOf2_32(Op.getNumOperands()) &&
8893 "Unexpected custom CONCAT_VECTORS lowering");
8894 assert(ST->hasMVEIntegerOps() &&
8895 "CONCAT_VECTORS lowering only supported for MVE");
8896
8897 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8898 EVT Op1VT = V1.getValueType();
8899 EVT Op2VT = V2.getValueType();
8900 assert(Op1VT == Op2VT && "Operand types don't match!");
8901 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
8902 "Unexpected i1 concat operations!");
8903 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8904
8905 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8906 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8907
8908 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8909 // promoted to v8i16, etc.
8910 MVT ElType =
8912 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8913
8914 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8915 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
8916 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
8917 // ConcatVT.
8918 SDValue ConVec =
8919 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
8920 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8921 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8922 }
8923
8924 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8925 // to be the right size for the destination. For example, if Op1 is v4i1
8926 // then the promoted vector is v4i32. The result of concatenation gives a
8927 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8928 // needs truncating to i16 and inserting in the result.
8929 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8930 EVT NewVT = NewV.getValueType();
8931 EVT ConcatVT = ConVec.getValueType();
8932 unsigned ExtScale = 1;
// v2f64 (the v2i1 promotion) is viewed as v4i32 and sampled every other
// i32 lane, hence the extraction scale of 2.
8933 if (NewVT == MVT::v2f64) {
8934 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
8935 ExtScale = 2;
8936 }
8937 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8938 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8939 DAG.getIntPtrConstant(i * ExtScale, dl));
8940 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8941 DAG.getConstant(j, dl, MVT::i32));
8942 }
8943 return ConVec;
8944 };
8945 unsigned j = 0;
8946 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8947 ConVec = ExtractInto(NewV1, ConVec, j);
8948 ConVec = ExtractInto(NewV2, ConVec, j);
8949
8950 // Now return the result of comparing the subvector with zero, which will
8951 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8952 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8953 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8954 };
8955
8956 // Concat each pair of subvectors and pack into the lower half of the array.
8957 SmallVector<SDValue> ConcatOps(Op->ops());
8958 while (ConcatOps.size() > 1) {
8959 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8960 SDValue V1 = ConcatOps[I];
8961 SDValue V2 = ConcatOps[I + 1];
8962 ConcatOps[I / 2] = ConcatPair(V1, V2);
8963 }
8964 ConcatOps.resize(ConcatOps.size() / 2);
8965 }
8966 return ConcatOps[0];
8967}
8968
// Lower CONCAT_VECTORS: i1 predicates go to the MVE-specific helper; the
// only other legal case is two 64-bit vectors concatenated into a 128-bit
// vector, done by inserting each half into a v2f64 as an f64.
// NOTE(review): the first signature line (8969) is missing from this
// extracted view.
8970 const ARMSubtarget *ST) {
8971 EVT VT = Op->getValueType(0);
8972 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8973 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8974
8975 // The only time a CONCAT_VECTORS operation can have legal types is when
8976 // two 64-bit vectors are concatenated to a 128-bit vector.
8977 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8978 "unexpected CONCAT_VECTORS");
8979 SDLoc dl(Op);
8980 SDValue Val = DAG.getUNDEF(MVT::v2f64);
8981 SDValue Op0 = Op.getOperand(0);
8982 SDValue Op1 = Op.getOperand(1);
// Undef halves are skipped, leaving the corresponding f64 lane undef.
8983 if (!Op0.isUndef())
8984 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8985 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8986 DAG.getIntPtrConstant(0, dl));
8987 if (!Op1.isUndef())
8988 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8989 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8990 DAG.getIntPtrConstant(1, dl));
8991 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8992}
8993
// Lower EXTRACT_SUBVECTOR of an MVE i1 predicate vector: promote the source
// predicate to an integer vector, extract the wanted lanes element by
// element into a narrower vector, and compare against zero to recreate a
// predicate of the subvector type.
// NOTE(review): the first signature line (8994) and line 9014 (presumably
// computing ElType from the promoted vector's element type) are missing
// from this extracted view.
8995 const ARMSubtarget *ST) {
8996 SDValue V1 = Op.getOperand(0);
8997 SDValue V2 = Op.getOperand(1);
8998 SDLoc dl(Op);
8999 EVT VT = Op.getValueType();
9000 EVT Op1VT = V1.getValueType();
9001 unsigned NumElts = VT.getVectorNumElements();
// Operand 1 is the constant start index of the subvector.
9002 unsigned Index = V2->getAsZExtVal();
9003
9004 assert(VT.getScalarSizeInBits() == 1 &&
9005 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9006 assert(ST->hasMVEIntegerOps() &&
9007 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9008
9009 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9010
9011 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9012 // promoted to v8i16, etc.
9013
9015
// v2i1 result: build a v4i32 with each extracted element duplicated into
// adjacent lanes (both i32 halves of the i64 lane), compare, then cast.
9016 if (NumElts == 2) {
9017 EVT SubVT = MVT::v4i32;
9018 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9019 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9020 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9021 DAG.getIntPtrConstant(i, dl));
9022 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9023 DAG.getConstant(j, dl, MVT::i32));
9024 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9025 DAG.getConstant(j + 1, dl, MVT::i32));
9026 }
9027 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9028 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9029 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9030 }
9031
9032 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9033 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9034 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9035 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9036 DAG.getIntPtrConstant(i, dl));
9037 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9038 DAG.getConstant(j, dl, MVT::i32));
9039 }
9040
9041 // Now return the result of comparing the subvector with zero,
9042 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9043 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9044 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9045}
9046
9047// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
// Only the low bit of each source lane is significant for an i1 truncate,
// so mask it off and compare the result against zero with SETNE.
// NOTE(review): the first signature line (9048) is missing from this
// extracted view.
9049 const ARMSubtarget *ST) {
9050 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9051 EVT VT = N->getValueType(0);
9052 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9053 "Expected a vector i1 type!");
9054 SDValue Op = N->getOperand(0);
9055 EVT FromVT = Op.getValueType();
9056 SDLoc DL(N);
9057
9058 SDValue And =
9059 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9060 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9061 DAG.getCondCode(ISD::SETNE));
9062}
9063
// Lower vector TRUNCATE on MVE: i1 results go to LowerTruncatei1; a
// v8i32->v8i16 or v16i16->v16i8 truncate is split and turned into an
// ARMISD::MVETRUNC, to be optimised later or lowered via the stack.
// NOTE(review): the first signature line (9064) is missing from this
// extracted view.
9065 const ARMSubtarget *Subtarget) {
9066 if (!Subtarget->hasMVEIntegerOps())
9067 return SDValue();
9068
9069 EVT ToVT = N->getValueType(0);
9070 if (ToVT.getScalarType() == MVT::i1)
9071 return LowerTruncatei1(N, DAG, Subtarget);
9072
9073 // MVE does not have a single instruction to perform the truncation of a v4i32
9074 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9075 // Most of the instructions in MVE follow the 'Beats' system, where moving
9076 // values from different lanes is usually something that the instructions
9077 // avoid.
9078 //
9079 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9080 // which take the top/bottom half of a larger lane and extend it (or do the
9081 // opposite, truncating into the top/bottom lane from a larger lane). Note
9082 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9083 // bottom 16bits from each vector lane. This works really well with T/B
9084 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9085 // to move order.
9086 //
9087 // But truncates and sext/zext are always going to be fairly common from llvm.
9088 // We have several options for how to deal with them:
9089 // - Wherever possible combine them into an instruction that makes them
9090 // "free". This includes loads/stores, which can perform the trunc as part
9091 // of the memory operation. Or certain shuffles that can be turned into
9092 // VMOVN/VMOVL.
9093 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9094 // trunc(mul(sext(a), sext(b))) may become
9095 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9096 // this case can use VMULL). This is performed in the
9097 // MVELaneInterleavingPass.
9098 // - Otherwise we have an option. By default we would expand the
9099 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9100 // registers. One for each vector lane in the vector. This can obviously be
9101 // very expensive.
9102 // - The other option is to use the fact that loads/store can extend/truncate
9103 // to turn a trunc into two truncating stack stores and a stack reload. This
9104 // becomes 3 back-to-back memory operations, but at least that is less than
9105 // all the insert/extracts.
9106 //
9107 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9108 // are either optimized where they can be, or eventually lowered into stack
9109 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9110 // too early, where other instructions would be better, and stops us from
9111 // having to reconstruct multiple buildvector shuffles into loads/stores.
9112 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9113 return SDValue();
9114 EVT FromVT = N->getOperand(0).getValueType();
9115 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9116 return SDValue();
9117
9118 SDValue Lo, Hi;
9119 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9120 SDLoc DL(N);
9121 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9122}
9123
// Lower vector sign/zero extends on MVE via the two-result MVESEXT/MVEZEXT
// nodes, each producing the low and high extended halves; i8->i32 extends go
// through i16 in two steps. See LowerTruncate above for the background on
// MVEEXT/MVETRUNC.
// NOTE(review): the first signature line (9124) and line 9145 (presumably
// selecting ARMISD::MVESEXT vs MVEZEXT from N's opcode) are missing from
// this extracted view.
9125 const ARMSubtarget *Subtarget) {
9126 if (!Subtarget->hasMVEIntegerOps())
9127 return SDValue();
9128
9129 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9130
9131 EVT ToVT = N->getValueType(0);
9132 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9133 return SDValue();
9134 SDValue Op = N->getOperand(0);
9135 EVT FromVT = Op.getValueType();
9136 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9137 return SDValue();
9138
9139 SDLoc DL(N);
9140 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9141 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9142 ExtVT = MVT::v8i16;
9143
9144 unsigned Opcode =
9146 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9147 SDValue Ext1 = Ext.getValue(1);
9148
// i8 -> i32 needs a second extension step, i16 -> i32, on each half.
9149 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9150 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9151 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9152 }
9153
9154 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9155}
9156
/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
/// element has been zero/sign-extended, depending on the isSigned parameter,
/// from an integer type half its size.
                                   bool isSigned) {
  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
  EVT VT = N->getValueType(0);
  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
    if (BVN->getValueType(0) != MVT::v4i32 ||
        BVN->getOpcode() != ISD::BUILD_VECTOR)
      return false;
    unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    unsigned HiElt = 1 - LoElt;
    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
      return false;
    if (isSigned) {
      // Sign-extended 64-bit elements: the high 32 bits must equal the sign
      // bits of the low 32 bits for both elements.
      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
        return true;
    } else {
      // Zero-extended: both high halves must be zero.
      if (Hi0->isZero() && Hi1->isZero())
        return true;
    }
    return false;
  }

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  // For a plain BUILD_VECTOR, every constant element must fit in half the
  // element width, interpreted as signed or unsigned as requested.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDNode *Elt = N->getOperand(i).getNode();
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
      continue;
    }
    return false;
  }

  return true;
}
9210
/// isSignExtended - Check if a node is a vector value that is sign-extended
/// or a constant BUILD_VECTOR with sign-extended elements. Sign-extending
/// loads also count as sign-extended values.
  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, true))
    return true;
  return false;
}
9220
/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
  if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
    return true;
  if (isExtendedBUILD_VECTOR(N, DAG, false))
    return true;
  return false;
}
9231
9232static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9233 if (OrigVT.getSizeInBits() >= 64)
9234 return OrigVT;
9235
9236 assert(OrigVT.isSimple() && "Expecting a simple value type");
9237
9238 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9239 switch (OrigSimpleTy) {
9240 default: llvm_unreachable("Unexpected Vector Type");
9241 case MVT::v2i8:
9242 case MVT::v2i16:
9243 return MVT::v2i32;
9244 case MVT::v4i8:
9245 return MVT::v4i16;
9246 }
9247}
9248
/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D register.
                                           const EVT &OrigTy,
                                           const EVT &ExtTy,
                                           unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  // Re-extend with the same extension kind (ExtOpcode) the caller requested.
  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
9268
/// SkipLoadExtensionForVMULL - return a load of the original vector size that
/// does not do any sign/zero extension. If the original vector is less
/// than 64 bits, an appropriate extension will be added after the load to
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());

  // The load already has the right type.
  if (ExtendedTy == LD->getMemoryVT())
    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                       LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
                       LD->getMemOperand()->getFlags());

  // We need to create a zextload/sextload. We cannot just create a load
  // followed by a zext/sext node because LowerMUL is also run during normal
  // operation legalization where we can't create illegal types.
  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                        LD->getMemoryVT(), LD->getAlign(),
                        LD->getMemOperand()->getFlags());
}
9291
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
/// the unextended value. The unextended vector should be 64 bits so that it can
/// be used as an operand to a VMULL instruction. If the original vector size
/// before extension is less than 64 bits we add an extension to resize
/// the vector to 64 bits.
  if (N->getOpcode() == ISD::SIGN_EXTEND ||
      N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
                                        N->getOperand(0)->getValueType(0),
                                        N->getValueType(0),
                                        N->getOpcode());

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
           "Expected extending load");

    // Replace the extending load with a plain load plus an explicit extend,
    // rewiring both the value and the chain users of the old load.
    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue extLoad =
        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);

    return newLoad;
  }

  // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
  // have been legalized as a BITCAST from v4i32.
  if (N->getOpcode() == ISD::BITCAST) {
    SDNode *BVN = N->getOperand(0).getNode();
           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
    unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
    // Keep only the low 32 bits of each 64-bit element.
    return DAG.getBuildVector(
        MVT::v2i32, SDLoc(N),
        {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
  }
  // Construct a new BUILD_VECTOR with elements truncated to half the size.
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SDLoc dl(N);
  for (unsigned i = 0; i != NumElts; ++i) {
    const APInt &CInt = N->getConstantOperandAPInt(i);
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
9347
9348static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9349 unsigned Opcode = N->getOpcode();
9350 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9351 SDNode *N0 = N->getOperand(0).getNode();
9352 SDNode *N1 = N->getOperand(1).getNode();
9353 return N0->hasOneUse() && N1->hasOneUse() &&
9354 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9355 }
9356 return false;
9357}
9358
9359static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9360 unsigned Opcode = N->getOpcode();
9361 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9362 SDNode *N0 = N->getOperand(0).getNode();
9363 SDNode *N1 = N->getOperand(1).getNode();
9364 return N0->hasOneUse() && N1->hasOneUse() &&
9365 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9366 }
9367 return false;
9368}
9369
  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  EVT VT = Op.getValueType();
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  unsigned NewOpc = 0;
  bool isMLA = false;
  bool isN0SExt = isSignExtended(N0, DAG);
  bool isN1SExt = isSignExtended(N1, DAG);
  // Both operands sign-extended: a signed VMULL computes the full product.
  if (isN0SExt && isN1SExt)
    NewOpc = ARMISD::VMULLs;
  else {
    bool isN0ZExt = isZeroExtended(N0, DAG);
    bool isN1ZExt = isZeroExtended(N1, DAG);
    // Both operands zero-extended: use the unsigned VMULL.
    if (isN0ZExt && isN1ZExt)
      NewOpc = ARMISD::VMULLu;
    else if (isN1SExt || isN1ZExt) {
      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
      if (isN1SExt && isAddSubSExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLs;
        isMLA = true;
      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
        // Canonicalize so the add/sub ends up in N0.
        std::swap(N0, N1);
        NewOpc = ARMISD::VMULLu;
        isMLA = true;
      }
    }

    if (!NewOpc) {
      if (VT == MVT::v2i64)
        // Fall through to expand this. It is not legal.
        return SDValue();
      else
        // Other vector multiplications are legal.
        return Op;
    }
  }

  // Legalize to a VMULL instruction.
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
  if (!isMLA) {
    Op0 = SkipExtensionForVMULL(N0, DAG);
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }

  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
  // isel lowering to take advantage of no-stall back to back vmul + vmla.
  // vmull q0, d4, d6
  // vmlal q0, d5, d6
  // is faster than
  // vaddl q0, d4, d5
  // vmovl q1, d6
  // vmul q0, q0, q1
  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  // Distribute the VMULL over the add/sub (N0's opcode).
  return DAG.getNode(N0->getOpcode(), DL, VT,
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                     DAG.getNode(NewOpc, DL, VT,
                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
9444
// Lower a 4-lane i8 signed division (inputs already widened to v4i16) by
// converting both sides to float and multiplying by a reciprocal estimate of
// the divisor; the small i8 range makes a single estimate (plus a bias)
// sufficient.
                             SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  // Convert to float
  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
  // Get reciprocal estimate.
  // float4 recip = vrecpeq_f32(yf);
  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                  DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                  Y);
  // Because char has a smaller range than uchar, we can actually get away
  // without any newton steps. This requires that we use a weird bias
  // of 0xb000, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
  Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
  // Convert back to short.
  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
  return X;
}
9475
// Lower a v4i16 signed division via float conversion, a reciprocal estimate
// and one Newton refinement step (enough for the i16 range, with a small
// bias), truncating the result back to v4i16.
                              SelectionDAG &DAG) {
  // TODO: Should this propagate fast-math-flags?

  SDValue N2;
  // Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and one refinement step.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   N1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   N1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Because short has a smaller range than ushort, we can actually get away
  // with only a single newton step. This requires that we use a weird bias
  // of 89, however (again, this has been exhaustively tested).
  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_s32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
9514
// Custom lowering for ISD::SDIV on v4i16/v8i8. The v8i8 case sign-extends to
// v8i16, splits into two v4i16 halves, divides each with LowerSDIV_v4i8 and
// concatenates/truncates the result back down.
                         const ARMSubtarget *ST) {
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::SDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);

    // Split each widened operand into its high (N2/N3) and low (N0/N1) halves.
    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
    return N0;
  }
  return LowerSDIV_v4i16(N0, N1, dl, DAG);
}
9550
// Custom lowering for ISD::UDIV on v4i16/v8i8 via float reciprocal estimates.
// The v8i8 case zero-extends to v8i16 and reuses the signed v4i16 helper.
                         const ARMSubtarget *ST) {
  // TODO: Should this propagate fast-math-flags?
  EVT VT = Op.getValueType();
  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
         "unexpected type for custom-lowering ISD::UDIV");

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2, N3;

  if (VT == MVT::v8i8) {
    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);

    // Split each widened operand into its high (N2/N3) and low (N0/N1) halves.
    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(4, dl));
    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(4, dl));
    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                     DAG.getIntPtrConstant(0, dl));
    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                     DAG.getIntPtrConstant(0, dl));

    // The lanes were zero-extended from i8 so they are in [0, 255], which is
    // within the positive signed i16 range; the signed helper therefore
    // produces the correct unsigned quotient here.
    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16

    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
    N0 = LowerCONCAT_VECTORS(N0, DAG, ST);

    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                     MVT::i32),
                     N0);
    return N0;
  }

  // v4i16 sdiv ... Convert to float.
  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);

  // Use reciprocal estimate and two refinement steps.
  // float4 recip = vrecpeq_f32(yf);
  // recip *= vrecpsq_f32(yf, recip);
  // recip *= vrecpsq_f32(yf, recip);
  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                   BN1);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                   DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                   BN1, N2);
  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
  // Simply multiplying by the reciprocal estimate can leave us a few ulps
  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
  // and that it will never cause us to return an answer too large).
  // float4 result = as_float4(as_int4(xf*recip) + 2);
  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
  N1 = DAG.getConstant(2, dl, MVT::v4i32);
  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
  // Convert back to integer and return.
  // return vmovn_u32(vcvt_s32_f32(result));
  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
  return N0;
}
9627
// Lower a carry-using add/sub to the corresponding ARMISD node (Opcode),
// converting the incoming carry value into the flags representation and the
// produced flag back into a value of the second result type.
                                 unsigned Opcode, bool IsSigned) {
  EVT VT0 = Op.getValue(0).getValueType();
  EVT VT1 = Op.getValue(1).getValueType();

  // ARM subtraction consumes an inverted borrow, so SUBE inverts the carry.
  bool InvertCarry = Opcode == ARMISD::SUBE;
  SDValue OpLHS = Op.getOperand(0);
  SDValue OpRHS = Op.getOperand(1);
  SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);

  SDLoc DL(Op);

  SDValue Result = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::i32), OpLHS,
                               OpRHS, OpCarryIn);

  // Signed nodes report overflow; unsigned nodes report carry (inverted
  // again for subtraction).
  SDValue OutFlag =
      IsSigned ? overflowFlagToValue(Result.getValue(1), VT1, DAG)
               : carryFlagToValue(Result.getValue(1), VT1, DAG, InvertCarry);

  return DAG.getMergeValues({Result, OutFlag}, DL);
}
9649
// Lower an i32/i64 division into a call to the Windows divrem runtime helper,
// chained after the supplied Chain (typically a divide-by-zero check).
SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                  bool Signed,
                                                  SDValue &Chain) const {
  EVT VT = Op.getValueType();
  assert((VT == MVT::i32 || VT == MVT::i64) &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  const auto &DL = DAG.getDataLayout();
  RTLIB::Libcall LC;
  if (Signed)
    LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
  else
    LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;

  RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
  SDValue ES = DAG.getExternalSymbol(LCImpl, getPointerTy(DL));

  // Arguments are pushed in {1, 0} order, i.e. divisor before dividend —
  // presumably to match the Windows RT divide helpers' argument order;
  // confirm against the __rt_*div signatures before changing.
  for (auto AI : {1, 0}) {
    SDValue Operand = Op.getOperand(AI);
    Args.emplace_back(Operand,
                      Operand.getValueType().getTypeForEVT(*DAG.getContext()));
  }

  CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setCallee(
      VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args));

  return LowerCallTo(CLI).first;
}
9683
9684// This is a code size optimisation: return the original SDIV node to
9685// DAGCombiner when we don't want to expand SDIV into a sequence of
9686// instructions, and an empty node otherwise which will cause the
9687// SDIV to be expanded in DAGCombine.
9688SDValue
9689ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9690 SelectionDAG &DAG,
9691 SmallVectorImpl<SDNode *> &Created) const {
9692 // TODO: Support SREM
9693 if (N->getOpcode() != ISD::SDIV)
9694 return SDValue();
9695
9696 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9697 const bool MinSize = ST.hasMinSize();
9698 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9699 : ST.hasDivideInARMMode();
9700
9701 // Don't touch vector types; rewriting this may lead to scalarizing
9702 // the int divs.
9703 if (N->getOperand(0).getValueType().isVector())
9704 return SDValue();
9705
9706 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9707 // hwdiv support for this to be really profitable.
9708 if (!(MinSize && HasDivide))
9709 return SDValue();
9710
9711 // ARM mode is a bit simpler than Thumb: we can handle large power
9712 // of 2 immediates with 1 mov instruction; no further checks required,
9713 // just return the sdiv node.
9714 if (!ST.isThumb())
9715 return SDValue(N, 0);
9716
9717 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9718 // and thus lose the code size benefits of a MOVS that requires only 2.
9719 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9720 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9721 if (Divisor.sgt(128))
9722 return SDValue();
9723
9724 return SDValue(N, 0);
9725}
9726
9727SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9728 bool Signed) const {
9729 assert(Op.getValueType() == MVT::i32 &&
9730 "unexpected type for custom lowering DIV");
9731 SDLoc dl(Op);
9732
9733 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9734 DAG.getEntryNode(), Op.getOperand(1));
9735
9736 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9737}
9738
// Emit an ARMISD::WIN__DBZCHK divide-by-zero check on the denominator
// (operand 1 of N), chained after InChain. For an i64 denominator the two
// 32-bit halves are OR'd together so the check fires only when the whole
// value is zero.
  SDLoc DL(N);
  SDValue Op = N->getOperand(1);
  if (N->getValueType(0) == MVT::i32)
    return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
  return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
                     DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
}
9749
// Expand an i64 division on Windows into a divide-by-zero check followed by
// the divrem libcall, then rebuild the i64 quotient from the two 32-bit
// halves of the libcall result.
void ARMTargetLowering::ExpandDIV_Windows(
    SDValue Op, SelectionDAG &DAG, bool Signed,
  const auto &DL = DAG.getDataLayout();

  assert(Op.getValueType() == MVT::i64 &&
         "unexpected type for custom lowering DIV");
  SDLoc dl(Op);

  SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());

  SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);

  // Reassemble the i64 result from its low and high 32-bit halves.
  SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
  SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
                              DAG.getConstant(32, dl, getPointerTy(DL)));
  Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
}
9770
// Returns {value, chain} when a libcall or an unaligned expansion was
// emitted, or a pair of empty SDValues when the default lowering applies.
std::pair<SDValue, SDValue>
ARMTargetLowering::LowerAEABIUnalignedLoad(SDValue Op,
                                           SelectionDAG &DAG) const {
  // If we have an unaligned load from a i32 or i64 that would normally be
  // split into separate ldrb's, we can use the __aeabi_uread4/__aeabi_uread8
  // functions instead.
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  if (MemVT != MVT::i32 && MemVT != MVT::i64)
    return std::make_pair(SDValue(), SDValue());

  const auto &MF = DAG.getMachineFunction();
  unsigned AS = LD->getAddressSpace();
  Align Alignment = LD->getAlign();
  const DataLayout &DL = DAG.getDataLayout();
  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();

  // Only use the libcall when optimising for size, the target cannot do the
  // unaligned access natively and the alignment is low enough that byte
  // splitting would otherwise occur.
  if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
      Alignment <= llvm::Align(2)) {

    RTLIB::Libcall LC =
        (MemVT == MVT::i32) ? RTLIB::AEABI_UREAD4 : RTLIB::AEABI_UREAD8;

    MakeLibCallOptions Opts;
    SDLoc dl(Op);

    auto Pair = makeLibCall(DAG, LC, MemVT.getSimpleVT(), LD->getBasePtr(),
                            Opts, dl, LD->getChain());

    // If necessary, extend the node to 64bit
    if (LD->getExtensionType() != ISD::NON_EXTLOAD) {
      unsigned ExtType = LD->getExtensionType() == ISD::SEXTLOAD
      SDValue EN = DAG.getNode(ExtType, dl, LD->getValueType(0), Pair.first);
      Pair.first = EN;
    }
    return Pair;
  }

  // Default expand to individual loads
  if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
    return expandUnalignedLoad(LD, DAG);
  return std::make_pair(SDValue(), SDValue());
}
9816
9817SDValue ARMTargetLowering::LowerAEABIUnalignedStore(SDValue Op,
9818 SelectionDAG &DAG) const {
9819 // If we have an unaligned store to a i32 or i64 that would normally be
9820 // split into separate ldrb's, we can use the __aeabi_uwrite4/__aeabi_uwrite8
9821 // functions instead.
9822 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9823 EVT MemVT = ST->getMemoryVT();
9824 if (MemVT != MVT::i32 && MemVT != MVT::i64)
9825 return SDValue();
9826
9827 const auto &MF = DAG.getMachineFunction();
9828 unsigned AS = ST->getAddressSpace();
9829 Align Alignment = ST->getAlign();
9830 const DataLayout &DL = DAG.getDataLayout();
9831 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9832
9833 if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
9834 Alignment <= llvm::Align(2)) {
9835
9836 SDLoc dl(Op);
9837
9838 // If necessary, trunc the value to 32bit
9839 SDValue StoreVal = ST->getOperand(1);
9840 if (ST->isTruncatingStore())
9841 StoreVal = DAG.getNode(ISD::TRUNCATE, dl, MemVT, ST->getOperand(1));
9842
9843 RTLIB::Libcall LC =
9844 (MemVT == MVT::i32) ? RTLIB::AEABI_UWRITE4 : RTLIB::AEABI_UWRITE8;
9845
9846 MakeLibCallOptions Opts;
9847 auto CallResult =
9848 makeLibCall(DAG, LC, MVT::isVoid, {StoreVal, ST->getBasePtr()}, Opts,
9849 dl, ST->getChain());
9850
9851 return CallResult.second;
9852 }
9853
9854 // Default expand to individual stores
9855 if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
9856 return expandUnalignedStore(ST, DAG);
9857 return SDValue();
9858}
9859
// Lower a load of an MVE predicate type (v2i1/v4i1/v8i1/v16i1): load the
// underlying bits as an i32 extload, cast them into a predicate register and,
// for the narrower predicate types, extract the low subvector.
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  EVT MemVT = LD->getMemoryVT();
  assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
          MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == Op.getValueType());
  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
         "Expected a non-extending load");
  assert(LD->isUnindexed() && "Expected a unindexed load");

  // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
  // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
  // need to make sure that 8/4/2 bits are actually loaded into the correct
  // place, which means loading the value and then shuffling the values into
  // the bottom bits of the predicate.
  // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
  // for BE).
  // Speaking of BE, apparently the rest of llvm will assume a reverse order to
  // a natural VMSR(load), so needs to be reversed.

  SDLoc dl(Op);
  SDValue Load = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
      LD->getMemOperand());
  SDValue Val = Load;
  // On big-endian, reverse the loaded bits and shift them down so the
  // predicate lanes land in the bottom bits.
  if (DAG.getDataLayout().isBigEndian())
    Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
                      DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
  SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
  if (MemVT != MVT::v16i1)
    Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
                       DAG.getConstant(0, dl, MVT::i32));
  return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}
9897
void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) const {
  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();

  // A volatile, sufficiently-aligned i64 load can be emitted as a single
  // LDRD on v5TE+ (non-Thumb1) targets.
  if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
      !Subtarget->isThumb1Only() && LD->isVolatile() &&
      LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
    assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
    SDLoc dl(N);
        ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
        {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
    // Pair the two 32-bit results back into an i64, low half first on LE.
    SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
    SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
    Results.append({Pair, Result.getValue(2)});
  } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
    // An unaligned i32/i64 load may become an AEABI unaligned-read libcall;
    // if not, Results is left empty and default lowering applies.
    auto Pair = LowerAEABIUnalignedLoad(SDValue(N, 0), DAG);
    if (Pair.first) {
      Results.push_back(Pair.first);
      Results.push_back(Pair.second);
    }
  }
}
9923
// Lower a store of an MVE predicate type (v2i1/v4i1/v8i1/v16i1): move the
// predicate bits into a GPR via PREDICATE_CAST and emit a truncating scalar
// store of just the bits that make up the predicate.
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  EVT MemVT = ST->getMemoryVT();
  assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
          MemVT == MVT::v16i1) &&
         "Expected a predicate type!");
  assert(MemVT == ST->getValue().getValueType());
  assert(!ST->isTruncatingStore() && "Expected a non-extending store");
  assert(ST->isUnindexed() && "Expected a unindexed store");

  // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
  // top bits unset and a scalar store.
  SDLoc dl(Op);
  SDValue Build = ST->getValue();
  if (MemVT != MVT::v16i1) {
    // Rebuild the predicate as a v16i1: the interesting lanes first (lane
    // order reversed on big-endian), remaining lanes undef.
    for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
      unsigned Elt = DAG.getDataLayout().isBigEndian()
                         ? MemVT.getVectorNumElements() - I - 1
                         : I;
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
                                DAG.getConstant(Elt, dl, MVT::i32)));
    }
    for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
      Ops.push_back(DAG.getUNDEF(MVT::i32));
    Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
  }
  SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
  // For a full v16i1 on big-endian, reverse the bits and shift them into the
  // bottom 16 bits before storing.
  if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
    GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
                      DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
                      DAG.getConstant(16, dl, MVT::i32));
  return DAG.getTruncStore(
      ST->getChain(), dl, GRP, ST->getBasePtr(),
      ST->getMemOperand());
}
9961
9962SDValue ARMTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG,
9963 const ARMSubtarget *Subtarget) const {
9964 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9965 EVT MemVT = ST->getMemoryVT();
9966
9967 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9968 !Subtarget->isThumb1Only() && ST->isVolatile() &&
9969 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9970 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
9971 SDNode *N = Op.getNode();
9972 SDLoc dl(N);
9973
9974 SDValue Lo = DAG.getNode(
9975 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9976 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
9977 MVT::i32));
9978 SDValue Hi = DAG.getNode(
9979 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9980 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
9981 MVT::i32));
9982
9983 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
9984 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
9985 MemVT, ST->getMemOperand());
9986 } else if (Subtarget->hasMVEIntegerOps() &&
9987 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9988 MemVT == MVT::v16i1))) {
9989 return LowerPredicateStore(Op, DAG);
9990 } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
9991 return LowerAEABIUnalignedStore(Op, DAG);
9992 }
9993 return SDValue();
9994}
9995
9996static bool isZeroVector(SDValue N) {
9997 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9998 (N->getOpcode() == ARMISD::VMOVIMM &&
9999 isNullConstant(N->getOperand(0))));
10000}
10001
10004 MVT VT = Op.getSimpleValueType();
10005 SDValue Mask = N->getMask();
10006 SDValue PassThru = N->getPassThru();
10007 SDLoc dl(Op);
10008
10009 if (isZeroVector(PassThru))
10010 return Op;
10011
10012 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10013 // zero too, and other values are lowered to a select.
10014 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10015 DAG.getTargetConstant(0, dl, MVT::i32));
10016 SDValue NewLoad = DAG.getMaskedLoad(
10017 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10018 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10019 N->getExtensionType(), N->isExpandingLoad());
10020 SDValue Combo = NewLoad;
10021 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10022 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10023 isZeroVector(PassThru->getOperand(0));
10024 if (!PassThru.isUndef() && !PassThruIsCastZero)
10025 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10026 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10027}
10028
10030 const ARMSubtarget *ST) {
10031 if (!ST->hasMVEIntegerOps())
10032 return SDValue();
10033
10034 SDLoc dl(Op);
10035 unsigned BaseOpcode = 0;
10036 switch (Op->getOpcode()) {
10037 default: llvm_unreachable("Expected VECREDUCE opcode");
10038 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10039 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10040 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10041 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10042 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10043 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10044 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10045 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10046 }
10047
10048 SDValue Op0 = Op->getOperand(0);
10049 EVT VT = Op0.getValueType();
10050 EVT EltVT = VT.getVectorElementType();
10051 unsigned NumElts = VT.getVectorNumElements();
10052 unsigned NumActiveLanes = NumElts;
10053
10054 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10055 NumActiveLanes == 2) &&
10056 "Only expected a power 2 vector size");
10057
10058 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
10059 // allows us to easily extract vector elements from the lanes.
10060 while (NumActiveLanes > 4) {
10061 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10062 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10063 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10064 NumActiveLanes /= 2;
10065 }
10066
10067 SDValue Res;
10068 if (NumActiveLanes == 4) {
10069 // The remaining 4 elements are summed sequentially
10070 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10071 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10072 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10073 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10074 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10075 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10076 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10077 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10078 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10079 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10080 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10081 } else {
10082 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10083 DAG.getConstant(0, dl, MVT::i32));
10084 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10085 DAG.getConstant(1, dl, MVT::i32));
10086 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10087 }
10088
10089 // Result type may be wider than element type.
10090 if (EltVT != Op->getValueType(0))
10091 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10092 return Res;
10093}
10094
10096 const ARMSubtarget *ST) {
10097 if (!ST->hasMVEFloatOps())
10098 return SDValue();
10099 return LowerVecReduce(Op, DAG, ST);
10100}
10101
10103 const ARMSubtarget *ST) {
10104 if (!ST->hasNEON())
10105 return SDValue();
10106
10107 SDLoc dl(Op);
10108 SDValue Op0 = Op->getOperand(0);
10109 EVT VT = Op0.getValueType();
10110 EVT EltVT = VT.getVectorElementType();
10111
10112 unsigned PairwiseIntrinsic = 0;
10113 switch (Op->getOpcode()) {
10114 default:
10115 llvm_unreachable("Expected VECREDUCE opcode");
10117 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10118 break;
10120 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10121 break;
10123 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10124 break;
10126 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10127 break;
10128 }
10129 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10130
10131 unsigned NumElts = VT.getVectorNumElements();
10132 unsigned NumActiveLanes = NumElts;
10133
10134 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10135 NumActiveLanes == 2) &&
10136 "Only expected a power 2 vector size");
10137
10138 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10139 if (VT.is128BitVector()) {
10140 SDValue Lo, Hi;
10141 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10142 VT = Lo.getValueType();
10143 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10144 NumActiveLanes /= 2;
10145 }
10146
10147 // Use pairwise reductions until one lane remains
10148 while (NumActiveLanes > 1) {
10149 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10150 NumActiveLanes /= 2;
10151 }
10152
10153 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10154 DAG.getConstant(0, dl, MVT::i32));
10155
10156 // Result type may be wider than element type.
10157 if (EltVT != Op.getValueType()) {
10158 unsigned Extend = 0;
10159 switch (Op->getOpcode()) {
10160 default:
10161 llvm_unreachable("Expected VECREDUCE opcode");
10164 Extend = ISD::ZERO_EXTEND;
10165 break;
10168 Extend = ISD::SIGN_EXTEND;
10169 break;
10170 }
10171 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10172 }
10173 return Res;
10174}
10175
10177 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10178 // Acquire/Release load/store is not legal for targets without a dmb or
10179 // equivalent available.
10180 return SDValue();
10181
10182 // Monotonic load/store is legal for all targets.
10183 return Op;
10184}
10185
10188 SelectionDAG &DAG,
10189 const ARMSubtarget *Subtarget) {
10190 SDLoc DL(N);
10191 // Under Power Management extensions, the cycle-count is:
10192 // mrc p15, #0, <Rt>, c9, c13, #0
10193 SDValue Ops[] = { N->getOperand(0), // Chain
10194 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10195 DAG.getTargetConstant(15, DL, MVT::i32),
10196 DAG.getTargetConstant(0, DL, MVT::i32),
10197 DAG.getTargetConstant(9, DL, MVT::i32),
10198 DAG.getTargetConstant(13, DL, MVT::i32),
10199 DAG.getTargetConstant(0, DL, MVT::i32)
10200 };
10201
10202 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10203 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10204 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10205 DAG.getConstant(0, DL, MVT::i32)));
10206 Results.push_back(Cycles32.getValue(1));
10207}
10208
10210 SDValue V1) {
10211 SDLoc dl(V0.getNode());
10212 SDValue RegClass =
10213 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10214 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10215 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10216 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10217 return SDValue(
10218 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10219}
10220
10222 SDLoc dl(V.getNode());
10223 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10224 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10225 if (isBigEndian)
10226 std::swap(VLo, VHi);
10227 return createGPRPairNode2xi32(DAG, VLo, VHi);
10228}
10229
10232 SelectionDAG &DAG) {
10233 assert(N->getValueType(0) == MVT::i64 &&
10234 "AtomicCmpSwap on types less than 64 should be legal");
10235 SDValue Ops[] = {
10236 createGPRPairNode2xi32(DAG, N->getOperand(1),
10237 DAG.getUNDEF(MVT::i32)), // pointer, temp
10238 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10239 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10240 N->getOperand(0), // chain in
10241 };
10242 SDNode *CmpSwap = DAG.getMachineNode(
10243 ARM::CMP_SWAP_64, SDLoc(N),
10244 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10245
10246 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10247 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10248
10249 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10250
10251 SDValue Lo =
10252 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10253 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10254 SDValue Hi =
10255 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10256 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10257 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10258 Results.push_back(SDValue(CmpSwap, 2));
10259}
10260
10261SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10262 SDLoc dl(Op);
10263 EVT VT = Op.getValueType();
10264 SDValue Chain = Op.getOperand(0);
10265 SDValue LHS = Op.getOperand(1);
10266 SDValue RHS = Op.getOperand(2);
10267 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10268 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10269
10270 // If we don't have instructions of this float type then soften to a libcall
10271 // and use SETCC instead.
10272 if (isUnsupportedFloatingType(LHS.getValueType())) {
10273 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10274 Chain, IsSignaling);
10275 if (!RHS.getNode()) {
10276 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10277 CC = ISD::SETNE;
10278 }
10279 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10280 DAG.getCondCode(CC));
10281 return DAG.getMergeValues({Result, Chain}, dl);
10282 }
10283
10284 ARMCC::CondCodes CondCode, CondCode2;
10285 FPCCToARMCC(CC, CondCode, CondCode2);
10286
10287 SDValue True = DAG.getConstant(1, dl, VT);
10288 SDValue False = DAG.getConstant(0, dl, VT);
10289 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10290 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10291 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10292 if (CondCode2 != ARMCC::AL) {
10293 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10294 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10295 }
10296 return DAG.getMergeValues({Result, Chain}, dl);
10297}
10298
10299SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10300 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10301
10302 EVT VT = getPointerTy(DAG.getDataLayout());
10303 int FI = MFI.CreateFixedObject(4, 0, false);
10304 return DAG.getFrameIndex(FI, VT);
10305}
10306
10307SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10308 SelectionDAG &DAG) const {
10309 SDLoc DL(Op);
10310 MakeLibCallOptions CallOptions;
10311 MVT SVT = Op.getOperand(0).getSimpleValueType();
10312 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10313 SDValue Res =
10314 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10315 return DAG.getBitcast(MVT::i32, Res);
10316}
10317
10318SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10319 SDLoc dl(Op);
10320 SDValue LHS = Op.getOperand(0);
10321 SDValue RHS = Op.getOperand(1);
10322
10323 // Determine if this is signed or unsigned comparison
10324 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10325
10326 // Special case for Thumb1 UCMP only
10327 if (!IsSigned && Subtarget->isThumb1Only()) {
10328 // For Thumb unsigned comparison, use this sequence:
10329 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10330 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10331 // cmp r1, r0 ; compare RHS with LHS
10332 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10333 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10334
10335 // First subtraction: LHS - RHS
10336 SDValue Sub1WithFlags = DAG.getNode(
10337 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10338 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10339 SDValue Flags1 = Sub1WithFlags.getValue(1);
10340
10341 // SUBE: Sub1Result - Sub1Result - !carry
10342 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10343 SDValue Sbc1 =
10344 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10345 Sub1Result, Sub1Result, Flags1);
10346 SDValue Sbc1Result = Sbc1.getValue(0);
10347
10348 // Second comparison: RHS vs LHS (reverse comparison)
10349 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10350
10351 // SUBE: RHS - RHS - !carry
10352 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10353 SDValue Sbc2 = DAG.getNode(
10354 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10355 SDValue Sbc2Result = Sbc2.getValue(0);
10356
10357 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10358 SDValue Result =
10359 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10360 if (Op.getValueType() != MVT::i32)
10361 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10362
10363 return Result;
10364 }
10365
10366 // For the ARM assembly pattern:
10367 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10368 // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for
10369 // unsigned) mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for
10370 // signed, LO for unsigned)
10371 // ; if LHS == RHS, result remains 0 from the subs
10372
10373 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10374 unsigned Opcode = ARMISD::SUBC;
10375
10376 // Check if RHS is a subtraction against 0: (0 - X)
10377 if (RHS.getOpcode() == ISD::SUB) {
10378 SDValue SubLHS = RHS.getOperand(0);
10379 SDValue SubRHS = RHS.getOperand(1);
10380
10381 // Check if it's 0 - X
10382 if (isNullConstant(SubLHS)) {
10383 bool CanUseAdd = false;
10384 if (IsSigned) {
10385 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10386 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10388 .isMinSignedValue()) {
10389 CanUseAdd = true;
10390 }
10391 } else {
10392 // For UCMP: only if X is known to never be zero
10393 if (DAG.isKnownNeverZero(SubRHS)) {
10394 CanUseAdd = true;
10395 }
10396 }
10397
10398 if (CanUseAdd) {
10399 Opcode = ARMISD::ADDC;
10400 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10401 // LHS - (0 - X)
10402 }
10403 }
10404 }
10405
10406 // Generate the operation with flags
10407 SDValue OpWithFlags =
10408 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10409
10410 SDValue OpResult = OpWithFlags.getValue(0);
10411 SDValue Flags = OpWithFlags.getValue(1);
10412
10413 // Constants for conditional moves
10414 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10415 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10416
10417 // Select condition codes based on signed vs unsigned
10418 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10419 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10420
10421 // First conditional move: if greater than, set to 1
10422 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10423 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10424 GTCondValue, Flags);
10425
10426 // Second conditional move: if less than, set to -1
10427 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10428 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10429 LTCondValue, Flags);
10430
10431 if (Op.getValueType() != MVT::i32)
10432 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10433
10434 return Result2;
10435}
10436
10438 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10439 switch (Op.getOpcode()) {
10440 default: llvm_unreachable("Don't know how to custom lower this!");
10441 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10442 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10443 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10444 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10445 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10446 case ISD::SELECT: return LowerSELECT(Op, DAG);
10447 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10448 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10449 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10450 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10451 case ISD::VASTART: return LowerVASTART(Op, DAG);
10452 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10453 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10454 case ISD::SINT_TO_FP:
10455 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10458 case ISD::FP_TO_SINT:
10459 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10461 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10462 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10463 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10464 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10465 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10466 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10467 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10468 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10469 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10470 Subtarget);
10471 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10472 case ISD::SHL:
10473 case ISD::SRL:
10474 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10475 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10476 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10477 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10478 case ISD::SRL_PARTS:
10479 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10480 case ISD::CTTZ:
10481 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10482 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10483 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10484 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10485 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10486 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10487 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10488 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10489 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10490 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10491 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10492 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10493 case ISD::SIGN_EXTEND:
10494 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10495 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10496 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10497 case ISD::SET_FPMODE:
10498 return LowerSET_FPMODE(Op, DAG);
10499 case ISD::RESET_FPMODE:
10500 return LowerRESET_FPMODE(Op, DAG);
10501 case ISD::MUL: return LowerMUL(Op, DAG);
10502 case ISD::SDIV:
10503 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10504 !Op.getValueType().isVector())
10505 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10506 return LowerSDIV(Op, DAG, Subtarget);
10507 case ISD::UDIV:
10508 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10509 !Op.getValueType().isVector())
10510 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10511 return LowerUDIV(Op, DAG, Subtarget);
10512 case ISD::UADDO_CARRY:
10513 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::ADDE, false /*unsigned*/);
10514 case ISD::USUBO_CARRY:
10515 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::SUBE, false /*unsigned*/);
10516 case ISD::SADDO_CARRY:
10517 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::ADDE, true /*signed*/);
10518 case ISD::SSUBO_CARRY:
10519 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::SUBE, true /*signed*/);
10520 case ISD::UADDO:
10521 case ISD::USUBO:
10522 case ISD::UMULO:
10523 case ISD::SADDO:
10524 case ISD::SSUBO:
10525 case ISD::SMULO:
10526 return LowerALUO(Op, DAG);
10527 case ISD::SADDSAT:
10528 case ISD::SSUBSAT:
10529 case ISD::UADDSAT:
10530 case ISD::USUBSAT:
10531 return LowerADDSUBSAT(Op, DAG, Subtarget);
10532 case ISD::LOAD: {
10533 auto *LD = cast<LoadSDNode>(Op);
10534 EVT MemVT = LD->getMemoryVT();
10535 if (Subtarget->hasMVEIntegerOps() &&
10536 (MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10537 MemVT == MVT::v16i1))
10538 return LowerPredicateLoad(Op, DAG);
10539
10540 auto Pair = LowerAEABIUnalignedLoad(Op, DAG);
10541 if (Pair.first)
10542 return DAG.getMergeValues({Pair.first, Pair.second}, SDLoc(Pair.first));
10543 return SDValue();
10544 }
10545 case ISD::STORE:
10546 return LowerSTORE(Op, DAG, Subtarget);
10547 case ISD::MLOAD:
10548 return LowerMLOAD(Op, DAG);
10549 case ISD::VECREDUCE_MUL:
10550 case ISD::VECREDUCE_AND:
10551 case ISD::VECREDUCE_OR:
10552 case ISD::VECREDUCE_XOR:
10553 return LowerVecReduce(Op, DAG, Subtarget);
10558 return LowerVecReduceF(Op, DAG, Subtarget);
10563 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10564 case ISD::ATOMIC_LOAD:
10565 case ISD::ATOMIC_STORE:
10566 return LowerAtomicLoadStore(Op, DAG);
10567 case ISD::SDIVREM:
10568 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10570 if (getTargetMachine().getTargetTriple().isOSWindows())
10571 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10572 llvm_unreachable("Don't know how to custom lower this!");
10574 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10576 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10577 case ISD::STRICT_FSETCC:
10578 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10579 case ISD::SPONENTRY:
10580 return LowerSPONENTRY(Op, DAG);
10581 case ISD::FP_TO_BF16:
10582 return LowerFP_TO_BF16(Op, DAG);
10583 case ARMISD::WIN__DBZCHK: return SDValue();
10584 case ISD::UCMP:
10585 case ISD::SCMP:
10586 return LowerCMP(Op, DAG);
10587 case ISD::ABS:
10588 return LowerABS(Op, DAG);
10589 case ISD::STRICT_LROUND:
10591 case ISD::STRICT_LRINT:
10592 case ISD::STRICT_LLRINT: {
10593 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10594 Op.getOperand(1).getValueType() == MVT::bf16) &&
10595 "Expected custom lowering of rounding operations only for f16");
10596 SDLoc DL(Op);
10597 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10598 {Op.getOperand(0), Op.getOperand(1)});
10599 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10600 {Ext.getValue(1), Ext.getValue(0)});
10601 }
10602 }
10603}
10604
10606 SelectionDAG &DAG) {
10607 unsigned IntNo = N->getConstantOperandVal(0);
10608 unsigned Opc = 0;
10609 if (IntNo == Intrinsic::arm_smlald)
10610 Opc = ARMISD::SMLALD;
10611 else if (IntNo == Intrinsic::arm_smlaldx)
10612 Opc = ARMISD::SMLALDX;
10613 else if (IntNo == Intrinsic::arm_smlsld)
10614 Opc = ARMISD::SMLSLD;
10615 else if (IntNo == Intrinsic::arm_smlsldx)
10616 Opc = ARMISD::SMLSLDX;
10617 else
10618 return;
10619
10620 SDLoc dl(N);
10621 SDValue Lo, Hi;
10622 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10623
10624 SDValue LongMul = DAG.getNode(Opc, dl,
10625 DAG.getVTList(MVT::i32, MVT::i32),
10626 N->getOperand(1), N->getOperand(2),
10627 Lo, Hi);
10628 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10629 LongMul.getValue(0), LongMul.getValue(1)));
10630}
10631
10632/// ReplaceNodeResults - Replace the results of node with an illegal result
10633/// type with new values built out of custom code.
10636 SelectionDAG &DAG) const {
10637 SDValue Res;
10638 switch (N->getOpcode()) {
10639 default:
10640 llvm_unreachable("Don't know how to custom expand this!");
10641 case ISD::READ_REGISTER:
10643 break;
10644 case ISD::BITCAST:
10645 Res = ExpandBITCAST(N, DAG, Subtarget);
10646 break;
10647 case ISD::SRL:
10648 case ISD::SRA:
10649 case ISD::SHL:
10650 Res = Expand64BitShift(N, DAG, Subtarget);
10651 break;
10652 case ISD::SREM:
10653 case ISD::UREM:
10654 Res = LowerREM(N, DAG);
10655 break;
10656 case ISD::SDIVREM:
10657 case ISD::UDIVREM:
10658 Res = LowerDivRem(SDValue(N, 0), DAG);
10659 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10660 Results.push_back(Res.getValue(0));
10661 Results.push_back(Res.getValue(1));
10662 return;
10663 case ISD::SADDSAT:
10664 case ISD::SSUBSAT:
10665 case ISD::UADDSAT:
10666 case ISD::USUBSAT:
10667 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10668 break;
10670 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10671 return;
10672 case ISD::UDIV:
10673 case ISD::SDIV:
10674 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
10675 "can only expand DIV on Windows");
10676 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10677 Results);
10680 return;
10682 return ReplaceLongIntrinsic(N, Results, DAG);
10683 case ISD::LOAD:
10684 LowerLOAD(N, Results, DAG);
10685 break;
10686 case ISD::STORE:
10687 Res = LowerAEABIUnalignedStore(SDValue(N, 0), DAG);
10688 break;
10689 case ISD::TRUNCATE:
10690 Res = LowerTruncate(N, DAG, Subtarget);
10691 break;
10692 case ISD::SIGN_EXTEND:
10693 case ISD::ZERO_EXTEND:
10694 Res = LowerVectorExtend(N, DAG, Subtarget);
10695 break;
10698 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10699 break;
10700 }
10701 if (Res.getNode())
10702 Results.push_back(Res);
10703}
10704
10705//===----------------------------------------------------------------------===//
10706// ARM Scheduler Hooks
10707//===----------------------------------------------------------------------===//
10708
10709/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10710/// registers the function context.
10711void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10713 MachineBasicBlock *DispatchBB,
10714 int FI) const {
10715 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10716 "ROPI/RWPI not currently supported with SjLj");
10717 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10718 DebugLoc dl = MI.getDebugLoc();
10719 MachineFunction *MF = MBB->getParent();
10720 MachineRegisterInfo *MRI = &MF->getRegInfo();
10723 const Function &F = MF->getFunction();
10724
10725 bool isThumb = Subtarget->isThumb();
10726 bool isThumb2 = Subtarget->isThumb2();
10727
10728 unsigned PCLabelId = AFI->createPICLabelUId();
10729 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10731 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10732 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10733
10734 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10735 : &ARM::GPRRegClass;
10736
10737 // Grab constant pool and fixed stack memory operands.
10738 MachineMemOperand *CPMMO =
10741
10742 MachineMemOperand *FIMMOSt =
10745
10746 // Load the address of the dispatch MBB into the jump buffer.
10747 if (isThumb2) {
10748 // Incoming value: jbuf
10749 // ldr.n r5, LCPI1_1
10750 // orr r5, r5, #1
10751 // add r5, pc
10752 // str r5, [$jbuf, #+4] ; &jbuf[1]
10753 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10754 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10756 .addMemOperand(CPMMO)
10758 // Set the low bit because of thumb mode.
10759 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10760 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10761 .addReg(NewVReg1, RegState::Kill)
10762 .addImm(0x01)
10764 .add(condCodeOp());
10765 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10766 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10767 .addReg(NewVReg2, RegState::Kill)
10768 .addImm(PCLabelId);
10769 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10770 .addReg(NewVReg3, RegState::Kill)
10771 .addFrameIndex(FI)
10772 .addImm(36) // &jbuf[1] :: pc
10773 .addMemOperand(FIMMOSt)
10775 } else if (isThumb) {
10776 // Incoming value: jbuf
10777 // ldr.n r1, LCPI1_4
10778 // add r1, pc
10779 // mov r2, #1
10780 // orrs r1, r2
10781 // add r2, $jbuf, #+4 ; &jbuf[1]
10782 // str r1, [r2]
10783 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10784 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10786 .addMemOperand(CPMMO)
10788 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10789 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10790 .addReg(NewVReg1, RegState::Kill)
10791 .addImm(PCLabelId);
10792 // Set the low bit because of thumb mode.
10793 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10794 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10795 .addReg(ARM::CPSR, RegState::Define)
10796 .addImm(1)
10798 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10799 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10800 .addReg(ARM::CPSR, RegState::Define)
10801 .addReg(NewVReg2, RegState::Kill)
10802 .addReg(NewVReg3, RegState::Kill)
10804 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10805 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10806 .addFrameIndex(FI)
10807 .addImm(36); // &jbuf[1] :: pc
10808 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10809 .addReg(NewVReg4, RegState::Kill)
10810 .addReg(NewVReg5, RegState::Kill)
10811 .addImm(0)
10812 .addMemOperand(FIMMOSt)
10814 } else {
10815 // Incoming value: jbuf
10816 // ldr r1, LCPI1_1
10817 // add r1, pc, r1
10818 // str r1, [$jbuf, #+4] ; &jbuf[1]
10819 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10820 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10822 .addImm(0)
10823 .addMemOperand(CPMMO)
10825 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10826 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10827 .addReg(NewVReg1, RegState::Kill)
10828 .addImm(PCLabelId)
10830 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10831 .addReg(NewVReg2, RegState::Kill)
10832 .addFrameIndex(FI)
10833 .addImm(36) // &jbuf[1] :: pc
10834 .addMemOperand(FIMMOSt)
10836 }
10837}
10838
/// Lower the SjLj exception-handling dispatch: collect every landing pad
/// keyed by call-site number, build an inline jump table over them, and emit
/// a dispatch block that loads the call-site index from the function context,
/// bounds-checks it (trapping when out of range), and jumps through the table
/// to the matching landing pad. All invoke blocks' EH edges are rerouted to
/// the new dispatch block, which becomes the function's only landing pad.
10839 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10840 MachineBasicBlock *MBB) const {
10841 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10842 DebugLoc dl = MI.getDebugLoc();
10843 MachineFunction *MF = MBB->getParent();
10844 MachineRegisterInfo *MRI = &MF->getRegInfo();
10845 MachineFrameInfo &MFI = MF->getFrameInfo();
10846 int FI = MFI.getFunctionContextIndex();
10847
10848 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10849 : &ARM::GPRnopcRegClass;
10850
10851 // Get a mapping of the call site numbers to all of the landing pads they're
10852 // associated with.
10853 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10854 unsigned MaxCSNum = 0;
10855 for (MachineBasicBlock &BB : *MF) {
10856 if (!BB.isEHPad())
10857 continue;
10858
10859 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10860 // pad.
10861 for (MachineInstr &II : BB) {
10862 if (!II.isEHLabel())
10863 continue;
10864
10865 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10866 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10867
10868 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10869 for (unsigned Idx : CallSiteIdxs) {
10870 CallSiteNumToLPad[Idx].push_back(&BB);
10871 MaxCSNum = std::max(MaxCSNum, Idx);
10872 }
// Only the first call-site EH label of each pad is considered.
10873 break;
10874 }
10875 }
10876
10877 // Get an ordered list of the machine basic blocks for the jump table.
10878 std::vector<MachineBasicBlock*> LPadList;
10879 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10880 LPadList.reserve(CallSiteNumToLPad.size());
// Call-site numbers start at 1; iterate in order so the jump table index
// matches the call-site index loaded at runtime.
10881 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10882 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10883 for (MachineBasicBlock *MBB : MBBList) {
10884 LPadList.push_back(MBB);
10885 InvokeBBs.insert_range(MBB->predecessors());
10886 }
10887 }
10888
10889 assert(!LPadList.empty() &&
10890 "No landing pad destinations for the dispatch jump table!");
10891
10892 // Create the jump table and associated information.
10893 MachineJumpTableInfo *JTI =
10894 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline)
10895 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10896
10897 // Create the MBBs for the dispatch code.
10898
10899 // Shove the dispatch's address into the return slot in the function context.
10900 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10901 DispatchBB->setIsEHPad();
10902
10903 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10904
// Out-of-range call-site indices end up here.
10905 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10906 DispatchBB->addSuccessor(TrapBB);
10907
10908 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10909 DispatchBB->addSuccessor(DispContBB);
10910
10911 // Insert the new MBBs at the end of the function.
10912 MF->insert(MF->end(), DispatchBB);
10913 MF->insert(MF->end(), DispContBB);
10914 MF->insert(MF->end(), TrapBB);
10915
10916 // Insert code into the entry block that creates and registers the function
10917 // context.
10918 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10919
10920 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10923
10924 MachineInstrBuilder MIB;
10925 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10926
10927 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10928 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10929
10930 // Add a register mask with no preserved registers. This results in all
10931 // registers being marked as clobbered. This can't work if the dispatch block
10932 // is in a Thumb1 function and is linked with ARM code which uses the FP
10933 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10935
// Per-ISA dispatch sequence: load the call-site index from the function
// context, compare it against the landing-pad count (branch to TrapBB when
// out of range), then compute the jump-table target and branch through it.
10936 bool IsPositionIndependent = isPositionIndependent();
10937 unsigned NumLPads = LPadList.size();
10938 if (Subtarget->isThumb2()) {
10939 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10940 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10941 .addFrameIndex(FI)
10942 .addImm(4)
10943 .addMemOperand(FIMMOLd)
10945
10946 if (NumLPads < 256) {
10947 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10948 .addReg(NewVReg1)
10949 .addImm(LPadList.size())
10951 } else {
// NumLPads does not fit a modified-immediate compare; materialize it with
// movw/movt and use a register-register compare.
10952 Register VReg1 = MRI->createVirtualRegister(TRC);
10953 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10954 .addImm(NumLPads & 0xFFFF)
10956
10957 unsigned VReg2 = VReg1;
10958 if ((NumLPads & 0xFFFF0000) != 0) {
10959 VReg2 = MRI->createVirtualRegister(TRC);
10960 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10961 .addReg(VReg1)
10962 .addImm(NumLPads >> 16)
10964 }
10965
10966 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10967 .addReg(NewVReg1)
10968 .addReg(VReg2)
10970 }
10971
10972 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10973 .addMBB(TrapBB)
10975 .addReg(ARM::CPSR)
10976
10977 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10978 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10979 .addJumpTableIndex(MJTI)
10981
10982 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10983 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10984 .addReg(NewVReg3, RegState::Kill)
10985 .addReg(NewVReg1)
10988 .add(condCodeOp());
10989
10990 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10991 .addReg(NewVReg4, RegState::Kill)
10992 .addReg(NewVReg1)
10993 .addJumpTableIndex(MJTI);
10994 } else if (Subtarget->isThumb()) {
10995 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10996 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10997 .addFrameIndex(FI)
10998 .addImm(1)
10999 .addMemOperand(FIMMOLd)
11001
11002 if (NumLPads < 256) {
11003 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11004 .addReg(NewVReg1)
11005 .addImm(NumLPads)
11007 } else {
// Thumb1 cannot materialize a large immediate inline; load it from the
// constant pool instead.
11008 MachineConstantPool *ConstantPool = MF->getConstantPool();
11009 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11010 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11011
11012 // MachineConstantPool wants an explicit alignment.
11013 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11014 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11015
11016 Register VReg1 = MRI->createVirtualRegister(TRC);
11017 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11018 .addReg(VReg1, RegState::Define)
11021 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11022 .addReg(NewVReg1)
11023 .addReg(VReg1)
11025 }
11026
11027 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11028 .addMBB(TrapBB)
11030 .addReg(ARM::CPSR)
11031
11032 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11033 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11034 .addReg(ARM::CPSR, RegState::Define)
11035 .addReg(NewVReg1)
11036 .addImm(2)
11038
11039 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11040 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11041 .addJumpTableIndex(MJTI)
11043
11044 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11045 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11046 .addReg(ARM::CPSR, RegState::Define)
11047 .addReg(NewVReg2, RegState::Kill)
11048 .addReg(NewVReg3)
11050
11051 MachineMemOperand *JTMMOLd =
11052 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11054
11055 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11056 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11057 .addReg(NewVReg4, RegState::Kill)
11058 .addImm(0)
11059 .addMemOperand(JTMMOLd)
11061
11062 unsigned NewVReg6 = NewVReg5;
11063 if (IsPositionIndependent) {
// PIC jump tables store offsets; add the table base back in.
11064 NewVReg6 = MRI->createVirtualRegister(TRC);
11065 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11066 .addReg(ARM::CPSR, RegState::Define)
11067 .addReg(NewVReg5, RegState::Kill)
11068 .addReg(NewVReg3)
11070 }
11071
11072 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11073 .addReg(NewVReg6, RegState::Kill)
11074 .addJumpTableIndex(MJTI);
11075 } else {
11076 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11077 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11078 .addFrameIndex(FI)
11079 .addImm(4)
11080 .addMemOperand(FIMMOLd)
11082
11083 if (NumLPads < 256) {
11084 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11085 .addReg(NewVReg1)
11086 .addImm(NumLPads)
11088 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11089 Register VReg1 = MRI->createVirtualRegister(TRC);
11090 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11091 .addImm(NumLPads & 0xFFFF)
11093
11094 unsigned VReg2 = VReg1;
11095 if ((NumLPads & 0xFFFF0000) != 0) {
11096 VReg2 = MRI->createVirtualRegister(TRC);
11097 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11098 .addReg(VReg1)
11099 .addImm(NumLPads >> 16)
11101 }
11102
11103 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11104 .addReg(NewVReg1)
11105 .addReg(VReg2)
11107 } else {
11108 MachineConstantPool *ConstantPool = MF->getConstantPool();
11109 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11110 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11111
11112 // MachineConstantPool wants an explicit alignment.
11113 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11114 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11115
11116 Register VReg1 = MRI->createVirtualRegister(TRC);
11117 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11118 .addReg(VReg1, RegState::Define)
11120 .addImm(0)
11122 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11123 .addReg(NewVReg1)
11124 .addReg(VReg1, RegState::Kill)
11126 }
11127
11128 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11129 .addMBB(TrapBB)
11131 .addReg(ARM::CPSR)
11132
11133 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11134 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11135 .addReg(NewVReg1)
11138 .add(condCodeOp());
11139 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11140 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11141 .addJumpTableIndex(MJTI)
11143
11144 MachineMemOperand *JTMMOLd =
11145 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11147 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11148 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11149 .addReg(NewVReg3, RegState::Kill)
11150 .addReg(NewVReg4)
11151 .addImm(0)
11152 .addMemOperand(JTMMOLd)
11154
11155 if (IsPositionIndependent) {
11156 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11157 .addReg(NewVReg5, RegState::Kill)
11158 .addReg(NewVReg4)
11159 .addJumpTableIndex(MJTI);
11160 } else {
11161 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11162 .addReg(NewVReg5, RegState::Kill)
11163 .addJumpTableIndex(MJTI);
11164 }
11165 }
11166
11167 // Add the jump table entries as successors to the MBB.
11168 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11169 for (MachineBasicBlock *CurMBB : LPadList) {
11170 if (SeenMBBs.insert(CurMBB).second)
11171 DispContBB->addSuccessor(CurMBB);
11172 }
11173
11174 // N.B. the order the invoke BBs are processed in doesn't matter here.
11175 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11177 for (MachineBasicBlock *BB : InvokeBBs) {
11178
11179 // Remove the landing pad successor from the invoke block and replace it
11180 // with the new dispatch block.
11181 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11182 while (!Successors.empty()) {
11183 MachineBasicBlock *SMBB = Successors.pop_back_val();
11184 if (SMBB->isEHPad()) {
11185 BB->removeSuccessor(SMBB);
11186 MBBLPads.push_back(SMBB);
11187 }
11188 }
11189
// The EH edge to the dispatch block is statistically never taken.
11190 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11191 BB->normalizeSuccProbs();
11192
11193 // Find the invoke call and mark all of the callee-saved registers as
11194 // 'implicit defined' so that they're spilled. This prevents code from
11195 // moving instructions to before the EH block, where they will never be
11196 // executed.
11198 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11199 if (!II->isCall()) continue;
11200
11201 DenseSet<unsigned> DefRegs;
11203 OI = II->operands_begin(), OE = II->operands_end();
11204 OI != OE; ++OI) {
11205 if (!OI->isReg()) continue;
11206 DefRegs.insert(OI->getReg());
11207 }
11208
11209 MachineInstrBuilder MIB(*MF, &*II);
11210
// Filter the callee-saved list down to GPRs addressable in the current
// ISA mode before marking them on the call.
11211 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11212 unsigned Reg = SavedRegs[i];
11213 if (Subtarget->isThumb2() &&
11214 !ARM::tGPRRegClass.contains(Reg) &&
11215 !ARM::hGPRRegClass.contains(Reg))
11216 continue;
11217 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11218 continue;
11219 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11220 continue;
11221 if (!DefRegs.contains(Reg))
11223 }
11224
// Only the last call in the block (the invoke) needs this treatment.
11225 break;
11226 }
11227 }
11228
11229 // Mark all former landing pads as non-landing pads. The dispatch is the only
11230 // landing pad now.
11231 for (MachineBasicBlock *MBBLPad : MBBLPads)
11232 MBBLPad->setIsEHPad(false);
11233
11234 // The instruction is gone now.
11235 MI.eraseFromParent();
11236}
11237
// Return the successor of MBB that is not Succ. The caller guarantees MBB
// has exactly two successors; reaching the end of the loop means that
// invariant was violated.
11238 static
11240 for (MachineBasicBlock *S : MBB->successors())
11241 if (S != Succ)
11242 return S;
11243 llvm_unreachable("Expecting a BB with two successors!");
11244 }
11245
11246/// Return the load opcode for a given load size. If load size >= 8,
11247/// neon opcode will be returned.
11248static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11249 if (LdSize >= 8)
11250 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11251 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11252 if (IsThumb1)
11253 return LdSize == 4 ? ARM::tLDRi
11254 : LdSize == 2 ? ARM::tLDRHi
11255 : LdSize == 1 ? ARM::tLDRBi : 0;
11256 if (IsThumb2)
11257 return LdSize == 4 ? ARM::t2LDR_POST
11258 : LdSize == 2 ? ARM::t2LDRH_POST
11259 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11260 return LdSize == 4 ? ARM::LDR_POST_IMM
11261 : LdSize == 2 ? ARM::LDRH_POST
11262 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11263}
11264
11265/// Return the store opcode for a given store size. If store size >= 8,
11266/// neon opcode will be returned.
11267static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11268 if (StSize >= 8)
11269 return StSize == 16 ? ARM::VST1q32wb_fixed
11270 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11271 if (IsThumb1)
11272 return StSize == 4 ? ARM::tSTRi
11273 : StSize == 2 ? ARM::tSTRHi
11274 : StSize == 1 ? ARM::tSTRBi : 0;
11275 if (IsThumb2)
11276 return StSize == 4 ? ARM::t2STR_POST
11277 : StSize == 2 ? ARM::t2STRH_POST
11278 : StSize == 1 ? ARM::t2STRB_POST : 0;
11279 return StSize == 4 ? ARM::STR_POST_IMM
11280 : StSize == 2 ? ARM::STRH_POST
11281 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11282}
11283
11284/// Emit a post-increment load operation with given size. The instructions
11285/// will be added to BB at Pos.
11287 const TargetInstrInfo *TII, const DebugLoc &dl,
11288 unsigned LdSize, unsigned Data, unsigned AddrIn,
11289 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11290 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11291 assert(LdOpc != 0 && "Should have a load opcode");
11292 if (LdSize >= 8) {
// NEON load with writeback: the instruction itself defines AddrOut.
11293 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11294 .addReg(AddrOut, RegState::Define)
11295 .addReg(AddrIn)
11296 .addImm(0)
11298 } else if (IsThumb1) {
11299 // load + update AddrIn
// Thumb1 has no post-increment addressing: do a plain load, then advance
// the address with a separate tADDi8 into AddrOut.
11300 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11301 .addReg(AddrIn)
11302 .addImm(0)
11304 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11305 .add(t1CondCodeOp())
11306 .addReg(AddrIn)
11307 .addImm(LdSize)
11309 } else if (IsThumb2) {
// Thumb2 post-increment load: AddrOut = AddrIn + LdSize.
11310 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11311 .addReg(AddrOut, RegState::Define)
11312 .addReg(AddrIn)
11313 .addImm(LdSize)
11315 } else { // arm
// ARM-mode post-increment load; the extra .addReg(0) is the (absent)
// offset-register operand.
11316 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11317 .addReg(AddrOut, RegState::Define)
11318 .addReg(AddrIn)
11319 .addReg(0)
11320 .addImm(LdSize)
11322 }
11323 }
11324
11325/// Emit a post-increment store operation with given size. The instructions
11326/// will be added to BB at Pos.
11328 const TargetInstrInfo *TII, const DebugLoc &dl,
11329 unsigned StSize, unsigned Data, unsigned AddrIn,
11330 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11331 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11332 assert(StOpc != 0 && "Should have a store opcode");
11333 if (StSize >= 8) {
// NEON store with writeback: the instruction result operand is AddrOut.
11334 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11335 .addReg(AddrIn)
11336 .addImm(0)
11337 .addReg(Data)
11339 } else if (IsThumb1) {
11340 // store + update AddrIn
// Thumb1 has no post-increment addressing: plain store, then advance the
// address with a separate tADDi8 into AddrOut.
11341 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11342 .addReg(Data)
11343 .addReg(AddrIn)
11344 .addImm(0)
11346 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11347 .add(t1CondCodeOp())
11348 .addReg(AddrIn)
11349 .addImm(StSize)
11351 } else if (IsThumb2) {
// Thumb2 post-increment store: AddrOut = AddrIn + StSize.
11352 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11353 .addReg(Data)
11354 .addReg(AddrIn)
11355 .addImm(StSize)
11357 } else { // arm
// ARM-mode post-increment store; the extra .addReg(0) is the (absent)
// offset-register operand.
11358 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11359 .addReg(Data)
11360 .addReg(AddrIn)
11361 .addReg(0)
11362 .addImm(StSize)
11364 }
11365 }
11366
/// Expand the struct-byval copy pseudo (operands: dst reg, src reg, byte
/// size, alignment) into post-increment load/store pairs: fully unrolled when
/// the size is at or below the inline threshold, otherwise a copy loop plus a
/// byte-wise epilogue for the remainder.
11368 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11369 MachineBasicBlock *BB) const {
11370 // This pseudo instruction has 3 operands: dst, src, size
11371 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11372 // Otherwise, we will generate unrolled scalar copies.
11373 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11374 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11376
11377 Register dest = MI.getOperand(0).getReg();
11378 Register src = MI.getOperand(1).getReg();
11379 unsigned SizeVal = MI.getOperand(2).getImm();
11380 unsigned Alignment = MI.getOperand(3).getImm();
11381 DebugLoc dl = MI.getDebugLoc();
11382
11383 MachineFunction *MF = BB->getParent();
11384 MachineRegisterInfo &MRI = MF->getRegInfo();
11385 unsigned UnitSize = 0;
11386 const TargetRegisterClass *TRC = nullptr;
11387 const TargetRegisterClass *VecTRC = nullptr;
11388
11389 bool IsThumb1 = Subtarget->isThumb1Only();
11390 bool IsThumb2 = Subtarget->isThumb2();
11391 bool IsThumb = Subtarget->isThumb();
11392
// Pick the widest copy unit the alignment allows: byte, half, word, or a
// NEON 8/16-byte unit when NEON is usable.
11393 if (Alignment & 1) {
11394 UnitSize = 1;
11395 } else if (Alignment & 2) {
11396 UnitSize = 2;
11397 } else {
11398 // Check whether we can use NEON instructions.
11399 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11400 Subtarget->hasNEON()) {
11401 if ((Alignment % 16 == 0) && SizeVal >= 16)
11402 UnitSize = 16;
11403 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11404 UnitSize = 8;
11405 }
11406 // Can't use NEON instructions.
11407 if (UnitSize == 0)
11408 UnitSize = 4;
11409 }
11410
11411 // Select the correct opcode and register class for unit size load/store
11412 bool IsNeon = UnitSize >= 8;
11413 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11414 if (IsNeon)
11415 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11416 : UnitSize == 8 ? &ARM::DPRRegClass
11417 : nullptr;
11418
11419 unsigned BytesLeft = SizeVal % UnitSize;
11420 unsigned LoopSize = SizeVal - BytesLeft;
11421
11422 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11423 // Use LDR and STR to copy.
11424 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11425 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11426 unsigned srcIn = src;
11427 unsigned destIn = dest;
// Fully unrolled copy: each iteration defines fresh address vregs (SSA),
// threading srcOut/destOut into the next iteration.
11428 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11429 Register srcOut = MRI.createVirtualRegister(TRC);
11430 Register destOut = MRI.createVirtualRegister(TRC);
11431 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11432 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11433 IsThumb1, IsThumb2);
11434 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11435 IsThumb1, IsThumb2);
11436 srcIn = srcOut;
11437 destIn = destOut;
11438 }
11439
11440 // Handle the leftover bytes with LDRB and STRB.
11441 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11442 // [destOut] = STRB_POST(scratch, destIn, 1)
11443 for (unsigned i = 0; i < BytesLeft; i++) {
11444 Register srcOut = MRI.createVirtualRegister(TRC);
11445 Register destOut = MRI.createVirtualRegister(TRC);
11446 Register scratch = MRI.createVirtualRegister(TRC);
11447 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11448 IsThumb1, IsThumb2);
11449 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11450 IsThumb1, IsThumb2);
11451 srcIn = srcOut;
11452 destIn = destOut;
11453 }
11454 MI.eraseFromParent(); // The instruction is gone now.
11455 return BB;
11456 }
11457
11458 // Expand the pseudo op to a loop.
11459 // thisMBB:
11460 // ...
11461 // movw varEnd, # --> with thumb2
11462 // movt varEnd, #
11463 // ldrcp varEnd, idx --> without thumb2
11464 // fallthrough --> loopMBB
11465 // loopMBB:
11466 // PHI varPhi, varEnd, varLoop
11467 // PHI srcPhi, src, srcLoop
11468 // PHI destPhi, dst, destLoop
11469 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11470 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11471 // subs varLoop, varPhi, #UnitSize
11472 // bne loopMBB
11473 // fallthrough --> exitMBB
11474 // exitMBB:
11475 // epilogue to handle left-over bytes
11476 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11477 // [destOut] = STRB_POST(scratch, destLoop, 1)
11478 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11479 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11480 MF->insert(It, loopMBB);
11481 MF->insert(It, exitMBB);
11482
11483 // Set the call frame size on entry to the new basic blocks.
11484 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11485 loopMBB->setCallFrameSize(CallFrameSize);
11486 exitMBB->setCallFrameSize(CallFrameSize);
11487
11488 // Transfer the remainder of BB and its successor edges to exitMBB.
11489 exitMBB->splice(exitMBB->begin(), BB,
11490 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11492
11493 // Load an immediate to varEnd.
11494 Register varEnd = MRI.createVirtualRegister(TRC);
11495 if (Subtarget->useMovt()) {
11496 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11497 varEnd)
11498 .addImm(LoopSize);
11499 } else if (Subtarget->genExecuteOnly()) {
// Execute-only code cannot read a constant pool; build the immediate
// with the Thumb movi32 pseudo instead.
11500 assert(IsThumb && "Non-thumb expected to have used movt");
11501 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11502 } else {
11503 MachineConstantPool *ConstantPool = MF->getConstantPool();
11505 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11506
11507 // MachineConstantPool wants an explicit alignment.
11508 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11509 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11510 MachineMemOperand *CPMMO =
11513
11514 if (IsThumb)
11515 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11516 .addReg(varEnd, RegState::Define)
11519 .addMemOperand(CPMMO);
11520 else
11521 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11522 .addReg(varEnd, RegState::Define)
11524 .addImm(0)
11526 .addMemOperand(CPMMO);
11527 }
11528 BB->addSuccessor(loopMBB);
11529
11530 // Generate the loop body:
11531 // varPhi = PHI(varLoop, varEnd)
11532 // srcPhi = PHI(srcLoop, src)
11533 // destPhi = PHI(destLoop, dst)
11534 MachineBasicBlock *entryBB = BB;
11535 BB = loopMBB;
11536 Register varLoop = MRI.createVirtualRegister(TRC);
11537 Register varPhi = MRI.createVirtualRegister(TRC);
11538 Register srcLoop = MRI.createVirtualRegister(TRC);
11539 Register srcPhi = MRI.createVirtualRegister(TRC);
11540 Register destLoop = MRI.createVirtualRegister(TRC);
11541 Register destPhi = MRI.createVirtualRegister(TRC);
11542
11543 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11544 .addReg(varLoop).addMBB(loopMBB)
11545 .addReg(varEnd).addMBB(entryBB);
11546 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11547 .addReg(srcLoop).addMBB(loopMBB)
11548 .addReg(src).addMBB(entryBB);
11549 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11550 .addReg(destLoop).addMBB(loopMBB)
11551 .addReg(dest).addMBB(entryBB);
11552
11553 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11554 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11555 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11556 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11557 IsThumb1, IsThumb2);
11558 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11559 IsThumb1, IsThumb2);
11560
11561 // Decrement loop variable by UnitSize.
11562 if (IsThumb1) {
11563 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11564 .add(t1CondCodeOp())
11565 .addReg(varPhi)
11566 .addImm(UnitSize)
11568 } else {
11569 MachineInstrBuilder MIB =
11570 BuildMI(*BB, BB->end(), dl,
11571 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11572 MIB.addReg(varPhi)
11573 .addImm(UnitSize)
11575 .add(condCodeOp());
// Rewrite the optional CPSR operand into a real def so the 'subs' sets
// flags for the conditional branch below.
11576 MIB->getOperand(5).setReg(ARM::CPSR);
11577 MIB->getOperand(5).setIsDef(true);
11578 }
11579 BuildMI(*BB, BB->end(), dl,
11580 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11581 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11582
11583 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11584 BB->addSuccessor(loopMBB);
11585 BB->addSuccessor(exitMBB);
11586
11587 // Add epilogue to handle BytesLeft.
11588 BB = exitMBB;
11589 auto StartOfExit = exitMBB->begin();
11590
11591 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11592 // [destOut] = STRB_POST(scratch, destLoop, 1)
11593 unsigned srcIn = srcLoop;
11594 unsigned destIn = destLoop;
11595 for (unsigned i = 0; i < BytesLeft; i++) {
11596 Register srcOut = MRI.createVirtualRegister(TRC);
11597 Register destOut = MRI.createVirtualRegister(TRC);
11598 Register scratch = MRI.createVirtualRegister(TRC);
11599 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11600 IsThumb1, IsThumb2);
11601 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11602 IsThumb1, IsThumb2);
11603 srcIn = srcOut;
11604 destIn = destOut;
11605 }
11606
11607 MI.eraseFromParent(); // The instruction is gone now.
11608 return BB;
11609}
11610
/// Lower the Windows-on-ARM __chkstk pseudo: call the stack-probe libcall
/// (word count in R4, byte adjustment returned in R4) and then subtract R4
/// from SP.
11612 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11613 MachineBasicBlock *MBB) const {
11614 const TargetMachine &TM = getTargetMachine();
11615 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11616 DebugLoc DL = MI.getDebugLoc();
11617
11618 assert(TM.getTargetTriple().isOSWindows() &&
11619 "__chkstk is only supported on Windows");
11620 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11621
11622 // __chkstk takes the number of words to allocate on the stack in R4, and
11623 // returns the stack adjustment in number of bytes in R4. This will not
11624 // clobber any other registers (other than the obvious lr).
11625 //
11626 // Although, technically, IP should be considered a register which may be
11627 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11628 // thumb-2 environment, so there is no interworking required. As a result, we
11629 // do not expect a veneer to be emitted by the linker, clobbering IP.
11630 //
11631 // Each module receives its own copy of __chkstk, so no import thunk is
11632 // required, again, ensuring that IP is not clobbered.
11633 //
11634 // Finally, although some linkers may theoretically provide a trampoline for
11635 // out of range calls (which is quite common due to a 32M range limitation of
11636 // branches for Thumb), we can generate the long-call version via
11637 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11638 // IP.
11639
11640 RTLIB::LibcallImpl ChkStkLibcall = getLibcallImpl(RTLIB::STACK_PROBE);
11641 if (ChkStkLibcall == RTLIB::Unsupported)
11642 reportFatalUsageError("no available implementation of __chkstk");
11643
11644 const char *ChkStk = getLibcallImplName(ChkStkLibcall).data();
// Small/medium/kernel code models use a direct BL; the large model
// materializes the symbol address first and calls through a register.
11645 switch (TM.getCodeModel()) {
11646 case CodeModel::Tiny:
11647 llvm_unreachable("Tiny code model not available on ARM.");
11648 case CodeModel::Small:
11649 case CodeModel::Medium:
11650 case CodeModel::Kernel:
11651 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11653 .addExternalSymbol(ChkStk)
11656 .addReg(ARM::R12,
11658 .addReg(ARM::CPSR,
11660 break;
11661 case CodeModel::Large: {
11662 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11663 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11664
11665 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11666 .addExternalSymbol(ChkStk);
11672 .addReg(ARM::R12,
11674 .addReg(ARM::CPSR,
11676 break;
11677 }
11678 }
11679
// Apply the probe-validated adjustment: SP -= R4.
11680 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11681 .addReg(ARM::SP, RegState::Kill)
11682 .addReg(ARM::R4, RegState::Kill)
11685 .add(condCodeOp());
11686
11687 MI.eraseFromParent();
11688 return MBB;
11689}
11690
/// Lower the divide-by-zero check pseudo: compare operand 0 against zero and
/// conditionally branch to a new trap block that executes __brkdiv0; normal
/// execution continues in ContBB, which receives the rest of MBB.
11692 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11693 MachineBasicBlock *MBB) const {
11694 DebugLoc DL = MI.getDebugLoc();
11695 MachineFunction *MF = MBB->getParent();
11696 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11697
// Split MBB after MI: everything following the check moves into ContBB.
11698 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11699 MF->insert(++MBB->getIterator(), ContBB);
11700 ContBB->splice(ContBB->begin(), MBB,
11701 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11703 MBB->addSuccessor(ContBB);
11704
11705 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11706 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11707 MF->push_back(TrapBB);
11708 MBB->addSuccessor(TrapBB);
11709
// Compare the checked value (the divisor operand of the pseudo) with zero.
11710 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11711 .addReg(MI.getOperand(0).getReg())
11712 .addImm(0)
11714 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11715 .addMBB(TrapBB)
11717 .addReg(ARM::CPSR)
11718
11719 MI.eraseFromParent();
11720 return ContBB;
11721}
11722
11723 // The CPSR operand of SelectItr might be missing a kill marker
11724 // because there were multiple uses of CPSR, and ISel didn't know
11725 // which to mark. Figure out whether SelectItr should have had a
11726 // kill marker, and set it if it should. Returns true when a kill
11727 // flag was added (i.e. CPSR dies at SelectItr), false otherwise.
11730 const TargetRegisterInfo* TRI) {
11731 // Scan forward through BB for a use/def of CPSR.
11732 MachineBasicBlock::iterator miI(std::next(SelectItr));
11733 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11734 const MachineInstr& mi = *miI;
// A later reader means CPSR is still live — no kill flag.
11735 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11736 return false;
11737 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11738 break; // Should have kill-flag - update below.
11739 }
11740
11741 // If we hit the end of the block, check whether CPSR is live into a
11742 // successor.
11743 if (miI == BB->end()) {
11744 for (MachineBasicBlock *Succ : BB->successors())
11745 if (Succ->isLiveIn(ARM::CPSR))
11746 return false;
11747 }
11748
11749 // We found a def, or hit the end of the basic block and CPSR wasn't live
11750 // out. SelectMI should have a kill flag on CPSR.
11751 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11752 return true;
11753}
11754
11755 /// Adds logic in loop entry MBB to calculate loop iteration count and adds
11756 /// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
/// Returns the virtual register holding the total iteration count.
11758 MachineBasicBlock *TpLoopBody,
11759 MachineBasicBlock *TpExit, Register OpSizeReg,
11760 const TargetInstrInfo *TII, DebugLoc Dl,
11761 MachineRegisterInfo &MRI) {
11762 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
11763 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11764 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11765 .addUse(OpSizeReg)
11766 .addImm(15)
11768 .addReg(0);
11769
11770 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11771 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11772 .addUse(AddDestReg, RegState::Kill)
11773 .addImm(4)
11775 .addReg(0);
11776
// The iteration count lives in the LR-capable class required by the
// low-overhead-loop pseudos.
11777 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11778 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11779 .addUse(LsrDestReg, RegState::Kill);
11780
// t2WhileLoopStart consumes the count and carries TpExit as its target
// block operand; t2B falls through to the loop body.
11781 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11782 .addUse(TotalIterationsReg)
11783 .addMBB(TpExit);
11784
11785 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11786 .addMBB(TpLoopBody)
11788
11789 return TotalIterationsReg;
11790}
11791
11792/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11793/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11794/// loops.
11795static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11796 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11797 const TargetInstrInfo *TII, DebugLoc Dl,
11798 MachineRegisterInfo &MRI, Register OpSrcReg,
11799 Register OpDestReg, Register ElementCountReg,
11800 Register TotalIterationsReg, bool IsMemcpy) {
11801 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11802 // array, loop iteration counter, predication counter.
11803
11804 Register SrcPhiReg, CurrSrcReg;
11805 if (IsMemcpy) {
11806 // Current position in the src array
11807 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11808 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11809 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11810 .addUse(OpSrcReg)
11811 .addMBB(TpEntry)
11812 .addUse(CurrSrcReg)
11813 .addMBB(TpLoopBody);
11814 }
11815
11816 // Current position in the dest array
11817 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11818 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11819 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11820 .addUse(OpDestReg)
11821 .addMBB(TpEntry)
11822 .addUse(CurrDestReg)
11823 .addMBB(TpLoopBody);
11824
11825 // Current loop counter
11826 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11827 Register RemainingLoopIterationsReg =
11828 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11829 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11830 .addUse(TotalIterationsReg)
11831 .addMBB(TpEntry)
11832 .addUse(RemainingLoopIterationsReg)
11833 .addMBB(TpLoopBody);
11834
11835 // Predication counter
11836 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11837 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11838 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11839 .addUse(ElementCountReg)
11840 .addMBB(TpEntry)
11841 .addUse(RemainingElementsReg)
11842 .addMBB(TpLoopBody);
11843
11844 // Pass predication counter to VCTP
11845 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11846 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11847 .addUse(PredCounterPhiReg)
11849 .addReg(0)
11850 .addReg(0);
11851
11852 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11853 .addUse(PredCounterPhiReg)
11854 .addImm(16)
11856 .addReg(0);
11857
11858 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11859 Register SrcValueReg;
11860 if (IsMemcpy) {
11861 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11862 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11863 .addDef(CurrSrcReg)
11864 .addDef(SrcValueReg)
11865 .addReg(SrcPhiReg)
11866 .addImm(16)
11868 .addUse(VccrReg)
11869 .addReg(0);
11870 } else
11871 SrcValueReg = OpSrcReg;
11872
11873 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11874 .addDef(CurrDestReg)
11875 .addUse(SrcValueReg)
11876 .addReg(DestPhiReg)
11877 .addImm(16)
11879 .addUse(VccrReg)
11880 .addReg(0);
11881
11882 // Add the pseudoInstrs for decrementing the loop counter and marking the
11883 // end:t2DoLoopDec and t2DoLoopEnd
11884 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11885 .addUse(LoopCounterPhiReg)
11886 .addImm(1);
11887
11888 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11889 .addUse(RemainingLoopIterationsReg)
11890 .addMBB(TpLoopBody);
11891
11892 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11893 .addMBB(TpExit)
11895}
11896
11898 // KCFI is supported in all ARM/Thumb modes
11899 return true;
11900}
11901
11905 const TargetInstrInfo *TII) const {
11906 assert(MBBI->isCall() && MBBI->getCFIType() &&
11907 "Invalid call instruction for a KCFI check");
11908
11909 MachineOperand *TargetOp = nullptr;
11910 switch (MBBI->getOpcode()) {
11911 // ARM mode opcodes
11912 case ARM::BLX:
11913 case ARM::BLX_pred:
11914 case ARM::BLX_noip:
11915 case ARM::BLX_pred_noip:
11916 case ARM::BX_CALL:
11917 TargetOp = &MBBI->getOperand(0);
11918 break;
11919 case ARM::TCRETURNri:
11920 case ARM::TCRETURNrinotr12:
11921 case ARM::TAILJMPr:
11922 case ARM::TAILJMPr4:
11923 TargetOp = &MBBI->getOperand(0);
11924 break;
11925 // Thumb mode opcodes (Thumb1 and Thumb2)
11926 // Note: Most Thumb call instructions have predicate operands before the
11927 // target register Format: tBLXr pred, predreg, target_register, ...
11928 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
11929 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
11930 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
11931 TargetOp = &MBBI->getOperand(2);
11932 break;
11933 // Tail call instructions don't have predicates, target is operand 0
11934 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
11935 TargetOp = &MBBI->getOperand(0);
11936 break;
11937 default:
11938 llvm_unreachable("Unexpected CFI call opcode");
11939 }
11940
11941 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
11942 TargetOp->setIsRenamable(false);
11943
11944 // Select the appropriate KCFI_CHECK variant based on the instruction set
11945 unsigned KCFICheckOpcode;
11946 if (Subtarget->isThumb()) {
11947 if (Subtarget->isThumb2()) {
11948 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
11949 } else {
11950 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
11951 }
11952 } else {
11953 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
11954 }
11955
11956 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
11957 .addReg(TargetOp->getReg())
11958 .addImm(MBBI->getCFIType())
11959 .getInstr();
11960}
11961
11964 MachineBasicBlock *BB) const {
11965 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11966 DebugLoc dl = MI.getDebugLoc();
11967 bool isThumb2 = Subtarget->isThumb2();
11968 switch (MI.getOpcode()) {
11969 default: {
11970 MI.print(errs());
11971 llvm_unreachable("Unexpected instr type to insert");
11972 }
11973
11974 // Thumb1 post-indexed loads are really just single-register LDMs.
11975 case ARM::tLDR_postidx: {
11976 MachineOperand Def(MI.getOperand(1));
11977 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11978 .add(Def) // Rn_wb
11979 .add(MI.getOperand(2)) // Rn
11980 .add(MI.getOperand(3)) // PredImm
11981 .add(MI.getOperand(4)) // PredReg
11982 .add(MI.getOperand(0)) // Rt
11983 .cloneMemRefs(MI);
11984 MI.eraseFromParent();
11985 return BB;
11986 }
11987
11988 case ARM::MVE_MEMCPYLOOPINST:
11989 case ARM::MVE_MEMSETLOOPINST: {
11990
11991 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
11992 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
11993 // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and
11994 // adds the relevant instructions in the TP loop Body for generation of a
11995 // WLSTP loop.
11996
11997 // Below is relevant portion of the CFG after the transformation.
11998 // The Machine Basic Blocks are shown along with branch conditions (in
11999 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12000 // portion of the CFG and may not necessarily be the entry/exit of the
12001 // function.
12002
12003 // (Relevant) CFG after transformation:
12004 // TP entry MBB
12005 // |
12006 // |-----------------|
12007 // (n <= 0) (n > 0)
12008 // | |
12009 // | TP loop Body MBB<--|
12010 // | | |
12011 // \ |___________|
12012 // \ /
12013 // TP exit MBB
12014
12015 MachineFunction *MF = BB->getParent();
12016 MachineFunctionProperties &Properties = MF->getProperties();
12017 MachineRegisterInfo &MRI = MF->getRegInfo();
12018
12019 Register OpDestReg = MI.getOperand(0).getReg();
12020 Register OpSrcReg = MI.getOperand(1).getReg();
12021 Register OpSizeReg = MI.getOperand(2).getReg();
12022
12023 // Allocate the required MBBs and add to parent function.
12024 MachineBasicBlock *TpEntry = BB;
12025 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12026 MachineBasicBlock *TpExit;
12027
12028 MF->push_back(TpLoopBody);
12029
12030 // If any instructions are present in the current block after
12031 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12032 // move the instructions into the newly created exit block. If there are no
12033 // instructions add an explicit branch to the FallThrough block and then
12034 // split.
12035 //
12036 // The split is required for two reasons:
12037 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
12038 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12039 // need to be updated. splitAt() already handles this.
12040 TpExit = BB->splitAt(MI, false);
12041 if (TpExit == BB) {
12042 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12043 "block containing memcpy/memset Pseudo");
12044 TpExit = BB->getFallThrough();
12045 BuildMI(BB, dl, TII->get(ARM::t2B))
12046 .addMBB(TpExit)
12048 TpExit = BB->splitAt(MI, false);
12049 }
12050
12051 // Add logic for iteration count
12052 Register TotalIterationsReg =
12053 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12054
12055 // Add the vectorized (and predicated) loads/store instructions
12056 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12057 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12058 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12059
12060 // Required to avoid conflict with the MachineVerifier during testing.
12061 Properties.resetNoPHIs();
12062
12063 // Connect the blocks
12064 TpEntry->addSuccessor(TpLoopBody);
12065 TpLoopBody->addSuccessor(TpLoopBody);
12066 TpLoopBody->addSuccessor(TpExit);
12067
12068 // Reorder for a more natural layout
12069 TpLoopBody->moveAfter(TpEntry);
12070 TpExit->moveAfter(TpLoopBody);
12071
12072 // Finally, remove the memcpy Pseudo Instruction
12073 MI.eraseFromParent();
12074
12075 // Return the exit block as it may contain other instructions requiring a
12076 // custom inserter
12077 return TpExit;
12078 }
12079
12080 // The Thumb2 pre-indexed stores have the same MI operands, they just
12081 // define them differently in the .td files from the isel patterns, so
12082 // they need pseudos.
12083 case ARM::t2STR_preidx:
12084 MI.setDesc(TII->get(ARM::t2STR_PRE));
12085 return BB;
12086 case ARM::t2STRB_preidx:
12087 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12088 return BB;
12089 case ARM::t2STRH_preidx:
12090 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12091 return BB;
12092
12093 case ARM::STRi_preidx:
12094 case ARM::STRBi_preidx: {
12095 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12096 : ARM::STRB_PRE_IMM;
12097 // Decode the offset.
12098 unsigned Offset = MI.getOperand(4).getImm();
12099 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12101 if (isSub)
12102 Offset = -Offset;
12103
12104 MachineMemOperand *MMO = *MI.memoperands_begin();
12105 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12106 .add(MI.getOperand(0)) // Rn_wb
12107 .add(MI.getOperand(1)) // Rt
12108 .add(MI.getOperand(2)) // Rn
12109 .addImm(Offset) // offset (skip GPR==zero_reg)
12110 .add(MI.getOperand(5)) // pred
12111 .add(MI.getOperand(6))
12112 .addMemOperand(MMO);
12113 MI.eraseFromParent();
12114 return BB;
12115 }
12116 case ARM::STRr_preidx:
12117 case ARM::STRBr_preidx:
12118 case ARM::STRH_preidx: {
12119 unsigned NewOpc;
12120 switch (MI.getOpcode()) {
12121 default: llvm_unreachable("unexpected opcode!");
12122 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12123 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12124 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12125 }
12126 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12127 for (const MachineOperand &MO : MI.operands())
12128 MIB.add(MO);
12129 MI.eraseFromParent();
12130 return BB;
12131 }
12132
12133 case ARM::tMOVCCr_pseudo: {
12134 // To "insert" a SELECT_CC instruction, we actually have to insert the
12135 // diamond control-flow pattern. The incoming instruction knows the
12136 // destination vreg to set, the condition code register to branch on, the
12137 // true/false values to select between, and a branch opcode to use.
12138 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12140
12141 // thisMBB:
12142 // ...
12143 // TrueVal = ...
12144 // cmpTY ccX, r1, r2
12145 // bCC copy1MBB
12146 // fallthrough --> copy0MBB
12147 MachineBasicBlock *thisMBB = BB;
12148 MachineFunction *F = BB->getParent();
12149 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12150 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12151 F->insert(It, copy0MBB);
12152 F->insert(It, sinkMBB);
12153
12154 // Set the call frame size on entry to the new basic blocks.
12155 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12156 copy0MBB->setCallFrameSize(CallFrameSize);
12157 sinkMBB->setCallFrameSize(CallFrameSize);
12158
12159 // Check whether CPSR is live past the tMOVCCr_pseudo.
12160 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12161 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12162 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12163 copy0MBB->addLiveIn(ARM::CPSR);
12164 sinkMBB->addLiveIn(ARM::CPSR);
12165 }
12166
12167 // Transfer the remainder of BB and its successor edges to sinkMBB.
12168 sinkMBB->splice(sinkMBB->begin(), BB,
12169 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12171
12172 BB->addSuccessor(copy0MBB);
12173 BB->addSuccessor(sinkMBB);
12174
12175 BuildMI(BB, dl, TII->get(ARM::tBcc))
12176 .addMBB(sinkMBB)
12177 .addImm(MI.getOperand(3).getImm())
12178 .addReg(MI.getOperand(4).getReg());
12179
12180 // copy0MBB:
12181 // %FalseValue = ...
12182 // # fallthrough to sinkMBB
12183 BB = copy0MBB;
12184
12185 // Update machine-CFG edges
12186 BB->addSuccessor(sinkMBB);
12187
12188 // sinkMBB:
12189 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12190 // ...
12191 BB = sinkMBB;
12192 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12193 .addReg(MI.getOperand(1).getReg())
12194 .addMBB(copy0MBB)
12195 .addReg(MI.getOperand(2).getReg())
12196 .addMBB(thisMBB);
12197
12198 MI.eraseFromParent(); // The pseudo instruction is gone now.
12199 return BB;
12200 }
12201
12202 case ARM::BCCi64:
12203 case ARM::BCCZi64: {
12204 // If there is an unconditional branch to the other successor, remove it.
12205 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12206
12207 // Compare both parts that make up the double comparison separately for
12208 // equality.
12209 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12210
12211 Register LHS1 = MI.getOperand(1).getReg();
12212 Register LHS2 = MI.getOperand(2).getReg();
12213 if (RHSisZero) {
12214 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12215 .addReg(LHS1)
12216 .addImm(0)
12218 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12219 .addReg(LHS2).addImm(0)
12220 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12221 } else {
12222 Register RHS1 = MI.getOperand(3).getReg();
12223 Register RHS2 = MI.getOperand(4).getReg();
12224 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12225 .addReg(LHS1)
12226 .addReg(RHS1)
12228 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12229 .addReg(LHS2).addReg(RHS2)
12230 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12231 }
12232
12233 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12234 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12235 if (MI.getOperand(0).getImm() == ARMCC::NE)
12236 std::swap(destMBB, exitMBB);
12237
12238 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12239 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12240 if (isThumb2)
12241 BuildMI(BB, dl, TII->get(ARM::t2B))
12242 .addMBB(exitMBB)
12244 else
12245 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12246
12247 MI.eraseFromParent(); // The pseudo instruction is gone now.
12248 return BB;
12249 }
12250
12251 case ARM::Int_eh_sjlj_setjmp:
12252 case ARM::Int_eh_sjlj_setjmp_nofp:
12253 case ARM::tInt_eh_sjlj_setjmp:
12254 case ARM::t2Int_eh_sjlj_setjmp:
12255 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12256 return BB;
12257
12258 case ARM::Int_eh_sjlj_setup_dispatch:
12259 EmitSjLjDispatchBlock(MI, BB);
12260 return BB;
12261 case ARM::COPY_STRUCT_BYVAL_I32:
12262 ++NumLoopByVals;
12263 return EmitStructByval(MI, BB);
12264 case ARM::WIN__CHKSTK:
12265 return EmitLowered__chkstk(MI, BB);
12266 case ARM::WIN__DBZCHK:
12267 return EmitLowered__dbzchk(MI, BB);
12268 }
12269}
12270
12271/// Attaches vregs to MEMCPY that it will use as scratch registers
12272/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12273/// instead of as a custom inserter because we need the use list from the SDNode.
12274static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12275 MachineInstr &MI, const SDNode *Node) {
12276 bool isThumb1 = Subtarget->isThumb1Only();
12277
12278 MachineFunction *MF = MI.getParent()->getParent();
12279 MachineRegisterInfo &MRI = MF->getRegInfo();
12280 MachineInstrBuilder MIB(*MF, MI);
12281
12282 // If the new dst/src is unused mark it as dead.
12283 if (!Node->hasAnyUseOfValue(0)) {
12284 MI.getOperand(0).setIsDead(true);
12285 }
12286 if (!Node->hasAnyUseOfValue(1)) {
12287 MI.getOperand(1).setIsDead(true);
12288 }
12289
12290 // The MEMCPY both defines and kills the scratch registers.
12291 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12292 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12293 : &ARM::GPRRegClass);
12295 }
12296}
12297
12299 SDNode *Node) const {
12300 if (MI.getOpcode() == ARM::MEMCPY) {
12301 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12302 return;
12303 }
12304
12305 const MCInstrDesc *MCID = &MI.getDesc();
12306 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12307 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12308 // operand is still set to noreg. If needed, set the optional operand's
12309 // register to CPSR, and remove the redundant implicit def.
12310 //
12311 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12312
12313 // Rename pseudo opcodes.
12314 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12315 unsigned ccOutIdx;
12316 if (NewOpc) {
12317 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12318 MCID = &TII->get(NewOpc);
12319
12320 assert(MCID->getNumOperands() ==
12321 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12322 && "converted opcode should be the same except for cc_out"
12323 " (and, on Thumb1, pred)");
12324
12325 MI.setDesc(*MCID);
12326
12327 // Add the optional cc_out operand
12328 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12329
12330 // On Thumb1, move all input operands to the end, then add the predicate
12331 if (Subtarget->isThumb1Only()) {
12332 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12333 MI.addOperand(MI.getOperand(1));
12334 MI.removeOperand(1);
12335 }
12336
12337 // Restore the ties
12338 for (unsigned i = MI.getNumOperands(); i--;) {
12339 const MachineOperand& op = MI.getOperand(i);
12340 if (op.isReg() && op.isUse()) {
12341 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12342 if (DefIdx != -1)
12343 MI.tieOperands(DefIdx, i);
12344 }
12345 }
12346
12348 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12349 ccOutIdx = 1;
12350 } else
12351 ccOutIdx = MCID->getNumOperands() - 1;
12352 } else
12353 ccOutIdx = MCID->getNumOperands() - 1;
12354
12355 // Any ARM instruction that sets the 's' bit should specify an optional
12356 // "cc_out" operand in the last operand position.
12357 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12358 assert(!NewOpc && "Optional cc_out operand required");
12359 return;
12360 }
12361 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12362 // since we already have an optional CPSR def.
12363 bool definesCPSR = false;
12364 bool deadCPSR = false;
12365 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12366 ++i) {
12367 const MachineOperand &MO = MI.getOperand(i);
12368 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12369 definesCPSR = true;
12370 if (MO.isDead())
12371 deadCPSR = true;
12372 MI.removeOperand(i);
12373 break;
12374 }
12375 }
12376 if (!definesCPSR) {
12377 assert(!NewOpc && "Optional cc_out operand required");
12378 return;
12379 }
12380 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12381 if (deadCPSR) {
12382 assert(!MI.getOperand(ccOutIdx).getReg() &&
12383 "expect uninitialized optional cc_out operand");
12384 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12385 if (!Subtarget->isThumb1Only())
12386 return;
12387 }
12388
12389 // If this instruction was defined with an optional CPSR def and its dag node
12390 // had a live implicit CPSR def, then activate the optional CPSR def.
12391 MachineOperand &MO = MI.getOperand(ccOutIdx);
12392 MO.setReg(ARM::CPSR);
12393 MO.setIsDef(true);
12394}
12395
12396//===----------------------------------------------------------------------===//
12397// ARM Optimization Hooks
12398//===----------------------------------------------------------------------===//
12399
12400// Helper function that checks if N is a null or all ones constant.
12401static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12403}
12404
12405// Return true if N is conditionally 0 or all ones.
12406// Detects these expressions where cc is an i1 value:
12407//
12408// (select cc 0, y) [AllOnes=0]
12409// (select cc y, 0) [AllOnes=0]
12410// (zext cc) [AllOnes=0]
12411// (sext cc) [AllOnes=0/1]
12412// (select cc -1, y) [AllOnes=1]
12413// (select cc y, -1) [AllOnes=1]
12414//
12415// Invert is set when N is the null/all ones constant when CC is false.
12416// OtherOp is set to the alternative value of N.
12418 SDValue &CC, bool &Invert,
12419 SDValue &OtherOp,
12420 SelectionDAG &DAG) {
12421 switch (N->getOpcode()) {
12422 default: return false;
12423 case ISD::SELECT: {
12424 CC = N->getOperand(0);
12425 SDValue N1 = N->getOperand(1);
12426 SDValue N2 = N->getOperand(2);
12427 if (isZeroOrAllOnes(N1, AllOnes)) {
12428 Invert = false;
12429 OtherOp = N2;
12430 return true;
12431 }
12432 if (isZeroOrAllOnes(N2, AllOnes)) {
12433 Invert = true;
12434 OtherOp = N1;
12435 return true;
12436 }
12437 return false;
12438 }
12439 case ISD::ZERO_EXTEND:
12440 // (zext cc) can never be the all ones value.
12441 if (AllOnes)
12442 return false;
12443 [[fallthrough]];
12444 case ISD::SIGN_EXTEND: {
12445 SDLoc dl(N);
12446 EVT VT = N->getValueType(0);
12447 CC = N->getOperand(0);
12448 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12449 return false;
12450 Invert = !AllOnes;
12451 if (AllOnes)
12452 // When looking for an AllOnes constant, N is an sext, and the 'other'
12453 // value is 0.
12454 OtherOp = DAG.getConstant(0, dl, VT);
12455 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12456 // When looking for a 0 constant, N can be zext or sext.
12457 OtherOp = DAG.getConstant(1, dl, VT);
12458 else
12459 OtherOp = DAG.getAllOnesConstant(dl, VT);
12460 return true;
12461 }
12462 }
12463}
12464
12465// Combine a constant select operand into its use:
12466//
12467// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12468// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12469// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12470// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12471// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12472//
12473// The transform is rejected if the select doesn't have a constant operand that
12474// is null, or all ones when AllOnes is set.
12475//
12476// Also recognize sext/zext from i1:
12477//
12478// (add (zext cc), x) -> (select cc (add x, 1), x)
12479// (add (sext cc), x) -> (select cc (add x, -1), x)
12480//
12481// These transformations eventually create predicated instructions.
12482//
12483// @param N The node to transform.
12484// @param Slct The N operand that is a select.
12485// @param OtherOp The other N operand (x above).
12486// @param DCI Context.
12487// @param AllOnes Require the select constant to be all ones instead of null.
12488// @returns The new node, or SDValue() on failure.
12489static
12492 bool AllOnes = false) {
12493 SelectionDAG &DAG = DCI.DAG;
12494 EVT VT = N->getValueType(0);
12495 SDValue NonConstantVal;
12496 SDValue CCOp;
12497 bool SwapSelectOps;
12498 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12499 NonConstantVal, DAG))
12500 return SDValue();
12501
12502 // Slct is now know to be the desired identity constant when CC is true.
12503 SDValue TrueVal = OtherOp;
12504 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12505 OtherOp, NonConstantVal);
12506 // Unless SwapSelectOps says CC should be false.
12507 if (SwapSelectOps)
12508 std::swap(TrueVal, FalseVal);
12509
12510 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12511 CCOp, TrueVal, FalseVal);
12512}
12513
12514// Attempt combineSelectAndUse on each operand of a commutative operator N.
12515static
12518 SDValue N0 = N->getOperand(0);
12519 SDValue N1 = N->getOperand(1);
12520 if (N0.getNode()->hasOneUse())
12521 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12522 return Result;
12523 if (N1.getNode()->hasOneUse())
12524 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12525 return Result;
12526 return SDValue();
12527}
12528
12530 // VUZP shuffle node.
12531 if (N->getOpcode() == ARMISD::VUZP)
12532 return true;
12533
12534 // "VUZP" on i32 is an alias for VTRN.
12535 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12536 return true;
12537
12538 return false;
12539}
12540
12543 const ARMSubtarget *Subtarget) {
12544 // Look for ADD(VUZP.0, VUZP.1).
12545 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12546 N0 == N1)
12547 return SDValue();
12548
12549 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12550 if (!N->getValueType(0).is64BitVector())
12551 return SDValue();
12552
12553 // Generate vpadd.
12554 SelectionDAG &DAG = DCI.DAG;
12555 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12556 SDLoc dl(N);
12557 SDNode *Unzip = N0.getNode();
12558 EVT VT = N->getValueType(0);
12559
12561 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12562 TLI.getPointerTy(DAG.getDataLayout())));
12563 Ops.push_back(Unzip->getOperand(0));
12564 Ops.push_back(Unzip->getOperand(1));
12565
12566 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12567}
12568
12571 const ARMSubtarget *Subtarget) {
12572 // Check for two extended operands.
12573 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12574 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12575 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12576 N1.getOpcode() == ISD::ZERO_EXTEND))
12577 return SDValue();
12578
12579 SDValue N00 = N0.getOperand(0);
12580 SDValue N10 = N1.getOperand(0);
12581
12582 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12583 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12584 N00 == N10)
12585 return SDValue();
12586
12587 // We only recognize Q register paddl here; this can't be reached until
12588 // after type legalization.
12589 if (!N00.getValueType().is64BitVector() ||
12591 return SDValue();
12592
12593 // Generate vpaddl.
12594 SelectionDAG &DAG = DCI.DAG;
12595 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12596 SDLoc dl(N);
12597 EVT VT = N->getValueType(0);
12598
12600 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12601 unsigned Opcode;
12602 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12603 Opcode = Intrinsic::arm_neon_vpaddls;
12604 else
12605 Opcode = Intrinsic::arm_neon_vpaddlu;
12606 Ops.push_back(DAG.getConstant(Opcode, dl,
12607 TLI.getPointerTy(DAG.getDataLayout())));
12608 EVT ElemTy = N00.getValueType().getVectorElementType();
12609 unsigned NumElts = VT.getVectorNumElements();
12610 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12611 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12612 N00.getOperand(0), N00.getOperand(1));
12613 Ops.push_back(Concat);
12614
12615 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12616}
12617
12618// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12619// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12620// much easier to match.
12621static SDValue
12624 const ARMSubtarget *Subtarget) {
12625 // Only perform optimization if after legalize, and if NEON is available. We
12626 // also expected both operands to be BUILD_VECTORs.
12627 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12628 || N0.getOpcode() != ISD::BUILD_VECTOR
12629 || N1.getOpcode() != ISD::BUILD_VECTOR)
12630 return SDValue();
12631
12632 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12633 EVT VT = N->getValueType(0);
12634 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12635 return SDValue();
12636
12637 // Check that the vector operands are of the right form.
12638 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12639 // operands, where N is the size of the formed vector.
12640 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12641 // index such that we have a pair wise add pattern.
12642
12643 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12645 return SDValue();
12646 SDValue Vec = N0->getOperand(0)->getOperand(0);
12647 SDNode *V = Vec.getNode();
12648 unsigned nextIndex = 0;
12649
12650 // For each operands to the ADD which are BUILD_VECTORs,
12651 // check to see if each of their operands are an EXTRACT_VECTOR with
12652 // the same vector and appropriate index.
12653 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12656
12657 SDValue ExtVec0 = N0->getOperand(i);
12658 SDValue ExtVec1 = N1->getOperand(i);
12659
12660 // First operand is the vector, verify its the same.
12661 if (V != ExtVec0->getOperand(0).getNode() ||
12662 V != ExtVec1->getOperand(0).getNode())
12663 return SDValue();
12664
12665 // Second is the constant, verify its correct.
12668
12669 // For the constant, we want to see all the even or all the odd.
12670 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12671 || C1->getZExtValue() != nextIndex+1)
12672 return SDValue();
12673
12674 // Increment index.
12675 nextIndex+=2;
12676 } else
12677 return SDValue();
12678 }
12679
12680 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12681 // we're using the entire input vector, otherwise there's a size/legality
12682 // mismatch somewhere.
12683 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12685 return SDValue();
12686
12687 // Create VPADDL node.
12688 SelectionDAG &DAG = DCI.DAG;
12689 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12690
12691 SDLoc dl(N);
12692
12693 // Build operand list.
12695 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12696 TLI.getPointerTy(DAG.getDataLayout())));
12697
12698 // Input is the vector.
12699 Ops.push_back(Vec);
12700
12701 // Get widened type and narrowed type.
12702 MVT widenType;
12703 unsigned numElem = VT.getVectorNumElements();
12704
12705 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12706 switch (inputLaneType.getSimpleVT().SimpleTy) {
12707 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12708 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12709 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12710 default:
12711 llvm_unreachable("Invalid vector element type for padd optimization.");
12712 }
12713
12714 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12715 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12716 return DAG.getNode(ExtOp, dl, VT, tmp);
12717}
12718
12720 if (V->getOpcode() == ISD::UMUL_LOHI ||
12721 V->getOpcode() == ISD::SMUL_LOHI)
12722 return V;
12723 return SDValue();
12724}
12725
12726static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
// NOTE(review): the `TargetLowering::DAGCombinerInfo &DCI,` parameter line
// (original line 12727) is missing from this extraction; DCI is used below
// to reach the SelectionDAG.
12728 const ARMSubtarget *Subtarget) {
12729 if (!Subtarget->hasBaseDSP())
12730 return SDValue();
12731
12732 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12733 // accumulates the product into a 64-bit value. The 16-bit values will
12734 // be sign extended somehow or SRA'd into 32-bit values
12735 // (addc (adde (mul 16bit, 16bit), lo), hi)
// The MUL may appear as either operand of the ADDC; try both orders.
12736 SDValue Mul = AddcNode->getOperand(0);
12737 SDValue Lo = AddcNode->getOperand(1);
12738 if (Mul.getOpcode() != ISD::MUL) {
12739 Lo = AddcNode->getOperand(0);
12740 Mul = AddcNode->getOperand(1);
12741 if (Mul.getOpcode() != ISD::MUL)
12742 return SDValue();
12743 }
12744
// The high half of the 64-bit product must come in as (sra Mul, 31) — the
// sign-extension of the 32-bit multiply result. It too may be either
// operand of the ADDE.
12745 SDValue SRA = AddeNode->getOperand(0);
12746 SDValue Hi = AddeNode->getOperand(1);
12747 if (SRA.getOpcode() != ISD::SRA) {
12748 SRA = AddeNode->getOperand(1);
12749 Hi = AddeNode->getOperand(0);
12750 if (SRA.getOpcode() != ISD::SRA)
12751 return SDValue();
12752 }
// Only an arithmetic shift by exactly 31 reproduces the sign bits.
12753 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12754 if (Const->getZExtValue() != 31)
12755 return SDValue();
12756 } else
12757 return SDValue();
12758
// The shifted value must be the same multiply feeding the ADDC.
12759 if (SRA.getOperand(0) != Mul)
12760 return SDValue();
12761
12762 SelectionDAG &DAG = DCI.DAG;
12763 SDLoc dl(AddcNode);
12764 unsigned Opcode = 0;
12765 SDValue Op0;
12766 SDValue Op1;
12767
// Classify each multiply operand as a bottom half (already sign-extended to
// 16 bits, isS16) or a top half (sra by 16, isSRA16) and pick the matching
// SMLALxy variant; for the top-half case the shift is peeled off so the
// instruction receives the raw 32-bit register.
12768 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12769 Opcode = ARMISD::SMLALBB;
12770 Op0 = Mul.getOperand(0);
12771 Op1 = Mul.getOperand(1);
12772 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12773 Opcode = ARMISD::SMLALBT;
12774 Op0 = Mul.getOperand(0);
12775 Op1 = Mul.getOperand(1).getOperand(0);
12776 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12777 Opcode = ARMISD::SMLALTB;
12778 Op0 = Mul.getOperand(0).getOperand(0);
12779 Op1 = Mul.getOperand(1);
12780 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12781 Opcode = ARMISD::SMLALTT;
12782 Op0 = Mul->getOperand(0).getOperand(0);
12783 Op1 = Mul->getOperand(1).getOperand(0);
12784 }
12785
// If no variant matched, Op0/Op1 stay empty and we bail out.
12786 if (!Op0 || !Op1)
12787 return SDValue();
12788
12789 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12790 Op0, Op1, Lo, Hi);
12791 // Replace the ADDs' nodes uses by the MLA node's values.
12792 SDValue HiMLALResult(SMLAL.getNode(), 1);
12793 SDValue LoMLALResult(SMLAL.getNode(), 0);
12794
12795 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12796 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12797
12798 // Return original node to notify the driver to stop replacing.
12799 SDValue resNode(AddcNode, 0);
12800 return resNode;
12801}
12802
// NOTE(review): the signature lines for this function (original lines
// 12803-12804, `static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
// TargetLowering::DAGCombinerInfo &DCI,`) are missing from this extraction.
12805 const ARMSubtarget *Subtarget) {
12806 // Look for multiply add opportunities.
12807 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
12808 // each add nodes consumes a value from ISD::UMUL_LOHI and there is
12809 // a glue link from the first add to the second add.
12810 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12811 // a S/UMLAL instruction.
12812 // UMUL_LOHI
12813 // / :lo \ :hi
12814 // V \ [no multiline comment]
12815 // loAdd -> ADDC |
12816 // \ :carry /
12817 // V V
12818 // ADDE <- hiAdd
12819 //
12820 // In the special case where only the higher part of a signed result is used
12821 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12822 // a constant with the exact value of 0x80000000, we recognize we are dealing
12823 // with a "rounded multiply and add" (or subtract) and transform it into
12824 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12825
12826 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12827 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12828 "Expect an ADDE or SUBE");
12829
12830 assert(AddeSubeNode->getNumOperands() == 3 &&
12831 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12832 "ADDE node has the wrong inputs");
12833
12834 // Check that we are chained to the right ADDC or SUBC node.
// The carry operand (operand 2) must come from a matching ADDC/SUBC: ADDE
// pairs with ADDC, SUBE with SUBC.
12835 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12836 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12837 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12838 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12839 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12840 return SDValue();
12841
12842 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12843 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12844
12845 // Check if the two operands are from the same mul_lohi node.
12846 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12847 return SDValue();
12848
12849 assert(AddcSubcNode->getNumValues() == 2 &&
12850 AddcSubcNode->getValueType(0) == MVT::i32 &&
12851 "Expect ADDC with two result values. First: i32");
12852
12853 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12854 // maybe a SMLAL which multiplies two 16-bit values.
12855 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12856 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12857 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12858 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12859 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12860 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12861
12862 // Check for the triangle shape.
12863 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12864 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12865
12866 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12867 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12868 return SDValue();
12869
12870 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12871 bool IsLeftOperandMUL = false;
12872 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12873 if (MULOp == SDValue())
12874 MULOp = findMUL_LOHI(AddeSubeOp1);
12875 else
12876 IsLeftOperandMUL = true;
12877 if (MULOp == SDValue())
12878 return SDValue();
12879
12880 // Figure out the right opcode.
12881 unsigned Opc = MULOp->getOpcode();
12882 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12883
12884 // Figure out the high and low input values to the MLAL node.
12885 SDValue *HiAddSub = nullptr;
12886 SDValue *LoMul = nullptr;
12887 SDValue *LowAddSub = nullptr;
12888
12889 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12890 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12891 return SDValue();
12892
// Whichever ADDE/SUBE operand is NOT the MUL is the high-part addend.
12893 if (IsLeftOperandMUL)
12894 HiAddSub = &AddeSubeOp1;
12895 else
12896 HiAddSub = &AddeSubeOp0;
12897
12898 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12899 // whose low result is fed to the ADDC/SUBC we are checking.
12900
12901 if (AddcSubcOp0 == MULOp.getValue(0)) {
12902 LoMul = &AddcSubcOp0;
12903 LowAddSub = &AddcSubcOp1;
12904 }
12905 if (AddcSubcOp1 == MULOp.getValue(0)) {
12906 LoMul = &AddcSubcOp1;
12907 LowAddSub = &AddcSubcOp0;
12908 }
12909
12910 if (!LoMul)
12911 return SDValue();
12912
12913 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12914 // the replacement below will create a cycle.
12915 if (AddcSubcNode == HiAddSub->getNode() ||
12916 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12917 return SDValue();
12918
12919 // Create the merged node.
12920 SelectionDAG &DAG = DCI.DAG;
12921
12922 // Start building operand list.
// NOTE(review): the `SmallVector<SDValue, 8> Ops;` declaration (original
// line 12923) is missing from this extraction.
12924 Ops.push_back(LoMul->getOperand(0));
12925 Ops.push_back(LoMul->getOperand(1));
12926
12927 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12928 // the case, we must be doing signed multiplication and only use the higher
12929 // part of the result of the MLAL, furthermore the LowAddSub must be a constant
12930 // addition or subtraction with the value of 0x800000.
12931 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12932 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12933 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12934 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12935 0x80000000) {
12936 Ops.push_back(*HiAddSub);
12937 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12938 FinalOpc = ARMISD::SMMLSR;
12939 } else {
12940 FinalOpc = ARMISD::SMMLAR;
12941 }
12942 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12943 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12944
12945 return SDValue(AddeSubeNode, 0);
12946 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12947 // SMMLS is generated during instruction selection and the rest of this
12948 // function can not handle the case where AddcSubcNode is a SUBC.
12949 return SDValue();
12950
12951 // Finish building the operand list for {U/S}MLAL
12952 Ops.push_back(*LowAddSub);
12953 Ops.push_back(*HiAddSub);
12954
12955 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12956 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12957
12958 // Replace the ADDs' nodes uses by the MLA node's values.
12959 SDValue HiMLALResult(MLALNode.getNode(), 1);
12960 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12961
12962 SDValue LoMLALResult(MLALNode.getNode(), 0);
12963 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12964
12965 // Return original node to notify the driver to stop replacing.
12966 return SDValue(AddeSubeNode, 0);
12967}
12968
// NOTE(review): the signature lines for this function (original lines
// 12969-12970, `static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
// TargetLowering::DAGCombinerInfo &DCI,`) are missing from this extraction.
12971 const ARMSubtarget *Subtarget) {
12972 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12973 // While trying to combine for the other MLAL nodes, first search for the
12974 // chance to use UMAAL. Check if Addc uses a node which has already
12975 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12976 // as the addend, and it's handled in PerformUMLALCombine.
12977
// UMAAL requires DSP; otherwise fall back to the generic MLAL combine.
12978 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12979 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12980
12981 // Check that we have a glued ADDC node.
12982 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12983 if (AddcNode->getOpcode() != ARMISD::ADDC)
12984 return SDValue();
12985
12986 // Find the converted UMAAL or quit if it doesn't exist.
// The UMLAL may be either operand of the ADDC; the other operand is the
// extra high addend that turns UMLAL into UMAAL.
12987 SDNode *UmlalNode = nullptr;
12988 SDValue AddHi;
12989 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12990 UmlalNode = AddcNode->getOperand(0).getNode();
12991 AddHi = AddcNode->getOperand(1);
12992 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12993 UmlalNode = AddcNode->getOperand(1).getNode();
12994 AddHi = AddcNode->getOperand(0);
12995 } else {
12996 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12997 }
12998
12999 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13000 // the ADDC as well as Zero.
13001 if (!isNullConstant(UmlalNode->getOperand(3)))
13002 return SDValue();
13003
13004 if ((isNullConstant(AddeNode->getOperand(0)) &&
13005 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13006 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13007 isNullConstant(AddeNode->getOperand(1)))) {
13008 SelectionDAG &DAG = DCI.DAG;
13009 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13010 UmlalNode->getOperand(2), AddHi };
13011 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13012 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13013
13014 // Replace the ADDs' nodes uses by the UMAAL node's values.
13015 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13016 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13017
13018 // Return original node to notify the driver to stop replacing.
13019 return SDValue(AddeNode, 0);
13020 }
13021 return SDValue();
13022}
13023
// Folds a UMLAL whose accumulator comes from a zero-initialized ADDC/ADDE
// carry chain into a single ARMISD::UMAAL node.
// NOTE(review): the signature line (original line 13024, presumably
// `static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,`) is
// missing from this extraction.
13025 const ARMSubtarget *Subtarget) {
13026 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13027 return SDValue();
13028
13029 // Check that we have a pair of ADDC and ADDE as operands.
13030 // Both addends of the ADDE must be zero.
13031 SDNode* AddcNode = N->getOperand(2).getNode();
13032 SDNode* AddeNode = N->getOperand(3).getNode();
13033 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13034 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13035 isNullConstant(AddeNode->getOperand(0)) &&
13036 isNullConstant(AddeNode->getOperand(1)) &&
13037 (AddeNode->getOperand(2).getNode() == AddcNode))
13038 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13039 DAG.getVTList(MVT::i32, MVT::i32),
13040 {N->getOperand(0), N->getOperand(1),
13041 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13042 else
13043 return SDValue();
13044}
13045
// DAG combine for ARMISD::ADDC / ARMISD::SUBC.
// NOTE(review): the signature lines (original lines 13046-13047, presumably
// `static SDValue PerformAddcSubcCombine(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI,`) are missing from this extraction.
13048 const ARMSubtarget *Subtarget) {
13049 SelectionDAG &DAG(DCI.DAG);
13050
13051 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13052 // (SUBC (ADDE 0, 0, C), 1) -> C
// The ADDE with both addends zero materializes the carry C as 0/1, so
// subtracting 1 just re-derives C; forward the original carry value.
13053 SDValue LHS = N->getOperand(0);
13054 SDValue RHS = N->getOperand(1);
13055 if (LHS->getOpcode() == ARMISD::ADDE &&
13056 isNullConstant(LHS->getOperand(0)) &&
13057 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13058 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13059 }
13060 }
13061
// On Thumb1, rewrite add/sub with a negative immediate as the opposite
// operation with the positive immediate (guarding against INT_MIN, whose
// negation would overflow).
13062 if (Subtarget->isThumb1Only()) {
13063 SDValue RHS = N->getOperand(1);
// NOTE(review): the enclosing `if (auto *C = dyn_cast<ConstantSDNode>(RHS))`
// line (original line 13064) is missing from this extraction; C is the
// constant RHS.
13065 int32_t imm = C->getSExtValue();
13066 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13067 SDLoc DL(N);
13068 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13069 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13070 : ARMISD::ADDC;
13071 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13072 }
13073 }
13074 }
13075
13076 return SDValue();
13077}
13078
// DAG combine for ARMISD::ADDE / ARMISD::SUBE.
// NOTE(review): the signature lines (original lines 13079-13080, presumably
// `static SDValue PerformAddeSubeCombine(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI,`) are missing from this extraction.
13081 const ARMSubtarget *Subtarget) {
13082 if (Subtarget->isThumb1Only()) {
13083 SelectionDAG &DAG = DCI.DAG;
13084 SDValue RHS = N->getOperand(1);
// NOTE(review): the enclosing `if (auto *C = dyn_cast<ConstantSDNode>(RHS))`
// line (original line 13085) is missing from this extraction; C is the
// constant RHS.
13086 int64_t imm = C->getSExtValue();
13087 if (imm < 0) {
13088 SDLoc DL(N);
13089
13090 // The with-carry-in form matches bitwise not instead of the negation.
13091 // Effectively, the inverse interpretation of the carry flag already
13092 // accounts for part of the negation.
13093 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13094
13095 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13096 : ARMISD::ADDE;
13097 return DAG.getNode(Opcode, DL, N->getVTList(),
13098 N->getOperand(0), RHS, N->getOperand(2));
13099 }
13100 }
13101 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
// Non-Thumb1: try folding the mul_lohi + add-with-carry chain into MLAL.
13102 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13103 }
13104 return SDValue();
13105}
13106
// MVE combine: select(setcc(x, vecreduce_min/max(v)), ...) -> VMINV/VMAXV,
// the across-vector min/max-accumulating instructions.
// NOTE(review): the signature lines (original lines 13107-13108) are missing
// from this extraction.
13109 const ARMSubtarget *Subtarget) {
13110 if (!Subtarget->hasMVEIntegerOps())
13111 return SDValue();
13112
13113 SDLoc dl(N);
13114 SDValue SetCC;
13115 SDValue LHS;
13116 SDValue RHS;
13117 ISD::CondCode CC;
13118 SDValue TrueVal;
13119 SDValue FalseVal;
13120
// Accept both SELECT-of-SETCC and SELECT_CC forms, extracting the compared
// values, condition code, and select arms uniformly.
13121 if (N->getOpcode() == ISD::SELECT &&
13122 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13123 SetCC = N->getOperand(0);
13124 LHS = SetCC->getOperand(0);
13125 RHS = SetCC->getOperand(1);
13126 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13127 TrueVal = N->getOperand(1);
13128 FalseVal = N->getOperand(2);
13129 } else if (N->getOpcode() == ISD::SELECT_CC) {
13130 LHS = N->getOperand(0);
13131 RHS = N->getOperand(1);
13132 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13133 TrueVal = N->getOperand(2);
13134 FalseVal = N->getOperand(3);
13135 } else {
13136 return SDValue();
13137 }
13138
// Match each reduction/condition pairing to the corresponding ARMISD
// opcode, canonicalizing by swapping the arms for the inverted condition.
13139 unsigned int Opcode = 0;
13140 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13141 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13142 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13143 Opcode = ARMISD::VMINVu;
13144 if (CC == ISD::SETUGT)
13145 std::swap(TrueVal, FalseVal);
13146 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13147 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13148 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13149 Opcode = ARMISD::VMINVs;
13150 if (CC == ISD::SETGT)
13151 std::swap(TrueVal, FalseVal);
13152 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13153 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13154 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13155 Opcode = ARMISD::VMAXVu;
13156 if (CC == ISD::SETULT)
13157 std::swap(TrueVal, FalseVal);
13158 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13159 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13160 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13161 Opcode = ARMISD::VMAXVs;
13162 if (CC == ISD::SETLT)
13163 std::swap(TrueVal, FalseVal);
13164 } else
13165 return SDValue();
13166
13167 // Normalise to the right hand side being the vector reduction
// NOTE(review): the case labels of this switch (original lines 13169-13172,
// the four VECREDUCE_* opcodes) are missing from this extraction.
13168 switch (TrueVal->getOpcode()) {
13173 std::swap(LHS, RHS);
13174 std::swap(TrueVal, FalseVal);
13175 break;
13176 }
13177
13178 EVT VectorType = FalseVal->getOperand(0).getValueType();
13179
// Only the 128-bit MVE vector types are handled.
13180 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13181 VectorType != MVT::v4i32)
13182 return SDValue();
13183
13184 EVT VectorScalarType = VectorType.getVectorElementType();
13185
13186 // The values being selected must also be the ones being compared
13187 if (TrueVal != LHS || FalseVal != RHS)
13188 return SDValue();
13189
13190 EVT LeftType = LHS->getValueType(0);
13191 EVT RightType = RHS->getValueType(0);
13192
13193 // The types must match the reduced type too
13194 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13195 return SDValue();
13196
13197 // Legalise the scalar to an i32
13198 if (VectorScalarType != MVT::i32)
13199 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13200
13201 // Generate the reduction as an i32 for legalisation purposes
13202 auto Reduction =
13203 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13204
13205 // The result isn't actually an i32 so truncate it back to its original type
13206 if (VectorScalarType != MVT::i32)
13207 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13208
13209 return Reduction;
13210}
13211
13212// A special combine for the vqdmulh family of instructions. This is one of the
13213// potential set of patterns that could patch this instruction. The base pattern
13214// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13215// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13216// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13217// the max is unnecessary.
// NOTE(review): the signature line (original line 13218, presumably
// `static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {`) is
// missing from this extraction.
13219 EVT VT = N->getValueType(0);
13220 SDValue Shft;
13221 ConstantSDNode *Clamp;
13222
13223 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13224 return SDValue();
13225
// Recognise the clamp either as an explicit SMIN or, for i64 elements, as
// the equivalent vselect(setcc(lt)) that legalization produces.
13226 if (N->getOpcode() == ISD::SMIN) {
13227 Shft = N->getOperand(0);
13228 Clamp = isConstOrConstSplat(N->getOperand(1));
13229 } else if (N->getOpcode() == ISD::VSELECT) {
13230 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13231 SDValue Cmp = N->getOperand(0);
13232 if (Cmp.getOpcode() != ISD::SETCC ||
13233 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13234 Cmp.getOperand(0) != N->getOperand(1) ||
13235 Cmp.getOperand(1) != N->getOperand(2))
13236 return SDValue();
13237 Shft = N->getOperand(1);
13238 Clamp = isConstOrConstSplat(N->getOperand(2));
13239 } else
13240 return SDValue();
13241
13242 if (!Clamp)
13243 return SDValue();
13244
// The clamp constant determines the element type being saturated: the
// signed maximum of i8/i16/i32 respectively, with the matching shift.
13245 MVT ScalarType;
13246 int ShftAmt = 0;
13247 switch (Clamp->getSExtValue()) {
13248 case (1 << 7) - 1:
13249 ScalarType = MVT::i8;
13250 ShftAmt = 7;
13251 break;
13252 case (1 << 15) - 1:
13253 ScalarType = MVT::i16;
13254 ShftAmt = 15;
13255 break;
13256 case (1ULL << 31) - 1:
13257 ScalarType = MVT::i32;
13258 ShftAmt = 31;
13259 break;
13260 default:
13261 return SDValue();
13262 }
13263
13264 if (Shft.getOpcode() != ISD::SRA)
13265 return SDValue();
// NOTE(review): the definition of N1 (original line 13266, presumably
// `ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));`) is
// missing from this extraction.
13267 if (!N1 || N1->getSExtValue() != ShftAmt)
13268 return SDValue();
13269
13270 SDValue Mul = Shft.getOperand(0);
13271 if (Mul.getOpcode() != ISD::MUL)
13272 return SDValue();
13273
// Both multiply operands must be sign-extends of same-typed vectors whose
// element type matches the one implied by the clamp constant.
13274 SDValue Ext0 = Mul.getOperand(0);
13275 SDValue Ext1 = Mul.getOperand(1);
13276 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13277 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13278 return SDValue();
13279 EVT VecVT = Ext0.getOperand(0).getValueType();
13280 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13281 return SDValue();
13282 if (Ext1.getOperand(0).getValueType() != VecVT ||
13283 VecVT.getScalarType() != ScalarType ||
13284 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13285 return SDValue();
13286
13287 SDLoc DL(Mul);
13288 unsigned LegalLanes = 128 / (ShftAmt + 1);
13289 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13290 // For types smaller than legal vectors extend to be legal and only use needed
13291 // lanes.
13292 if (VecVT.getSizeInBits() < 128) {
13293 EVT ExtVecVT =
// NOTE(review): the first part of this EVT construction (original line
// 13294) is missing from this extraction.
13295 VecVT.getVectorNumElements());
13296 SDValue Inp0 =
13297 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13298 SDValue Inp1 =
13299 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13300 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13301 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13302 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13303 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13304 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13305 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13306 }
13307
13308 // For larger types, split into legal sized chunks.
13309 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13310 unsigned NumParts = VecVT.getSizeInBits() / 128;
// NOTE(review): the `SmallVector<SDValue> Parts;` declaration (original
// line 13311) is missing from this extraction.
13312 for (unsigned I = 0; I < NumParts; ++I) {
13313 SDValue Inp0 =
13314 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13315 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13316 SDValue Inp1 =
13317 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13318 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13319 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13320 Parts.push_back(VQDMULH);
13321 }
13322 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13323 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13324}
13325
// MVE combine for ISD::VSELECT: first try the VQDMULH pattern, then fold
// vselect(xor(cond, 1), lhs, rhs) -> vselect(cond, rhs, lhs).
// NOTE(review): the signature lines (original lines 13326-13327) are missing
// from this extraction.
13328 const ARMSubtarget *Subtarget) {
13329 if (!Subtarget->hasMVEIntegerOps())
13330 return SDValue();
13331
13332 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13333 return V;
13334
13335 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13336 //
13337 // We need to re-implement this optimization here as the implementation in the
13338 // Target-Independent DAGCombiner does not handle the kind of constant we make
13339 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13340 // good reason, allowing truncation there would break other targets).
13341 //
13342 // Currently, this is only done for MVE, as it's the only target that benefits
13343 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13344 if (N->getOperand(0).getOpcode() != ISD::XOR)
13345 return SDValue();
13346 SDValue XOR = N->getOperand(0);
13347
13348 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13349 // It is important to check with truncation allowed as the BUILD_VECTORs we
13350 // generate in those situations will truncate their operands.
13351 ConstantSDNode *Const =
13352 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13353 /*AllowTruncation*/ true);
13354 if (!Const || !Const->isOne())
13355 return SDValue();
13356
13357 // Rewrite into vselect(cond, rhs, lhs).
13358 SDValue Cond = XOR->getOperand(0);
13359 SDValue LHS = N->getOperand(1);
13360 SDValue RHS = N->getOperand(2);
13361 EVT Type = N->getValueType(0);
13362 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13363}
13364
13365// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
// NOTE(review): the signature lines (original lines 13366-13367) are missing
// from this extraction.
13368 const ARMSubtarget *Subtarget) {
13369 SDValue Op0 = N->getOperand(0);
13370 SDValue Op1 = N->getOperand(1);
13371 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13372 EVT VT = N->getValueType(0);
13373
13374 if (!Subtarget->hasMVEIntegerOps() ||
// NOTE(review): the remainder of this condition (original line 13375) is
// missing from this extraction.
13376 return SDValue();
13377
// Canonicalize splat(n) >= [0,1,2,..] into the ULT form handled below.
13378 if (CC == ISD::SETUGE) {
13379 std::swap(Op0, Op1);
13380 CC = ISD::SETULT;
13381 }
13382
13383 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
// NOTE(review): the remainder of this condition (original line 13384) is
// missing from this extraction.
13385 return SDValue();
13386
13387 // Check first operand is BuildVector of 0,1,2,...
13388 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13389 if (!Op0.getOperand(I).isUndef() &&
// NOTE(review): the first half of this condition (original line 13390) is
// missing from this extraction.
13391 Op0.getConstantOperandVal(I) == I))
13392 return SDValue();
13393 }
13394
13395 // The second is a Splat of Op1S
13396 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13397 if (!Op1S)
13398 return SDValue();
13399
// Pick the VCTP intrinsic matching the predicate's lane count.
13400 unsigned Opc;
13401 switch (VT.getVectorNumElements()) {
13402 case 2:
13403 Opc = Intrinsic::arm_mve_vctp64;
13404 break;
13405 case 4:
13406 Opc = Intrinsic::arm_mve_vctp32;
13407 break;
13408 case 8:
13409 Opc = Intrinsic::arm_mve_vctp16;
13410 break;
13411 case 16:
13412 Opc = Intrinsic::arm_mve_vctp8;
13413 break;
13414 default:
13415 return SDValue();
13416 }
13417
13418 SDLoc DL(N);
13419 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13420 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13421 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13422}
13423
13424/// PerformADDECombine - Target-specific dag combine transform from
13425/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13426/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
// NOTE(review): the signature lines (original lines 13427-13428) are missing
// from this extraction.
13429 const ARMSubtarget *Subtarget) {
13430 // Only ARM and Thumb2 support UMLAL/SMLAL.
13431 if (Subtarget->isThumb1Only())
13432 return PerformAddeSubeCombine(N, DCI, Subtarget);
13433
13434 // Only perform the checks after legalize when the pattern is available.
13435 if (DCI.isBeforeLegalize()) return SDValue();
13436
13437 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13438}
13439
13440/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13441/// operands N0 and N1. This is a helper for PerformADDCombine that is
13442/// called with the default operands, and if that fails, with commuted
13443/// operands.
// NOTE(review): the signature lines (original lines 13444-13445) are missing
// from this extraction.
13446 const ARMSubtarget *Subtarget){
13447 // Attempt to create vpadd for this add.
13448 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13449 return Result;
13450
13451 // Attempt to create vpaddl for this add.
13452 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13453 return Result;
13454 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13455 Subtarget))
13456 return Result;
13457
13458 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13459 if (N0.getNode()->hasOneUse())
13460 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13461 return Result;
13462 return SDValue();
13463}
13464
// Redistributes i32 adds of vector reductions (VECREDUCE_ADD / VADDV / VMLAV)
// so that each reduction is added in sequence, enabling vaddva-style
// accumulation and more predictable load ordering.
// NOTE(review): the signature line (original line 13465, presumably
// `static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {`)
// is missing from this extraction.
13466 EVT VT = N->getValueType(0);
13467 SDValue N0 = N->getOperand(0);
13468 SDValue N1 = N->getOperand(1);
13469 SDLoc dl(N);
13470
// Predicate: is Op one of the across-vector accumulating reduction nodes?
13471 auto IsVecReduce = [](SDValue Op) {
13472 switch (Op.getOpcode()) {
13473 case ISD::VECREDUCE_ADD:
13474 case ARMISD::VADDVs:
13475 case ARMISD::VADDVu:
13476 case ARMISD::VMLAVs:
13477 case ARMISD::VMLAVu:
13478 return true;
13479 }
13480 return false;
13481 };
13482
13483 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13484 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13485 // add(add(X, vecreduce(Y)), vecreduce(Z))
13486 // to make better use of vaddva style instructions.
13487 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13488 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13489 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13490 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13491 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13492 }
13493 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13494 // add(add(add(A, C), reduce(B)), reduce(D))
13495 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13496 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
// Locate which operand of each inner add is the reduction.
13497 unsigned N0RedOp = 0;
13498 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13499 N0RedOp = 1;
13500 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13501 return SDValue();
13502 }
13503
13504 unsigned N1RedOp = 0;
13505 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13506 N1RedOp = 1;
13507 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13508 return SDValue();
13509
13510 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13511 N1.getOperand(1 - N1RedOp));
13512 SDValue Add1 =
13513 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13514 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13515 }
13516 return SDValue();
13517 };
13518 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13519 return R;
13520 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13521 return R;
13522
13523 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13524 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13525 // by ascending load offsets. This can help cores prefetch if the order of
13526 // loads is more predictable.
13527 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13528 // Check if two reductions are known to load data where one is before/after
13529 // another. Return negative if N0 loads data before N1, positive if N1 is
13530 // before N0 and 0 otherwise if nothing is known.
13531 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13532 // Look through to the first operand of a MUL, for the VMLA case.
13533 // Currently only looks at the first operand, in the hope they are equal.
13534 if (N0.getOpcode() == ISD::MUL)
13535 N0 = N0.getOperand(0);
13536 if (N1.getOpcode() == ISD::MUL)
13537 N1 = N1.getOperand(0);
13538
13539 // Return true if the two operands are loads to the same object and the
13540 // offset of the first is known to be less than the offset of the second.
13541 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13542 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13543 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13544 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13545 Load1->isIndexed())
13546 return 0;
13547
// Decompose both addresses into base+offset and compare offsets only when
// the bases provably match.
13548 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13549 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13550
13551 if (!BaseLocDecomp0.getBase() ||
13552 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13553 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13554 return 0;
13555 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13556 return -1;
13557 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13558 return 1;
13559 return 0;
13560 };
13561
13562 SDValue X;
13563 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13564 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
// Both inner operands reduce loads: keep the later load in N0 so the
// swap below orders the adds by ascending offset.
13565 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13566 N0.getOperand(1).getOperand(0));
13567 if (IsBefore < 0) {
13568 X = N0.getOperand(0);
13569 N0 = N0.getOperand(1);
13570 } else if (IsBefore > 0) {
13571 X = N0.getOperand(1);
13572 N0 = N0.getOperand(0);
13573 } else
13574 return SDValue();
13575 } else if (IsVecReduce(N0.getOperand(0))) {
13576 X = N0.getOperand(1);
13577 N0 = N0.getOperand(0);
13578 } else if (IsVecReduce(N0.getOperand(1))) {
13579 X = N0.getOperand(0);
13580 N0 = N0.getOperand(1);
13581 } else
13582 return SDValue();
13583 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13584 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13585 // Note this is backward to how you would expect. We create
13586 // add(reduce(load + 16), reduce(load + 0)) so that the
13587 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13588 // the X as VADDV(load + 0)
13589 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13590 } else
13591 return SDValue();
13592
13593 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13594 return SDValue();
13595
13596 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13597 return SDValue();
13598
13599 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13600 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13601 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13602 };
13603 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13604 return R;
13605 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13606 return R;
13607 return SDValue();
13608}
13609
                                   const ARMSubtarget *Subtarget) {
  // Combines i64 adds with MVE VADDLV/VMLALV-style reductions. MVE only.
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
    return R;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc dl(N);

  // Only i64 adds of the (build_pair lo, hi) form are of interest here.
  if (VT != MVT::i64)
    return SDValue();

  // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
  // will look like:
  //   t1: i32,i32 = ARMISD::VADDLVs x
  //   t2: i64 = build_pair t1, t1:1
  //   t3: i64 = add t2, y
  // Otherwise we try to push the add up above VADDLVAx, to potentially allow
  // the add to be simplified separately.
  // We also need to check for sext / zext and commutative adds.
  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
                           SDValue NB) {
    // NB must be the build_pair of both halves of result 0 of the reduction.
    if (NB->getOpcode() != ISD::BUILD_PAIR)
      return SDValue();
    SDValue VecRed = NB->getOperand(0);
    if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
        VecRed.getResNo() != 0 ||
        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
      return SDValue();

    if (VecRed->getOpcode() == OpcodeA) {
      // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
      SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
                                VecRed.getOperand(0), VecRed.getOperand(1));
      NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
    }

    // Build the accumulating form: NA split into two i32 halves, followed by
    // the reduction's remaining operands.
    SmallVector<SDValue, 4> Ops(2);
    std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);

    // Skip the old accumulator halves when the input was already an
    // accumulating (OpcodeA) node.
    unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
    for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
      Ops.push_back(VecRed->getOperand(I));
    SDValue Red =
        DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
                       SDValue(Red.getNode(), 1));
  };

  // Try each reduction flavour (signed/unsigned, predicated or not,
  // VADDLV/VMLALV) with both operand orders, since ISD::ADD is commutative.
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
    return M;
  if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
    return M;
  return SDValue();
}
13697
13698bool
13700 CombineLevel Level) const {
13701 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13702 N->getOpcode() == ISD::SRL) &&
13703 "Expected shift op");
13704
13705 SDValue ShiftLHS = N->getOperand(0);
13706 if (!ShiftLHS->hasOneUse())
13707 return false;
13708
13709 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13710 !ShiftLHS.getOperand(0)->hasOneUse())
13711 return false;
13712
13713 if (Level == BeforeLegalizeTypes)
13714 return true;
13715
13716 if (N->getOpcode() != ISD::SHL)
13717 return true;
13718
13719 if (Subtarget->isThumb1Only()) {
13720 // Avoid making expensive immediates by commuting shifts. (This logic
13721 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13722 // for free.)
13723 if (N->getOpcode() != ISD::SHL)
13724 return true;
13725 SDValue N1 = N->getOperand(0);
13726 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13727 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13728 return true;
13729 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13730 if (Const->getAPIntValue().ult(256))
13731 return false;
13732 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13733 Const->getAPIntValue().sgt(-256))
13734 return false;
13735 }
13736 return true;
13737 }
13738
13739 // Turn off commute-with-shift transform after legalization, so it doesn't
13740 // conflict with PerformSHLSimplify. (We could try to detect when
13741 // PerformSHLSimplify would trigger more precisely, but it isn't
13742 // really necessary.)
13743 return false;
13744}
13745
13747 const SDNode *N) const {
13748 assert(N->getOpcode() == ISD::XOR &&
13749 (N->getOperand(0).getOpcode() == ISD::SHL ||
13750 N->getOperand(0).getOpcode() == ISD::SRL) &&
13751 "Expected XOR(SHIFT) pattern");
13752
13753 // Only commute if the entire NOT mask is a hidden shifted mask.
13754 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13755 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13756 if (XorC && ShiftC) {
13757 unsigned MaskIdx, MaskLen;
13758 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13759 unsigned ShiftAmt = ShiftC->getZExtValue();
13760 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13761 if (N->getOperand(0).getOpcode() == ISD::SHL)
13762 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13763 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13764 }
13765 }
13766
13767 return false;
13768}
13769
13771 const SDNode *N) const {
13772 assert(((N->getOpcode() == ISD::SHL &&
13773 N->getOperand(0).getOpcode() == ISD::SRL) ||
13774 (N->getOpcode() == ISD::SRL &&
13775 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13776 "Expected shift-shift mask");
13777
13778 if (!Subtarget->isThumb1Only())
13779 return true;
13780
13781 EVT VT = N->getValueType(0);
13782 if (VT.getScalarSizeInBits() > 32)
13783 return true;
13784
13785 return false;
13786}
13787
13789 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13790 SDValue Y) const {
13791 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13792 SelectOpcode == ISD::VSELECT;
13793}
13794
13796 if (!Subtarget->hasNEON()) {
13797 if (Subtarget->isThumb1Only())
13798 return VT.getScalarSizeInBits() <= 32;
13799 return true;
13800 }
13801 return VT.isScalarInteger();
13802}
13803
13805 EVT VT) const {
13806 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13807 return false;
13808
13809 switch (FPVT.getSimpleVT().SimpleTy) {
13810 case MVT::f16:
13811 return Subtarget->hasVFP2Base();
13812 case MVT::f32:
13813 return Subtarget->hasVFP2Base();
13814 case MVT::f64:
13815 return Subtarget->hasFP64();
13816 case MVT::v4f32:
13817 case MVT::v8f16:
13818 return Subtarget->hasMVEFloatOps();
13819 default:
13820 return false;
13821 }
13822}
13823
                                  const ARMSubtarget *ST) {
  // Undo the generic (shl (op x, c1), c2) -> (op (shl x, c2), c1 << c2)
  // canonicalization when c1 << c2 needs a mov-immediate but all users
  // could fold the shl into their own operand for free.
  // Allow the generic combiner to identify potential bswaps.
  if (DCI.isBeforeLegalize())
    return SDValue();

  // DAG combiner will fold:
  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
  // Other code patterns that can be also be modified have the following form:
  // b + ((a << 1) | 510)
  // b + ((a << 1) & 510)
  // b + ((a << 1) ^ 510)
  // b + ((a << 1) + 510)

  // Many instructions can perform the shift for free, but it requires both
  // the operands to be registers. If c1 << c2 is too large, a mov immediate
  // instruction will needed. So, unfold back to the original pattern if:
  // - if c1 and c2 are small enough that they don't require mov imms.
  // - the user(s) of the node can perform an shl

  // No shifted operands for 16-bit instructions.
  if (ST->isThumb() && ST->isThumb1Only())
    return SDValue();

  // Check that all the users could perform the shl themselves.
  for (auto *U : N->users()) {
    switch(U->getOpcode()) {
    default:
      return SDValue();
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::SETCC:
    case ARMISD::CMP:
      // Check that the user isn't already using a constant because there
      // aren't any instructions that support an immediate operand and a
      // shifted operand.
      if (isa<ConstantSDNode>(U->getOperand(0)) ||
          isa<ConstantSDNode>(U->getOperand(1)))
        return SDValue();

      // Check that it's not already using a shift.
      if (U->getOperand(0).getOpcode() == ISD::SHL ||
          U->getOperand(1).getOpcode() == ISD::SHL)
        return SDValue();
      break;
    }
  }

  // Only unfold add/or/xor/and whose first operand is the shl.
  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
      N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::SHL)
    return SDValue();

  SDValue SHL = N->getOperand(0);

  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
  if (!C1ShlC2 || !C2)
    return SDValue();

  APInt C2Int = C2->getAPIntValue();
  APInt C1Int = C1ShlC2->getAPIntValue();
  unsigned C2Width = C2Int.getBitWidth();
  // Shift amounts >= the bit width are not handled here.
  if (C2Int.uge(C2Width))
    return SDValue();
  uint64_t C2Value = C2Int.getZExtValue();

  // Check that performing a lshr will not lose any information.
  APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
  if ((C1Int & Mask) != C1Int)
    return SDValue();

  // Shift the first constant.
  C1Int.lshrInPlace(C2Int);

  // The immediates are encoded as an 8-bit value that can be rotated.
  auto LargeImm = [](const APInt &Imm) {
    unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
    return Imm.getBitWidth() - Zeros > 8;
  };

  if (LargeImm(C1Int) || LargeImm(C2Int))
    return SDValue();

  // Rebuild as (shl (op x, c1 >> c2), c2).
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue X = SHL.getOperand(0);
  SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                              DAG.getConstant(C1Int, dl, MVT::i32));
  // Shift left to compensate for the lshr of C1Int.
  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));

  LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
             SHL.dump(); N->dump());
  LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
  return Res;
}
13928
13929
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
/// Tries, in order: SHL unfolding, vector-reduction add pushing, and the
/// generic with-operands combine in both operand orders.
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Only works one way, because it needs an immediate operand.
  if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
    return Result;

  if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
    return Result;

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
}
13952
// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
// providing -X is as cheap as X (currently, just a constant).
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
  // Only i32 negations of a single-use CSINC are handled.
  if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
    return SDValue();
  SDValue CSINC = N->getOperand(1);
  if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
    return SDValue();

  // The CSINC's first operand must be a constant so that its negation folds.
  ConstantSDNode *X = isConstOrConstSplat(CSINC.getOperand(0));
  if (!X)
    return SDValue();

  return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
                     DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
                                 CSINC.getOperand(0)),
                     CSINC.getOperand(1), CSINC.getOperand(2),
                     CSINC.getOperand(3));
}
13972
  // Free to negate: a constant's negation folds at compile time.
  // NOTE(review): this guard line was reconstructed from an extraction gap —
  // confirm against upstream.
  if (isa<ConstantSDNode>(Op))
    return 0;

  // Will save one instruction.
  // Negating (sub 0, x) yields x, deleting the sub entirely.
  if (Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)))
    return -1;

  // Can freely negate by converting sra <-> srl.
  // A shift by bitwidth-1 satisfies (sra x, bw-1) == -(srl x, bw-1), so the
  // negation only flips the opcode when this is the sole use.
  if (Op.getOpcode() == ISD::SRA || Op.getOpcode() == ISD::SRL) {
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
    if (Op.hasOneUse() && ShiftAmt &&
        ShiftAmt->getZExtValue() == Op.getValueType().getScalarSizeInBits() - 1)
      return 0;
  }

  // Will have to create sub.
  return 1;
}
13993
13994// Try to fold
13995//
13996// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
13997//
13998// The folding helps cmov to be matched with csneg without generating
13999// redundant neg instruction.
14001 assert(N->getOpcode() == ISD::SUB);
14002 if (!isNullConstant(N->getOperand(0)))
14003 return SDValue();
14004
14005 SDValue CMov = N->getOperand(1);
14006 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
14007 return SDValue();
14008
14009 SDValue N0 = CMov.getOperand(0);
14010 SDValue N1 = CMov.getOperand(1);
14011
14012 // Only perform the fold if we actually save something.
14013 if (getNegationCost(N0) + getNegationCost(N1) > 0)
14014 return SDValue();
14015
14016 SDLoc DL(N);
14017 EVT VT = CMov.getValueType();
14018
14019 SDValue N0N = DAG.getNegative(N0, DL, VT);
14020 SDValue N1N = DAG.getNegative(N1, DL, VT);
14021 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
14022 CMov.getOperand(3));
14023}
14024
14025/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14026///
14029 const ARMSubtarget *Subtarget) {
14030 SDValue N0 = N->getOperand(0);
14031 SDValue N1 = N->getOperand(1);
14032
14033 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14034 if (N1.getNode()->hasOneUse())
14035 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14036 return Result;
14037
14038 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14039 return R;
14040
14041 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
14042 return Val;
14043
14044 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14045 return SDValue();
14046
14047 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14048 // so that we can readily pattern match more mve instructions which can use
14049 // a scalar operand.
14050 SDValue VDup = N->getOperand(1);
14051 if (VDup->getOpcode() != ARMISD::VDUP)
14052 return SDValue();
14053
14054 SDValue VMov = N->getOperand(0);
14055 if (VMov->getOpcode() == ISD::BITCAST)
14056 VMov = VMov->getOperand(0);
14057
14058 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14059 return SDValue();
14060
14061 SDLoc dl(N);
14062 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14063 DCI.DAG.getConstant(0, dl, MVT::i32),
14064 VDup->getOperand(0));
14065 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14066}
14067
14068/// PerformVMULCombine
14069/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14070/// special multiplier accumulator forwarding.
14071/// vmul d3, d0, d2
14072/// vmla d3, d1, d2
14073/// is faster than
14074/// vadd d3, d0, d1
14075/// vmul d3, d3, d2
14076// However, for (A + B) * (A + B),
14077// vadd d2, d0, d1
14078// vmul d3, d0, d2
14079// vmla d3, d1, d2
14080// is slower than
14081// vadd d2, d0, d1
14082// vmul d3, d2, d2
14085 const ARMSubtarget *Subtarget) {
14086 if (!Subtarget->hasVMLxForwarding())
14087 return SDValue();
14088
14089 SelectionDAG &DAG = DCI.DAG;
14090 SDValue N0 = N->getOperand(0);
14091 SDValue N1 = N->getOperand(1);
14092 unsigned Opcode = N0.getOpcode();
14093 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14094 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14095 Opcode = N1.getOpcode();
14096 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14097 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14098 return SDValue();
14099 std::swap(N0, N1);
14100 }
14101
14102 if (N0 == N1)
14103 return SDValue();
14104
14105 EVT VT = N->getValueType(0);
14106 SDLoc DL(N);
14107 SDValue N00 = N0->getOperand(0);
14108 SDValue N01 = N0->getOperand(1);
14109 return DAG.getNode(Opcode, DL, VT,
14110 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14111 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14112}
14113
14115 const ARMSubtarget *Subtarget) {
14116 EVT VT = N->getValueType(0);
14117 if (VT != MVT::v2i64)
14118 return SDValue();
14119
14120 SDValue N0 = N->getOperand(0);
14121 SDValue N1 = N->getOperand(1);
14122
14123 auto IsSignExt = [&](SDValue Op) {
14124 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14125 return SDValue();
14126 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14127 if (VT.getScalarSizeInBits() == 32)
14128 return Op->getOperand(0);
14129 return SDValue();
14130 };
14131 auto IsZeroExt = [&](SDValue Op) {
14132 // Zero extends are a little more awkward. At the point we are matching
14133 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14134 // That might be before of after a bitcast depending on how the and is
14135 // placed. Because this has to look through bitcasts, it is currently only
14136 // supported on LE.
14137 if (!Subtarget->isLittle())
14138 return SDValue();
14139
14140 SDValue And = Op;
14141 if (And->getOpcode() == ISD::BITCAST)
14142 And = And->getOperand(0);
14143 if (And->getOpcode() != ISD::AND)
14144 return SDValue();
14145 SDValue Mask = And->getOperand(1);
14146 if (Mask->getOpcode() == ISD::BITCAST)
14147 Mask = Mask->getOperand(0);
14148
14149 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14150 Mask.getValueType() != MVT::v4i32)
14151 return SDValue();
14152 if (isAllOnesConstant(Mask->getOperand(0)) &&
14153 isNullConstant(Mask->getOperand(1)) &&
14154 isAllOnesConstant(Mask->getOperand(2)) &&
14155 isNullConstant(Mask->getOperand(3)))
14156 return And->getOperand(0);
14157 return SDValue();
14158 };
14159
14160 SDLoc dl(N);
14161 if (SDValue Op0 = IsSignExt(N0)) {
14162 if (SDValue Op1 = IsSignExt(N1)) {
14163 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14164 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14165 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14166 }
14167 }
14168 if (SDValue Op0 = IsZeroExt(N0)) {
14169 if (SDValue Op1 = IsZeroExt(N1)) {
14170 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14171 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14172 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14173 }
14174 }
14175
14176 return SDValue();
14177}
14178
                                 const ARMSubtarget *Subtarget) {
  // Target-specific ISD::MUL combines: MVE VMULL forming, vector VMUL
  // distribution, and strength-reduction of i32 constant multiplies into
  // shift/add/sub sequences.
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
    return PerformMVEVMULLCombine(N, DAG, Subtarget);

  // Thumb1 has no flexible shifted operands, so the rewrites don't pay off.
  if (Subtarget->isThumb1Only())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT.is64BitVector() || VT.is128BitVector())
    return PerformVMULCombine(N, DCI, Subtarget);
  if (VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();

  int64_t MulAmt = C->getSExtValue();
  // Factor out trailing zeros: mul by (K << S) is mul by K then shl by S.
  unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);

  ShiftAmt = ShiftAmt & (32 - 1);
  SDValue V = N->getOperand(0);
  SDLoc DL(N);

  SDValue Res;
  MulAmt >>= ShiftAmt;

  if (MulAmt >= 0) {
    if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                    MVT::i32)));
    } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                    MVT::i32)),
                        V);
    } else
      return SDValue();
  } else {
    uint64_t MulAmtAbs = -MulAmt;
    if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                    MVT::i32)));
    } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      Res = DAG.getNode(ISD::ADD, DL, VT,
                        V,
                        DAG.getNode(ISD::SHL, DL, VT,
                                    V,
                                    DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                    MVT::i32)));
      Res = DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, MVT::i32), Res);
    } else
      return SDValue();
  }

  // Re-apply the factored-out power of two.
  if (ShiftAmt != 0)
    Res = DAG.getNode(ISD::SHL, DL, VT,
                      Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, Res, false);
  return SDValue();
}
14264
                               const ARMSubtarget *Subtarget) {
  // Thumb1-only: rewrite (and (shl/srl x, c2), c1) into shift pairs or
  // cheaper masks, to avoid materializing an expensive mask constant.
  // Allow DAGCombine to pattern-match before we touch the canonical form.
  if (DCI.isBeforeLegalize())
    return SDValue();

  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!N1C)
    return SDValue();

  uint32_t C1 = (uint32_t)N1C->getZExtValue();
  // Don't transform uxtb/uxth.
  if (C1 == 255 || C1 == 65535)
    return SDValue();

  SDNode *N0 = N->getOperand(0).getNode();
  if (!N0->hasOneUse())
    return SDValue();

  if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
    return SDValue();

  bool LeftShift = N0->getOpcode() == ISD::SHL;

  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!N01C)
    return SDValue();

  uint32_t C2 = (uint32_t)N01C->getZExtValue();
  if (!C2 || C2 >= 32)
    return SDValue();

  // Clear irrelevant bits in the mask.
  if (LeftShift)
    C1 &= (-1U << C2);
  else
    C1 &= (-1U >> C2);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // We have a pattern of the form "(and (shl x, c2) c1)" or
  // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
  // transform to a pair of shifts, to save materializing c1.

  // First pattern: right shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (!LeftShift && isMask_32(C1)) {
    uint32_t C3 = llvm::countl_zero(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // First pattern, reversed: left shift, then mask off trailing bits.
  if (LeftShift && isMask_32(~C1)) {
    uint32_t C3 = llvm::countr_zero(C1);
    if (C2 < C3) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C3 - C2, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern: left shift, then mask off leading bits.
  // FIXME: Use demanded bits?
  if (LeftShift && isShiftedMask_32(C1)) {
    uint32_t Trailing = llvm::countr_zero(C1);
    uint32_t C3 = llvm::countl_zero(C1);
    if (Trailing == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Second pattern, reversed: right shift, then mask off trailing bits.
  // FIXME: Handle other patterns of known/demanded bits.
  if (!LeftShift && isShiftedMask_32(C1)) {
    uint32_t Leading = llvm::countl_zero(C1);
    uint32_t C3 = llvm::countr_zero(C1);
    if (Leading == C2 && C2 + C3 < 32) {
      SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
                                DAG.getConstant(C2 + C3, DL, MVT::i32));
      return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
    }
  }

  // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
  // if "c1 >> c2" is a cheaper immediate than "c1"
  if (LeftShift &&
      HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {

    SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
                              DAG.getConstant(C1 >> C2, DL, MVT::i32));
    return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
                       DAG.getConstant(C2, DL, MVT::i32));
  }

  return SDValue();
}
14376
14379 const ARMSubtarget *Subtarget) {
14380 // Attempt to use immediate-form VBIC
14381 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14382 SDLoc dl(N);
14383 EVT VT = N->getValueType(0);
14384 SelectionDAG &DAG = DCI.DAG;
14385
14386 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14387 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14388 return SDValue();
14389
14390 APInt SplatBits, SplatUndef;
14391 unsigned SplatBitSize;
14392 bool HasAnyUndefs;
14393 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14394 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14395 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14396 SplatBitSize == 64) {
14397 EVT VbicVT;
14398 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14399 SplatUndef.getZExtValue(), SplatBitSize,
14400 DAG, dl, VbicVT, VT, OtherModImm);
14401 if (Val.getNode()) {
14402 SDValue Input =
14403 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14404 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14405 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14406 }
14407 }
14408 }
14409
14410 if (!Subtarget->isThumb1Only()) {
14411 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14412 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14413 return Result;
14414
14415 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14416 return Result;
14417 }
14418
14419 if (Subtarget->isThumb1Only())
14420 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14421 return Result;
14422
14423 return SDValue();
14424}
14425
// Try combining OR nodes to SMULWB, SMULWT.
static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
  // SMULW[B|T] requires ARMv6+ with DSP (Thumb additionally needs Thumb2).
  if (!Subtarget->hasV6Ops() ||
      (Subtarget->isThumb() &&
       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
    return SDValue();

  SDValue SRL = OR->getOperand(0);
  SDValue SHL = OR->getOperand(1);

  // Accept the srl/shl operands in either order.
  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
    SRL = OR->getOperand(1);
    SHL = OR->getOperand(0);
  }
  if (!isSRL16(SRL) || !isSHL16(SHL))
    return SDValue();

  // The first operands to the shifts need to be the two results from the
  // same smul_lohi node.
  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
      SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
    return SDValue();

  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
    return SDValue();

  // Now we have:
  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
  // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
  // For SMUWB the 16-bit value will signed extended somehow.
  // For SMULWT only the SRA is required.
  // Check both sides of SMUL_LOHI
  SDValue OpS16 = SMULLOHI->getOperand(0);
  SDValue OpS32 = SMULLOHI->getOperand(1);

  SelectionDAG &DAG = DCI.DAG;
  // If operand 0 isn't the 16-bit side, swap the roles.
  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
    OpS16 = OpS32;
    OpS32 = SMULLOHI->getOperand(0);
  }

  SDLoc dl(OR);
  unsigned Opcode = 0;
  if (isS16(OpS16, DAG))
    Opcode = ARMISD::SMULWB;
  else if (isSRA16(OpS16)) {
    // Top-half multiply: strip the sra to feed the raw value to SMULWT.
    Opcode = ARMISD::SMULWT;
    OpS16 = OpS16->getOperand(0);
  }
  else
    return SDValue();

  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
  // Return the (now dead) original node so the combiner knows N changed.
  return SDValue(OR, 0);
}
14486
14489 const ARMSubtarget *Subtarget) {
14490 // BFI is only available on V6T2+
14491 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14492 return SDValue();
14493
14494 EVT VT = N->getValueType(0);
14495 SDValue N0 = N->getOperand(0);
14496 SDValue N1 = N->getOperand(1);
14497 SelectionDAG &DAG = DCI.DAG;
14498 SDLoc DL(N);
14499 // 1) or (and A, mask), val => ARMbfi A, val, mask
14500 // iff (val & mask) == val
14501 //
14502 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14503 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14504 // && mask == ~mask2
14505 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14506 // && ~mask == mask2
14507 // (i.e., copy a bitfield value into another bitfield of the same width)
14508
14509 if (VT != MVT::i32)
14510 return SDValue();
14511
14512 SDValue N00 = N0.getOperand(0);
14513
14514 // The value and the mask need to be constants so we can verify this is
14515 // actually a bitfield set. If the mask is 0xffff, we can do better
14516 // via a movt instruction, so don't use BFI in that case.
14517 SDValue MaskOp = N0.getOperand(1);
14519 if (!MaskC)
14520 return SDValue();
14521 unsigned Mask = MaskC->getZExtValue();
14522 if (Mask == 0xffff)
14523 return SDValue();
14524 SDValue Res;
14525 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14527 if (N1C) {
14528 unsigned Val = N1C->getZExtValue();
14529 if ((Val & ~Mask) != Val)
14530 return SDValue();
14531
14532 if (ARM::isBitFieldInvertedMask(Mask)) {
14533 Val >>= llvm::countr_zero(~Mask);
14534
14535 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14536 DAG.getConstant(Val, DL, MVT::i32),
14537 DAG.getConstant(Mask, DL, MVT::i32));
14538
14539 DCI.CombineTo(N, Res, false);
14540 // Return value from the original node to inform the combiner than N is
14541 // now dead.
14542 return SDValue(N, 0);
14543 }
14544 } else if (N1.getOpcode() == ISD::AND) {
14545 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14547 if (!N11C)
14548 return SDValue();
14549 unsigned Mask2 = N11C->getZExtValue();
14550
14551 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14552 // as is to match.
14553 if (ARM::isBitFieldInvertedMask(Mask) &&
14554 (Mask == ~Mask2)) {
14555 // The pack halfword instruction works better for masks that fit it,
14556 // so use that when it's available.
14557 if (Subtarget->hasDSP() &&
14558 (Mask == 0xffff || Mask == 0xffff0000))
14559 return SDValue();
14560 // 2a
14561 unsigned amt = llvm::countr_zero(Mask2);
14562 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14563 DAG.getConstant(amt, DL, MVT::i32));
14564 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14565 DAG.getConstant(Mask, DL, MVT::i32));
14566 DCI.CombineTo(N, Res, false);
14567 // Return value from the original node to inform the combiner than N is
14568 // now dead.
14569 return SDValue(N, 0);
14570 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14571 (~Mask == Mask2)) {
14572 // The pack halfword instruction works better for masks that fit it,
14573 // so use that when it's available.
14574 if (Subtarget->hasDSP() &&
14575 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14576 return SDValue();
14577 // 2b
14578 unsigned lsb = llvm::countr_zero(Mask);
14579 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14580 DAG.getConstant(lsb, DL, MVT::i32));
14581 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14582 DAG.getConstant(Mask2, DL, MVT::i32));
14583 DCI.CombineTo(N, Res, false);
14584 // Return value from the original node to inform the combiner than N is
14585 // now dead.
14586 return SDValue(N, 0);
14587 }
14588 }
14589
14590 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14591 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14593 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14594 // where lsb(mask) == #shamt and masked bits of B are known zero.
14595 SDValue ShAmt = N00.getOperand(1);
14596 unsigned ShAmtC = ShAmt->getAsZExtVal();
14597 unsigned LSB = llvm::countr_zero(Mask);
14598 if (ShAmtC != LSB)
14599 return SDValue();
14600
14601 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14602 DAG.getConstant(~Mask, DL, MVT::i32));
14603
14604 DCI.CombineTo(N, Res, false);
14605 // Return value from the original node to inform the combiner than N is
14606 // now dead.
14607 return SDValue(N, 0);
14608 }
14609
14610 return SDValue();
14611}
14612
14613static bool isValidMVECond(unsigned CC, bool IsFloat) {
14614 switch (CC) {
14615 case ARMCC::EQ:
14616 case ARMCC::NE:
14617 case ARMCC::LE:
14618 case ARMCC::GT:
14619 case ARMCC::GE:
14620 case ARMCC::LT:
14621 return true;
14622 case ARMCC::HS:
14623 case ARMCC::HI:
14624 return !IsFloat;
14625 default:
14626 return false;
14627 };
14628}
14629
// Extract the ARM condition code carried by an MVE vector-compare node.
14631 if (N->getOpcode() == ARMISD::VCMP)
// VCMP compares two vector operands; its condition code is operand 2.
14632 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14633 else if (N->getOpcode() == ARMISD::VCMPZ)
// VCMPZ compares one operand against zero; its condition code is operand 1.
14634 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14635 else
14636 llvm_unreachable("Not a VCMP/VCMPZ!");
14637}
14638
// NOTE(review): CC is computed on a line elided from this excerpt —
// presumably the inverse of N's compare condition; confirm against upstream.
// The compare can be inverted only if the resulting condition is still a
// valid MVE condition for N's operand type (float vs integer).
14641 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14642}
14643
14645 const ARMSubtarget *Subtarget) {
14646 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14647 // together with predicates
14648 EVT VT = N->getValueType(0);
14649 SDLoc DL(N);
14650 SDValue N0 = N->getOperand(0);
14651 SDValue N1 = N->getOperand(1);
14652
// An operand is "freely" invertable when it is an MVE VCMP/VCMPZ whose
// condition can simply be flipped (no extra instructions needed).
14653 auto IsFreelyInvertable = [&](SDValue V) {
14654 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14655 return CanInvertMVEVCMP(V);
14656 return false;
14657 };
14658
14659 // At least one operand must be freely invertable.
14660 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14661 return SDValue();
14662
// De Morgan: or(A, B) == not(and(not(A), not(B))).
14663 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14664 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14665 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14666 return DAG.getLogicalNOT(DL, And, VT);
14667}
14668
14669// Try to form a NEON shift-{right, left}-and-insert (VSRI/VSLI) from:
14670// (or (and X, splat (i32 C1)), (srl Y, splat (i32 C2))) -> VSRI X, Y, #C2
14671// (or (and X, splat (i32 C1)), (shl Y, splat (i32 C2))) -> VSLI X, Y, #C2
14672// where C1 is a mask that preserves the bits not written by the shift/insert,
14673// i.e. `C1 == (1 << C2) - 1`.
14675 SDValue ShiftOp, EVT VT,
14676 SDLoc dl) {
14677 // Match (and X, Mask)
14678 if (AndOp.getOpcode() != ISD::AND)
14679 return SDValue();
14680
14681 SDValue X = AndOp.getOperand(0);
14682 SDValue Mask = AndOp.getOperand(1);
14683
// The mask must be a constant (possibly a splatted build-vector constant).
14684 ConstantSDNode *MaskC = isConstOrConstSplat(Mask, false, true);
14685 if (!MaskC)
14686 return SDValue();
// Truncate to the element width so the comparison below is apples-to-apples.
14687 APInt MaskBits =
14688 MaskC->getAPIntValue().trunc(Mask.getScalarValueSizeInBits());
14689
14690 // Match shift (srl/shl Y, CntVec)
14691 int64_t Cnt = 0;
14692 bool IsShiftRight = false;
14693 SDValue Y;
14694
14695 if (ShiftOp.getOpcode() == ARMISD::VSHRuIMM) {
14696 IsShiftRight = true;
14697 Y = ShiftOp.getOperand(0);
14698 Cnt = ShiftOp.getConstantOperandVal(1);
14699 } else if (ShiftOp.getOpcode() == ARMISD::VSHLIMM) {
14700 Y = ShiftOp.getOperand(0);
14701 Cnt = ShiftOp.getConstantOperandVal(1);
14702 } else {
14703 return SDValue();
14704 }
14705
// The AND must keep exactly the bits the inserting shift leaves intact:
// the high Cnt bits for a right shift, the low Cnt bits for a left shift.
14706 unsigned ElemBits = VT.getScalarSizeInBits();
14707 APInt RequiredMask = IsShiftRight
14708 ? APInt::getHighBitsSet(ElemBits, (unsigned)Cnt)
14709 : APInt::getLowBitsSet(ElemBits, (unsigned)Cnt);
14710 if (MaskBits != RequiredMask)
14711 return SDValue();
14712
14713 unsigned Opc = IsShiftRight ? ARMISD::VSRIIMM : ARMISD::VSLIIMM;
14714 return DAG.getNode(Opc, dl, VT, X, Y, DAG.getConstant(Cnt, dl, MVT::i32));
14715}
14716
14717/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14719 const ARMSubtarget *Subtarget) {
14720 // Attempt to use immediate-form VORR
14721 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14722 SDLoc dl(N);
14723 EVT VT = N->getValueType(0);
14724 SelectionDAG &DAG = DCI.DAG;
14725
14726 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14727 return SDValue();
14728
// MVE predicate (i1) vectors get their own OR combine.
14729 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14730 VT == MVT::v8i1 || VT == MVT::v16i1))
14731 return PerformORCombine_i1(N, DAG, Subtarget);
14732
// If the RHS is a constant splat representable as a VORR-immediate,
// emit the immediate-form VORR (through VECTOR_REG_CASTs to VorrVT).
14733 APInt SplatBits, SplatUndef;
14734 unsigned SplatBitSize;
14735 bool HasAnyUndefs;
14736 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14737 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14738 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14739 SplatBitSize == 64) {
14740 EVT VorrVT;
14741 SDValue Val =
14742 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14743 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14744 if (Val.getNode()) {
14745 SDValue Input =
14746 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14747 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14748 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14749 }
14750 }
14751 }
14752
14753 if (!Subtarget->isThumb1Only()) {
14754 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14755 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14756 return Result;
14757 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14758 return Result;
14759 }
14760
14761 SDValue N0 = N->getOperand(0);
14762 SDValue N1 = N->getOperand(1);
14763
14764 // (or (and X, C1), (srl Y, C2)) -> VSRI X, Y, #C2
14765 // (or (and X, C1), (shl Y, C2)) -> VSLI X, Y, #C2
14766 if (VT.isVector() &&
14767 ((Subtarget->hasNEON() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) ||
14768 (Subtarget->hasMVEIntegerOps() &&
14769 (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32)))) {
// Try both operand orders since OR is commutative.
14770 if (SDValue ShiftInsert =
14771 PerformORCombineToShiftInsert(DAG, N0, N1, VT, dl))
14772 return ShiftInsert;
14773
14774 if (SDValue ShiftInsert =
14775 PerformORCombineToShiftInsert(DAG, N1, N0, VT, dl))
14776 return ShiftInsert;
14777 }
14778
14779 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14780 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14782
14783 // The code below optimizes (or (and X, Y), Z).
14784 // The AND operand needs to have a single user to make these optimizations
14785 // profitable.
14786 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14787 return SDValue();
14788
14789 APInt SplatUndef;
14790 unsigned SplatBitSize;
14791 bool HasAnyUndefs;
14792
14793 APInt SplatBits0, SplatBits1;
// NOTE(review): BVN0/BVN1 are declared on lines elided from this excerpt
// (presumably build-vector casts of the two AND mask operands).
14796 // Ensure that the second operand of both ands are constants
14797 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14798 HasAnyUndefs) && !HasAnyUndefs) {
14799 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14800 HasAnyUndefs) && !HasAnyUndefs) {
14801 // Ensure that the bit width of the constants are the same and that
14802 // the splat arguments are logical inverses as per the pattern we
14803 // are trying to simplify.
14804 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14805 SplatBits0 == ~SplatBits1) {
14806 // Canonicalize the vector type to make instruction selection
14807 // simpler.
14808 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14809 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14810 N0->getOperand(1),
14811 N0->getOperand(0),
14812 N1->getOperand(0));
14813 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14814 }
14815 }
14816 }
14817 }
14818
14819 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14820 // reasonable.
14821 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14822 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14823 return Res;
14824 }
14825
14826 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14827 return Result;
14828
14829 return SDValue();
14830}
14831
14834 const ARMSubtarget *Subtarget) {
14835 EVT VT = N->getValueType(0);
14836 SelectionDAG &DAG = DCI.DAG;
14837
14838 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14839 return SDValue();
14840
14841 if (!Subtarget->isThumb1Only()) {
14842 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14843 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14844 return Result;
14845
14846 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14847 return Result;
14848 }
14849
14850 if (Subtarget->hasMVEIntegerOps()) {
14851 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14852 SDValue N0 = N->getOperand(0);
14853 SDValue N1 = N->getOperand(1);
14854 const TargetLowering *TLI = Subtarget->getTargetLowering();
14855 if (TLI->isConstTrueVal(N1) &&
14856 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14857 if (CanInvertMVEVCMP(N0)) {
14858 SDLoc DL(N0);
// NOTE(review): the declarations of Ops and CC are on lines elided from
// this excerpt; CC is presumably the inverted condition code — confirm
// against upstream.
14860
// Rebuild the compare with the same operands but the opposite condition:
// VCMP keeps both compared values, VCMPZ only has one.
14862 Ops.push_back(N0->getOperand(0));
14863 if (N0->getOpcode() == ARMISD::VCMP)
14864 Ops.push_back(N0->getOperand(1));
14865 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14866 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14867 }
14868 }
14869 }
14870
14871 return SDValue();
14872}
14873
14874// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14875// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14876// their position in "to" (Rd).
14877static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14878 assert(N->getOpcode() == ARMISD::BFI);
14879
14880 SDValue From = N->getOperand(1);
14881 ToMask = ~N->getConstantOperandAPInt(2);
14882 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14883
14884 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14885 // #C in the base of the SHR.
14886 if (From->getOpcode() == ISD::SRL &&
14887 isa<ConstantSDNode>(From->getOperand(1))) {
14888 APInt Shift = From->getConstantOperandAPInt(1);
14889 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14890 FromMask <<= Shift.getLimitedValue(31);
14891 From = From->getOperand(0);
14892 }
14893
14894 return From;
14895}
14896
14897// If A and B contain one contiguous set of bits, does A | B == A . B?
14898//
14899// Neither A nor B must be zero.
14900static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14901 unsigned LastActiveBitInA = A.countr_zero();
14902 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14903 return LastActiveBitInA - 1 == FirstActiveBitInB;
14904}
14905
14907 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14908 APInt ToMask, FromMask;
14909 SDValue From = ParseBFI(N, ToMask, FromMask);
14910 SDValue To = N->getOperand(0);
14911
// The only candidate considered is the immediate base of this BFI.
14912 SDValue V = To;
14913 if (V.getOpcode() != ARMISD::BFI)
14914 return SDValue();
14915
// Only combine BFIs that copy bits from the same source value.
14916 APInt NewToMask, NewFromMask;
14917 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14918 if (NewFrom != From)
14919 return SDValue();
14920
14921 // Do the written bits conflict with any we've seen so far?
14922 if ((NewToMask & ToMask).getBoolValue())
14923 // Conflicting bits.
14924 return SDValue();
14925
14926 // Are the new bits contiguous when combined with the old bits?
// Both the destination masks and the source masks must concatenate, in
// either order, for the pair to merge into a single wider insertion.
14927 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14928 BitsProperlyConcatenate(FromMask, NewFromMask))
14929 return V;
14930 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14931 BitsProperlyConcatenate(NewFromMask, FromMask))
14932 return V;
14933
14934 return SDValue();
14935}
14936
14938 SDValue N0 = N->getOperand(0);
14939 SDValue N1 = N->getOperand(1);
14940
14941 if (N1.getOpcode() == ISD::AND) {
14942 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14943 // the bits being cleared by the AND are not demanded by the BFI.
// NOTE(review): N11C is declared on a line elided from this excerpt —
// presumably a dyn_cast of N1's constant mask operand.
14945 if (!N11C)
14946 return SDValue();
// Recover the width of the inserted field from the (inverted) BFI mask.
14947 unsigned InvMask = N->getConstantOperandVal(2);
14948 unsigned LSB = llvm::countr_zero(~InvMask);
14949 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14950 assert(Width <
14951 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14952 "undefined behavior");
14953 unsigned Mask = (1u << Width) - 1;
14954 unsigned Mask2 = N11C->getZExtValue();
// The AND is redundant when it clears no bit the BFI actually inserts.
14955 if ((Mask & (~Mask2)) == 0)
14956 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14957 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14958 return SDValue();
14959 }
14960
14961 // Look for another BFI to combine with.
14962 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14963 // We've found a BFI.
14964 APInt ToMask1, FromMask1;
14965 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14966
14967 APInt ToMask2, FromMask2;
14968 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14969 assert(From1 == From2);
14970 (void)From2;
14971
14972 // Create a new BFI, combining the two together.
14973 APInt NewFromMask = FromMask1 | FromMask2;
14974 APInt NewToMask = ToMask1 | ToMask2;
14975
14976 EVT VT = N->getValueType(0);
14977 SDLoc dl(N);
14978
// If the merged source field does not start at bit 0, shift the source
// down so the BFI can insert it from the bottom.
14979 if (NewFromMask[0] == 0)
14980 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14981 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14982 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14983 DAG.getConstant(~NewToMask, dl, VT));
14984 }
14985
14986 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14987 // that lower bit insertions are performed first, providing that M1 and M2
14988 // do no overlap. This can allow multiple BFI instructions to be combined
14989 // together by the other folds above.
14990 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14991 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14992 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14993
// Bail out unless the inner BFI has a single use, the masks are disjoint,
// and the outer insertion really is the lower-positioned one.
14994 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14995 ToMask1.countl_zero() < ToMask2.countl_zero())
14996 return SDValue();
14997
14998 EVT VT = N->getValueType(0);
14999 SDLoc dl(N);
15000 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
15001 N->getOperand(1), N->getOperand(2));
15002 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
15003 N0.getOperand(2));
15004 }
15005
15006 return SDValue();
15007}
15008
15009// Check that N is CMPZ(CSINC(0, 0, CC, X)),
15010// or CMPZ(CMOV(1, 0, CC, X))
15011// return X if valid.
15013 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
15014 return SDValue();
15015 SDValue CSInc = Cmp->getOperand(0);
15016
15017 // Ignore any `And 1` nodes that may not yet have been removed. We are
15018 // looking for a value that produces 1/0, so these have no effect on the
15019 // code.
15020 while (CSInc.getOpcode() == ISD::AND &&
15021 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
15022 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
15023 CSInc = CSInc.getOperand(0);
15024
// NOTE(review): each match below also writes the condition code to an
// out-parameter on lines elided from this excerpt — confirm upstream.
15025 if (CSInc.getOpcode() == ARMISD::CSINC &&
15026 isNullConstant(CSInc.getOperand(0)) &&
15027 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15029 return CSInc.getOperand(3);
15030 }
// CMOV(1, 0, CC, X) produces the same 1/0 pattern as the CSINC above.
15031 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15032 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15034 return CSInc.getOperand(3);
15035 }
// CMOV(0, 1, CC, X): the operands are swapped relative to the case above.
15036 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15037 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15040 return CSInc.getOperand(3);
15041 }
15042 return SDValue();
15043}
15044
15046 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15047 // t92: flags = ARMISD::CMPZ t74, 0
15048 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15049 // t96: flags = ARMISD::CMPZ t93, 0
15050 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
// NOTE(review): Cond is declared on a line elided from this excerpt; it is
// filled in by IsCMPZCSINC with the matched condition code.
15052 if (SDValue C = IsCMPZCSINC(N, Cond))
// Only the EQ form collapses to the original flags value directly.
15053 if (Cond == ARMCC::EQ)
15054 return C;
15055 return SDValue();
15056}
15057
15059 // Fold away an unnecessary CMPZ/CSINC
15060 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15061 // if C1==EQ -> CSXYZ A, B, C2, D
15062 // if C1==NE -> CSXYZ A, B, NOT(C2), D
// NOTE(review): Cond is declared on a line elided from this excerpt; it is
// filled in by IsCMPZCSINC with the inner condition code (C2 above).
15064 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15065 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15066 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15067 N->getOperand(1),
15068 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15069 if (N->getConstantOperandVal(2) == ARMCC::NE)
15070 return DAG.getNode(
15071 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15072 N->getOperand(1),
// NOTE(review): the final (inverted-condition) argument on the elided line
// presumably negates Cond — confirm against upstream.
15074 }
15075 return SDValue();
15076}
15077
15078/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15079/// ARMISD::VMOVRRD.
15082 const ARMSubtarget *Subtarget) {
15083 // vmovrrd(vmovdrr x, y) -> x,y
15084 SDValue InDouble = N->getOperand(0);
15085 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15086 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15087
15088 // vmovrrd(load f64) -> (load i32), (load i32)
15089 SDNode *InNode = InDouble.getNode();
15090 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15091 InNode->getValueType(0) == MVT::f64 &&
15092 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15093 !cast<LoadSDNode>(InNode)->isVolatile()) {
15094 // TODO: Should this be done for non-FrameIndex operands?
15095 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15096
15097 SelectionDAG &DAG = DCI.DAG;
15098 SDLoc DL(LD);
15099 SDValue BasePtr = LD->getBasePtr();
// First word of the f64 at the original address...
15100 SDValue NewLD1 =
15101 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15102 LD->getAlign(), LD->getMemOperand()->getFlags());
15103
// ...and the second word 4 bytes above it.
15104 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15105 DAG.getConstant(4, DL, MVT::i32));
15106
15107 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15108 LD->getPointerInfo().getWithOffset(4),
15109 commonAlignment(LD->getAlign(), 4),
15110 LD->getMemOperand()->getFlags());
15111
15112 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
// Big-endian keeps the high word first, so swap the two results.
15113 if (DCI.DAG.getDataLayout().isBigEndian())
15114 std::swap (NewLD1, NewLD2);
15115 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15116 return Result;
15117 }
15118
15119 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15120 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15121 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15122 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15123 SDValue BV = InDouble.getOperand(0);
15124 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15125 // change lane order under big endian.
15126 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15127 while (
15128 (BV.getOpcode() == ISD::BITCAST ||
15129 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15130 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15131 BVSwap = BV.getOpcode() == ISD::BITCAST;
15132 BV = BV.getOperand(0);
15133 }
15134 if (BV.getValueType() != MVT::v4i32)
15135 return SDValue();
15136
15137 // Handle buildvectors, pulling out the correct lane depending on
15138 // endianness.
// Each f64 lane of the extract covers two i32 lanes of the v4i32 source.
15139 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15140 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15141 SDValue Op0 = BV.getOperand(Offset);
15142 SDValue Op1 = BV.getOperand(Offset + 1);
15143 if (!Subtarget->isLittle() && BVSwap)
15144 std::swap(Op0, Op1);
15145
15146 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15147 }
15148
15149 // A chain of insert_vectors, grabbing the correct value of the chain of
15150 // inserts.
15151 SDValue Op0, Op1;
15152 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15153 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15154 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15155 Op0 = BV.getOperand(1);
15156 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15157 Op1 = BV.getOperand(1);
15158 }
15159 BV = BV.getOperand(0);
15160 }
15161 if (!Subtarget->isLittle() && BVSwap)
15162 std::swap(Op0, Op1);
// Only fold if both halves were found somewhere along the insert chain.
15163 if (Op0 && Op1)
15164 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15165 }
15166
15167 return SDValue();
15168}
15169
15170/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15171/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15173 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
// Look through bitcasts on both operands before matching.
15174 SDValue Op0 = N->getOperand(0);
15175 SDValue Op1 = N->getOperand(1);
15176 if (Op0.getOpcode() == ISD::BITCAST)
15177 Op0 = Op0.getOperand(0);
15178 if (Op1.getOpcode() == ISD::BITCAST)
15179 Op1 = Op1.getOperand(0);
// Both halves must be the two results of the same VMOVRRD, in order.
15180 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15181 Op0.getNode() == Op1.getNode() &&
15182 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15183 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15184 N->getValueType(0), Op0.getOperand(0));
15185 return SDValue();
15186}
15187
15190 SDValue Op0 = N->getOperand(0);
15191
15192 // VMOVhr (VMOVrh (X)) -> X
15193 if (Op0->getOpcode() == ARMISD::VMOVrh)
15194 return Op0->getOperand(0);
15195
15196 // FullFP16: half values are passed in S-registers, and we don't
15197 // need any of the bitcast and moves:
15198 //
15199 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15200 // t5: i32 = bitcast t2
15201 // t18: f16 = ARMISD::VMOVhr t5
15202 // =>
15203 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15204 if (Op0->getOpcode() == ISD::BITCAST) {
15205 SDValue Copy = Op0->getOperand(0);
15206 if (Copy.getValueType() == MVT::f32 &&
15207 Copy->getOpcode() == ISD::CopyFromReg) {
// A CopyFromReg may carry a trailing glue operand; preserve it if present.
15208 bool HasGlue = Copy->getNumOperands() == 3;
15209 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15210 HasGlue ? Copy->getOperand(2) : SDValue()};
15211 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15212 SDValue NewCopy =
// NOTE(review): the getNode call opening this expression is on a line
// elided from this excerpt (presumably ISD::CopyFromReg at SDLoc(N)).
15214 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15215 ArrayRef(Ops, HasGlue ? 3 : 2));
15216
15217 // Update Users, Chains, and Potential Glue.
15218 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15219 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15220 if (HasGlue)
15221 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15222 NewCopy.getValue(2));
15223
15224 return NewCopy;
15225 }
15226 }
15227
15228 // fold (VMOVhr (load x)) -> (load (f16*)x)
15229 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15230 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15231 LN0->getMemoryVT() == MVT::i16) {
15232 SDValue Load =
15233 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15234 LN0->getBasePtr(), LN0->getMemOperand());
// Rewire both the value and the chain users of the old load.
15235 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15236 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15237 return Load;
15238 }
15239 }
15240
15241 // Only the bottom 16 bits of the source register are used.
15242 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15243 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15244 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15245 return SDValue(N, 0);
15246
15247 return SDValue();
15248}
15249
15251 SDValue N0 = N->getOperand(0);
15252 EVT VT = N->getValueType(0);
15253
15254 // fold (VMOVrh (fpconst x)) -> const x
// NOTE(review): the dyn_cast introducing C (a ConstantFPSDNode) is on a
// line elided from this excerpt — confirm against upstream.
15256 APFloat V = C->getValueAPF();
// Reinterpret the fp constant's bit pattern as an integer constant.
15257 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15258 }
15259
15260 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15261 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15262 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15263
15264 SDValue Load =
15265 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15266 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
// Rewire both the value and the chain users of the old load.
15267 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15268 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15269 return Load;
15270 }
15271
15272 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
// NOTE(review): part of this condition (line 15274) is elided from this
// excerpt — confirm the extra guard against upstream.
15273 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15275 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15276 N0->getOperand(1));
15277
15278 return SDValue();
15279}
15280
15281/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15282/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15283/// i64 vector to have f64 elements, since the value can then be loaded
15284/// directly into a VFP register.
15286 unsigned NumElts = N->getValueType(0).getVectorNumElements();
// A single qualifying load operand is enough to make the bitcast worthwhile.
15287 for (unsigned i = 0; i < NumElts; ++i) {
15288 SDNode *Elt = N->getOperand(i).getNode();
15289 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15290 return true;
15291 }
15292 return false;
15293}
15294
15295/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15296/// ISD::BUILD_VECTOR.
15299 const ARMSubtarget *Subtarget) {
15300 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15301 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15302 // into a pair of GPRs, which is fine when the value is used as a scalar,
15303 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15304 SelectionDAG &DAG = DCI.DAG;
15305 if (N->getNumOperands() == 2)
15306 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15307 return RV;
15308
15309 // Load i64 elements as f64 values so that type legalization does not split
15310 // them up into i32 values.
15311 EVT VT = N->getValueType(0);
15312 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15313 return SDValue();
15314 SDLoc dl(N);
// NOTE(review): the declaration of Ops (the element vector) is on a line
// elided from this excerpt.
15316 unsigned NumElts = VT.getVectorNumElements();
15317 for (unsigned i = 0; i < NumElts; ++i) {
15318 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15319 Ops.push_back(V);
15320 // Make the DAGCombiner fold the bitcast.
15321 DCI.AddToWorklist(V.getNode());
15322 }
// Rebuild the same vector with f64 elements, then cast back to the
// original i64 vector type.
15323 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15324 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15325 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15326}
15327
15328/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15329static SDValue
15331 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15332 // At that time, we may have inserted bitcasts from integer to float.
15333 // If these bitcasts have survived DAGCombine, change the lowering of this
15334 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15335 // force to use floating point types.
15336
15337 // Make sure we can change the type of the vector.
15338 // This is possible iff:
15339 // 1. The vector is only used in a bitcast to a integer type. I.e.,
15340 // 1.1. Vector is used only once.
15341 // 1.2. Use is a bit convert to an integer type.
15342 // 2. The size of its operands are 32-bits (64-bits are not legal).
15343 EVT VT = N->getValueType(0);
15344 EVT EltVT = VT.getVectorElementType();
15345
15346 // Check 1.1. and 2.
15347 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15348 return SDValue();
15349
15350 // By construction, the input type must be float.
15351 assert(EltVT == MVT::f32 && "Unexpected type!");
15352
15353 // Check 1.2.
15354 SDNode *Use = *N->user_begin();
15355 if (Use->getOpcode() != ISD::BITCAST ||
15356 Use->getValueType(0).isFloatingPoint())
15357 return SDValue();
15358
15359 // Check profitability.
15360 // Model is, if more than half of the relevant operands are bitcast from
15361 // i32, turn the build_vector into a sequence of insert_vector_elt.
15362 // Relevant operands are everything that is not statically
15363 // (i.e., at compile time) bitcasted.
15364 unsigned NumOfBitCastedElts = 0;
15365 unsigned NumElts = VT.getVectorNumElements();
15366 unsigned NumOfRelevantElts = NumElts;
15367 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15368 SDValue Elt = N->getOperand(Idx);
15369 if (Elt->getOpcode() == ISD::BITCAST) {
15370 // Assume only bit cast to i32 will go away.
15371 if (Elt->getOperand(0).getValueType() == MVT::i32)
15372 ++NumOfBitCastedElts;
15373 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15374 // Constants are statically casted, thus do not count them as
15375 // relevant operands.
15376 --NumOfRelevantElts;
15377 }
15378
15379 // Check if more than half of the elements require a non-free bitcast.
15380 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15381 return SDValue();
15382
15383 SelectionDAG &DAG = DCI.DAG;
15384 // Create the new vector type.
15385 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15386 // Check if the type is legal.
15387 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15388 if (!TLI.isTypeLegal(VecVT))
15389 return SDValue();
15390
15391 // Combine:
15392 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15393 // => BITCAST INSERT_VECTOR_ELT
15394 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15395 // (BITCAST EN), N.
// Build the i32 vector lane by lane, skipping undef lanes.
15396 SDValue Vec = DAG.getUNDEF(VecVT);
15397 SDLoc dl(N);
15398 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15399 SDValue V = N->getOperand(Idx);
15400 if (V.isUndef())
15401 continue;
15402 if (V.getOpcode() == ISD::BITCAST &&
15403 V->getOperand(0).getValueType() == MVT::i32)
15404 // Fold obvious case.
15405 V = V.getOperand(0);
15406 else {
15407 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15408 // Make the DAGCombiner fold the bitcasts.
15409 DCI.AddToWorklist(V.getNode());
15410 }
15411 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15412 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15413 }
15414 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15415 // Make the DAGCombiner fold the bitcasts.
15416 DCI.AddToWorklist(Vec.getNode());
15417 return Vec;
15418}
15419
15420static SDValue
15422 EVT VT = N->getValueType(0);
15423 SDValue Op = N->getOperand(0);
15424 SDLoc dl(N);
15425
15426 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15427 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15428 // If the valuetypes are the same, we can remove the cast entirely.
15429 if (Op->getOperand(0).getValueType() == VT)
15430 return Op->getOperand(0);
15431 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15432 }
15433
15434 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15435 // more VPNOT which might get folded as else predicates.
15436 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15437 SDValue X =
15438 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
// 65535 = 0xffff: an all-ones 16-bit predicate as the XOR mask.
15439 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15440 DCI.DAG.getConstant(65535, dl, MVT::i32));
15441 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15442 }
15443
15444 // Only the bottom 16 bits of the source register are used.
15445 if (Op.getValueType() == MVT::i32) {
15446 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15447 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15448 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15449 return SDValue(N, 0);
15450 }
15451 return SDValue();
15452}
15453
// Target combine for ARMISD::VECTOR_REG_CAST (reinterpret a vector register
// without moving data). NOTE(review): the first signature line is elided in
// this view; the body reads N, DAG and the subtarget ST.
                                           const ARMSubtarget *ST) {
  EVT VT = N->getValueType(0);
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
  if (ST->isLittle())
    return DAG.getNode(ISD::BITCAST, dl, VT, Op);

  // VT VECTOR_REG_CAST (VT Op) -> Op
  if (Op.getValueType() == VT)
    return Op;
  // VECTOR_REG_CAST undef -> undef
  if (Op.isUndef())
    return DAG.getUNDEF(VT);

  // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
  if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
    // If the valuetypes are the same, we can remove the cast entirely.
    if (Op->getOperand(0).getValueType() == VT)
      return Op->getOperand(0);
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
  }

  return SDValue();
}
15481
// Target combine for ARMISD::VCMP (MVE vector compare): canonicalize
// compares against zero into VCMPZ, and swap operands of vdup-vs-x compares
// so the vdup lands on the right-hand side.
// NOTE(review): the first signature line is elided in this view.
                                   const ARMSubtarget *Subtarget) {
  if (!Subtarget->hasMVEIntegerOps())
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Operand 2 carries the condition code as a constant.
  ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
  SDLoc dl(N);

  // vcmp X, 0, cc -> vcmpz X, cc
  if (isZeroVector(Op1))
    return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));

  unsigned SwappedCond = getSwappedCondition(Cond);
  if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
    // vcmp 0, X, cc -> vcmpz X, reversed(cc)
    if (isZeroVector(Op0))
      return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
                         DAG.getConstant(SwappedCond, dl, MVT::i32));
    // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
    if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
      return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
                         DAG.getConstant(SwappedCond, dl, MVT::i32));
  }

  return SDValue();
}
15511
/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
/// NOTE(review): the signature lines and one call-continuation line are
/// elided in this view.
  // Bitcast an i64 load inserted into a vector to f64.
  // Otherwise, the i64 value will be legalized to a pair of i32 values.
  EVT VT = N->getValueType(0);
  SDNode *Elt = N->getOperand(1).getNode();
  // Only i64 elements coming from non-volatile normal loads are handled.
  if (VT.getVectorElementType() != MVT::i64 ||
      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  // Build the equivalent f64 vector type (continuation line elided here).
  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
  // Make the DAGCombiner fold the bitcasts.
  DCI.AddToWorklist(Vec.getNode());
  DCI.AddToWorklist(V.getNode());
  // Re-create the insert in the f64 domain, then cast back to VT.
  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
                               Vec, V, N->getOperand(2));
  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
}
15537
// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
// directly or bitcast to an integer if the original is a float vector.
// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
// NOTE(review): the signature line and one condition line are elided in this
// view.
static SDValue
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // Only run after DAG legalization, on i32 results, and when f64 is legal.
  if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
    return SDValue();

  SDValue Ext = SDValue(N, 0);
  // Look through a bitcast-from-f32 so float extracts are handled too.
  if (Ext.getOpcode() == ISD::BITCAST &&
      Ext.getOperand(0).getValueType() == MVT::f32)
    Ext = Ext.getOperand(0);
  // Require an extract of an even lane (the pair n/n+1 maps onto one f64).
  if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      Ext.getConstantOperandVal(1) % 2 != 0)
    return SDValue();
  // Don't disturb extracts that only feed an int-to-fp conversion.
  if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
                           Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
    return SDValue();

  SDValue Op0 = Ext.getOperand(0);
  EVT VecVT = Op0.getValueType();
  unsigned ResNo = Op0.getResNo();
  unsigned Lane = Ext.getConstantOperandVal(1);
  if (VecVT.getVectorNumElements() != 4)
    return SDValue();

  // Find another extract, of Lane + 1
  auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
    return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
           isa<ConstantSDNode>(V->getOperand(1)) &&
           V->getConstantOperandVal(1) == Lane + 1 &&
           V->getOperand(0).getResNo() == ResNo;
  });
  if (OtherIt == Op0->users().end())
    return SDValue();

  // For float extracts, we need to be converting to a i32 for both vector
  // lanes.
  SDValue OtherExt(*OtherIt, 0);
  if (OtherExt.getValueType() != MVT::i32) {
    if (!OtherExt->hasOneUse() ||
        OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
        OtherExt->user_begin()->getValueType(0) != MVT::i32)
      return SDValue();
    OtherExt = SDValue(*OtherExt->user_begin(), 0);
  }

  // Convert the type to a f64 and extract with a VMOVRRD.
  SDValue F64 = DCI.DAG.getNode(
      ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
      DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
      DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
  SDValue VMOVRRD =
      DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);

  // The odd-lane extract is replaced by the VMOVRRD's second result.
  DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
  return VMOVRRD;
}
15602
// Target combine for ISD::EXTRACT_VECTOR_ELT.
// NOTE(review): the first signature line and several interior lines are
// elided in this view (including the Vec/SubIdx computations near the end).
                                           const ARMSubtarget *ST) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // extract (vdup x) -> x
  if (Op0->getOpcode() == ARMISD::VDUP) {
    SDValue X = Op0->getOperand(0);
    // Prefer the dedicated h-register move nodes for f16<->i32.
    if (VT == MVT::f16 && X.getValueType() == MVT::i32)
      return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
    if (VT == MVT::i32 && X.getValueType() == MVT::f16)
      return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
    if (VT == MVT::f32 && X.getValueType() == MVT::i32)
      return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);

    // Look through a chain of bitcasts for a value of the requested type.
    while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
      X = X->getOperand(0);
    if (X.getValueType() == VT)
      return X;
  }

  // extract ARM_BUILD_VECTOR -> x
  if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
      isa<ConstantSDNode>(N->getOperand(1)) &&
      N->getConstantOperandVal(1) < Op0.getNumOperands()) {
    return Op0.getOperand(N->getConstantOperandVal(1));
  }

  // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
  if (Op0.getValueType() == MVT::v4i32 &&
      isa<ConstantSDNode>(N->getOperand(1)) &&
      Op0.getOpcode() == ISD::BITCAST &&
      Op0.getOperand(0).getValueType() == MVT::v2f64) {
    SDValue BV = Op0.getOperand(0);
    unsigned Offset = N->getConstantOperandVal(1);
    SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
    if (MOV.getOpcode() == ARMISD::VMOVDRR)
      // Big-endian selects the other half of the VMOVDRR register pair.
      return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
  }

  // extract x, n; extract x, n+1 -> VMOVRRD x
  if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
    return R;

  // extract (MVETrunc(x)) -> extract x
  if (Op0->getOpcode() == ARMISD::MVETRUNC) {
    unsigned Idx = N->getConstantOperandVal(1);
    // NOTE(review): the right-hand sides of Vec and SubIdx are elided in
    // this view; they map Idx onto a source operand and a lane within it.
    unsigned Vec =
    unsigned SubIdx =
    return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
                           DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
  }

  return SDValue();
}
15663
// Combine for ISD::SIGN_EXTEND_INREG.
// NOTE(review): the signature line is elided in this view; the body reads N
// and DAG.
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // sext_inreg(VGETLANEu) -> VGETLANEs
  // Only when the sign-extension width equals the source element width, so
  // the signed lane-get produces exactly the same bits.
  if (Op.getOpcode() == ARMISD::VGETLANEu &&
      cast<VTSDNode>(N->getOperand(1))->getVT() ==
          Op.getOperand(0).getValueType().getScalarType())
    return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
                       Op.getOperand(1));

  return SDValue();
}
15677
// Combine for ISD::INSERT_SUBVECTOR: rewrite an "aligned" half-width
// insertion as a CONCAT_VECTORS.
// NOTE(review): the signature line and one condition line (after the
// trailing `||`) are elided in this view.
static SDValue
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);
  uint64_t IdxVal = N->getConstantOperandVal(2);
  EVT VecVT = Vec.getValueType();
  EVT SubVT = SubVec.getValueType();

  // Only do this for legal fixed vector types.
  if (!VecVT.isFixedLengthVector() ||
      !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
    return SDValue();

  // Ignore widening patterns.
  if (IdxVal == 0 && Vec.isUndef())
    return SDValue();

  // Subvector must be half the width and an "aligned" insertion.
  unsigned NumSubElts = SubVT.getVectorNumElements();
  if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
      (IdxVal != 0 && IdxVal != NumSubElts))
    return SDValue();

  // Fold insert_subvector -> concat_vectors
  // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
  // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
  SDLoc DL(N);
  SDValue Lo, Hi;
  if (IdxVal == 0) {
    Lo = SubVec;
    Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
                         DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
  } else {
    Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
                         DCI.DAG.getVectorIdxConstant(0, DL));
    Hi = SubVec;
  }
  return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
}
15718
// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
// NOTE(review): the first signature line is elided in this view; N is the
// shuffle node being combined.
                                              SelectionDAG &DAG) {
  SDValue Trunc = N->getOperand(0);
  EVT VT = Trunc.getValueType();
  // Only a shuffle of a single MVETRUNC (second shuffle operand undef).
  if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
    return SDValue();

  SDLoc DL(Trunc);
  // One mask orientation keeps the MVETRUNC operand order...
  if (isVMOVNTruncMask(N->getMask(), VT, false))
    return DAG.getNode(
        ARMISD::VMOVN, DL, VT,
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
        DAG.getConstant(1, DL, MVT::i32));
  // ...the other orientation swaps the two source operands.
  else if (isVMOVNTruncMask(N->getMask(), VT, true))
    return DAG.getNode(
        ARMISD::VMOVN, DL, VT,
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
        DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
        DAG.getConstant(1, DL, MVT::i32));
  return SDValue();
}
15742
/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
/// NOTE(review): the signature and the preceding statement producing R
/// (presumably a PerformShuffleVMOVNCombine call) are elided in this view.
    return R;

  // The LLVM shufflevector instruction does not require the shuffle mask
  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
  // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
  // operands do not match the mask length, they are extended by concatenating
  // them with undef vectors. That is probably the right thing for other
  // targets, but for NEON it is better to concatenate two double-register
  // size vector operands into a single quad-register size vector. Do that
  // transformation here:
  // shuffle(concat(v1, undef), concat(v2, undef)) ->
  // shuffle(concat(v1, v2), undef)
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
      Op0.getNumOperands() != 2 ||
      Op1.getNumOperands() != 2)
    return SDValue();
  SDValue Concat0Op1 = Op0.getOperand(1);
  SDValue Concat1Op1 = Op1.getOperand(1);
  if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
    return SDValue();
  // Skip the transformation if any of the types are illegal.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  if (!TLI.isTypeLegal(VT) ||
      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
      !TLI.isTypeLegal(Concat1Op1.getValueType()))
    return SDValue();

  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                  Op0.getOperand(0), Op1.getOperand(0));
  // Translate the shuffle mask.
  SmallVector<int, 16> NewMask;
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfElts = NumElts/2;
  for (unsigned n = 0; n < NumElts; ++n) {
    int MaskElt = SVN->getMaskElt(n);
    int NewElt = -1;
    // Low-half elements of the first source keep their index; low-half
    // elements of the second source map into the top half of NewConcat.
    if (MaskElt < (int)HalfElts)
      NewElt = MaskElt;
    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
      NewElt = HalfElts + MaskElt - NumElts;
    NewMask.push_back(NewElt);
  }
  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
                              DAG.getUNDEF(VT), NewMask);
}
15797
/// Load/store instruction that can be merged with a base address
/// update
/// NOTE(review): the struct keyword line and the other member declarations
/// are elided in this view; only AddrOpIdx is visible.
  // Index of the address operand within the node's operand list.
  unsigned AddrOpIdx;
};
15806
// Describes a pointer-update (ADD-like) user of a load/store address.
// NOTE(review): the struct keyword line and the first two member
// declarations (documented below) are elided in this view.
  /// Instruction that updates a pointer
  /// Pointer increment operand
  /// Pointer increment value if it is a constant, or 0 otherwise
  unsigned ConstInc;
};
15815
// Returns true if it is safe to fold the address-update User into the
// load/store N (i.e. doing so cannot create a cycle in the DAG).
// NOTE(review): the signature line and the Visited/Worklist declarations
// are elided in this view.
  // Check that the add is independent of the load/store.
  // Otherwise, folding it would create a cycle. Search through Addr
  // as well, since the User may not be a direct user of Addr and
  // only share a base pointer.
  Worklist.push_back(N);
  Worklist.push_back(User);
  // Bound the predecessor search so pathological DAGs stay cheap.
  const unsigned MaxSteps = 1024;
  if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
      SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
    return false;
  return true;
}
15831
// Attempt to fold the base-address update described by User into the
// NEON/generic load/store described by Target, producing a single
// post-indexed _UPD node. Returns true (and rewrites the DAG via DCI) on
// success. When SimpleConstIncOnly is set, only an increment exactly equal
// to the memory access size is accepted.
// NOTE(review): part of the signature and the declaration of the Ops
// operand vector are elided in this view.
                                      struct BaseUpdateUser &User,
                                      bool SimpleConstIncOnly,
  SelectionDAG &DAG = DCI.DAG;
  SDNode *N = Target.N;
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // Find the new opcode for the updating load/store.
  bool isLoadOp = true;
  bool isLaneOp = false;
  // Workaround for vst1x and vld1x intrinsics which do not have alignment
  // as an operand.
  bool hasAlignment = true;
  unsigned NewOpc = 0;
  unsigned NumVecs = 0;
  if (Target.isIntrinsic) {
    // Map each NEON load/store intrinsic onto its updating (_UPD) opcode.
    unsigned IntNo = N->getConstantOperandVal(1);
    switch (IntNo) {
    default:
      llvm_unreachable("unexpected intrinsic for Neon base update");
    case Intrinsic::arm_neon_vld1:
      NewOpc = ARMISD::VLD1_UPD;
      NumVecs = 1;
      break;
    case Intrinsic::arm_neon_vld2:
      NewOpc = ARMISD::VLD2_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_neon_vld3:
      NewOpc = ARMISD::VLD3_UPD;
      NumVecs = 3;
      break;
    case Intrinsic::arm_neon_vld4:
      NewOpc = ARMISD::VLD4_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_neon_vld1x2:
      NewOpc = ARMISD::VLD1x2_UPD;
      NumVecs = 2;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld1x3:
      NewOpc = ARMISD::VLD1x3_UPD;
      NumVecs = 3;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld1x4:
      NewOpc = ARMISD::VLD1x4_UPD;
      NumVecs = 4;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vld2dup:
      NewOpc = ARMISD::VLD2DUP_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_neon_vld3dup:
      NewOpc = ARMISD::VLD3DUP_UPD;
      NumVecs = 3;
      break;
    case Intrinsic::arm_neon_vld4dup:
      NewOpc = ARMISD::VLD4DUP_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_neon_vld2lane:
      NewOpc = ARMISD::VLD2LN_UPD;
      NumVecs = 2;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vld3lane:
      NewOpc = ARMISD::VLD3LN_UPD;
      NumVecs = 3;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vld4lane:
      NewOpc = ARMISD::VLD4LN_UPD;
      NumVecs = 4;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst1:
      NewOpc = ARMISD::VST1_UPD;
      NumVecs = 1;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst2:
      NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst3:
      NewOpc = ARMISD::VST3_UPD;
      NumVecs = 3;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst4:
      NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      break;
    case Intrinsic::arm_neon_vst2lane:
      NewOpc = ARMISD::VST2LN_UPD;
      NumVecs = 2;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst3lane:
      NewOpc = ARMISD::VST3LN_UPD;
      NumVecs = 3;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst4lane:
      NewOpc = ARMISD::VST4LN_UPD;
      NumVecs = 4;
      isLoadOp = false;
      isLaneOp = true;
      break;
    case Intrinsic::arm_neon_vst1x2:
      NewOpc = ARMISD::VST1x2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vst1x3:
      NewOpc = ARMISD::VST1x3_UPD;
      NumVecs = 3;
      isLoadOp = false;
      hasAlignment = false;
      break;
    case Intrinsic::arm_neon_vst1x4:
      NewOpc = ARMISD::VST1x4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      hasAlignment = false;
      break;
    }
  } else {
    // Non-intrinsic nodes: VLDxDUP nodes plus generic loads and stores.
    isLaneOp = true;
    switch (N->getOpcode()) {
    default:
      llvm_unreachable("unexpected opcode for Neon base update");
    case ARMISD::VLD1DUP:
      NewOpc = ARMISD::VLD1DUP_UPD;
      NumVecs = 1;
      break;
    case ARMISD::VLD2DUP:
      NewOpc = ARMISD::VLD2DUP_UPD;
      NumVecs = 2;
      break;
    case ARMISD::VLD3DUP:
      NewOpc = ARMISD::VLD3DUP_UPD;
      NumVecs = 3;
      break;
    case ARMISD::VLD4DUP:
      NewOpc = ARMISD::VLD4DUP_UPD;
      NumVecs = 4;
      break;
    case ISD::LOAD:
      NewOpc = ARMISD::VLD1_UPD;
      NumVecs = 1;
      isLaneOp = false;
      break;
    case ISD::STORE:
      NewOpc = ARMISD::VST1_UPD;
      NumVecs = 1;
      isLaneOp = false;
      isLoadOp = false;
      break;
    }
  }

  // Find the size of memory referenced by the load/store.
  EVT VecTy;
  if (isLoadOp) {
    VecTy = N->getValueType(0);
  } else if (Target.isIntrinsic) {
    VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
  } else {
    assert(Target.isStore &&
           "Node has to be a load, a store, or an intrinsic!");
    VecTy = N->getOperand(1).getValueType();
  }

  bool isVLDDUPOp =
      NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
      NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;

  unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
  // Lane and dup operations only touch one element per vector.
  if (isLaneOp || isVLDDUPOp)
    NumBytes /= VecTy.getVectorNumElements();

  if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
    // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
    // separate instructions that make it harder to use a non-constant update.
    return false;
  }

  if (SimpleConstIncOnly && User.ConstInc != NumBytes)
    return false;

  if (!isValidBaseUpdate(N, User.N))
    return false;

  // OK, we found an ADD we can fold into the base update.
  // Now, create a _UPD node, taking care of not breaking alignment.

  EVT AlignedVecTy = VecTy;
  Align Alignment = MemN->getAlign();

  // If this is a less-than-standard-aligned load/store, change the type to
  // match the standard alignment.
  // The alignment is overlooked when selecting _UPD variants; and it's
  // easier to introduce bitcasts here than fix that.
  // There are 3 ways to get to this base-update combine:
  // - intrinsics: they are assumed to be properly aligned (to the standard
  //   alignment of the memory type), so we don't need to do anything.
  // - ARMISD::VLDx nodes: they are only generated from the aforementioned
  //   intrinsics, so, likewise, there's nothing to do.
  // - generic load/store instructions: the alignment is specified as an
  //   explicit operand, rather than implicitly as the standard alignment
  //   of the memory type (like the intrinsics). We need to change the
  //   memory type to match the explicit alignment. That way, we don't
  //   generate non-standard-aligned ARMISD::VLDx nodes.
  if (isa<LSBaseSDNode>(N)) {
    if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
      MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
      assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
      assert(!isLaneOp && "Unexpected generic load/store lane.");
      unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
      AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
    }
    // Don't set an explicit alignment on regular load/stores that we want
    // to transform to VLD/VST 1_UPD nodes.
    // This matches the behavior of regular load/stores, which only get an
    // explicit alignment if the MMO alignment is larger than the standard
    // alignment of the memory type.
    // Intrinsics, however, always get an explicit alignment, set to the
    // alignment of the MMO.
    Alignment = Align(1);
  }

  // Create the new updating load/store node.
  // First, create an SDVTList for the new updating node's results.
  EVT Tys[6];
  unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
  unsigned n;
  for (n = 0; n < NumResultVecs; ++n)
    Tys[n] = AlignedVecTy;
  // The _UPD node additionally produces the updated pointer and a chain.
  Tys[n++] = MVT::i32;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));

  // Then, gather the new node's operands.
  // NOTE(review): the declaration of Ops is elided in this view.
  Ops.push_back(N->getOperand(0)); // incoming chain
  Ops.push_back(N->getOperand(Target.AddrOpIdx));
  Ops.push_back(User.Inc);

  if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
    // Try to match the intrinsic's signature
    Ops.push_back(StN->getValue());
  } else {
    // Loads (and of course intrinsics) match the intrinsics' signature,
    // so just add all but the alignment operand.
    unsigned LastOperand =
        hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
    for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
      Ops.push_back(N->getOperand(i));
  }

  // For all node types, the alignment operand is always the last one.
  Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));

  // If this is a non-standard-aligned STORE, the penultimate operand is the
  // stored value. Bitcast it to the aligned type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
    SDValue &StVal = Ops[Ops.size() - 2];
    StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
  }

  EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
  SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                         MemN->getMemOperand());

  // Update the uses.
  SmallVector<SDValue, 5> NewResults;
  for (unsigned i = 0; i < NumResultVecs; ++i)
    NewResults.push_back(SDValue(UpdN.getNode(), i));

  // If this is an non-standard-aligned LOAD, the first result is the loaded
  // value. Bitcast it to the expected result type.
  if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
    SDValue &LdVal = NewResults[0];
    LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
  }

  NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
  // Replace both the memory node and the pointer-update node.
  DCI.CombineTo(N, NewResults);
  DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));

  return true;
}
16135
// If (opcode ptr inc) is an ADD-like instruction, return the
// increment value. Otherwise return 0.
// NOTE(review): the declaration of CInc (a constant-node view of Inc) is
// elided in this view.
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
                                         SDValue Inc, const SelectionDAG &DAG) {
  if (!CInc)
    return 0;

  switch (Opcode) {
  case ARMISD::VLD1_UPD:
  case ISD::ADD:
    return CInc->getZExtValue();
  case ISD::OR: {
    if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
      // (OR ptr inc) is the same as (ADD ptr inc)
      return CInc->getZExtValue();
    }
    return 0;
  }
  default:
    return 0;
  }
}
16159
// Decompose an ADD/OR/VLD1_UPD pointer-increment node N into its base
// pointer (*Ptr) and constant increment (*CInc); returns false if N is not
// such a node with a constant increment.
// NOTE(review): the signature line is elided in this view.
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::OR: {
    if (isa<ConstantSDNode>(N->getOperand(1))) {
      *Ptr = N->getOperand(0);
      *CInc = N->getOperand(1);
      return true;
    }
    return false;
  }
  case ARMISD::VLD1_UPD: {
    // For VLD1_UPD the pointer is operand 1 and the increment operand 2.
    if (isa<ConstantSDNode>(N->getOperand(2))) {
      *Ptr = N->getOperand(1);
      *CInc = N->getOperand(2);
      return true;
    }
    return false;
  }
  default:
    return false;
  }
}
16183
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
/// NEON load/store intrinsics, and generic vector load/stores, to merge
/// base address updates.
/// For generic load/stores, the memory type is assumed to be a vector.
/// The caller is assumed to have checked legality.
/// NOTE(review): the signature lines and the declaration of the BaseUpdates
/// vector are elided in this view.
  const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                            N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
  const bool isStore = N->getOpcode() == ISD::STORE;
  // The address operand's position depends on the node kind.
  const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
  BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};

  // Limit the number of possible base-updates we look at to prevent degenerate
  // cases.
  unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;

  SDValue Addr = N->getOperand(AddrOpIdx);


  // Search for a use of the address operand that is an increment.
  for (SDUse &Use : Addr->uses()) {
    SDNode *User = Use.getUser();
    if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
      continue;

    // The increment is the user's operand that is not the address itself.
    SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
    unsigned ConstInc =
        getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);

    if (ConstInc || User->getOpcode() == ISD::ADD) {
      BaseUpdates.push_back({User, Inc, ConstInc});
      if (BaseUpdates.size() >= MaxBaseUpdates)
        break;
    }
  }

  // If the address is a constant pointer increment itself, find
  // another constant increment that has the same base operand
  SDValue Base;
  SDValue CInc;
  if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
    unsigned Offset =
        getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
    if (Offset) {
      for (SDUse &Use : Base->uses()) {

        SDNode *User = Use.getUser();
        if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
            User->getNumOperands() != 2)
          continue;

        SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
        unsigned UserOffset =
            getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);

        // Only increments strictly past Addr's own offset can be expressed
        // as a positive increment relative to Addr.
        if (!UserOffset || UserOffset <= Offset)
          continue;

        unsigned NewConstInc = UserOffset - Offset;
        SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
        BaseUpdates.push_back({User, NewInc, NewConstInc});
        if (BaseUpdates.size() >= MaxBaseUpdates)
          break;
      }
    }
  }

  // Try to fold the load/store with an update that matches memory
  // access size. This should work well for sequential loads.
  unsigned NumValidUpd = BaseUpdates.size();
  for (unsigned I = 0; I < NumValidUpd; I++) {
    BaseUpdateUser &User = BaseUpdates[I];
    if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
      return SDValue();
  }

  // Try to fold with other users. Non-constant updates are considered
  // first, and constant updates are sorted to not break a sequence of
  // strided accesses (if there is any).
  llvm::stable_sort(BaseUpdates,
                    [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
                      return LHS.ConstInc < RHS.ConstInc;
                    });
  for (BaseUpdateUser &User : BaseUpdates) {
    if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
      return SDValue();
  }
  return SDValue();
}
16275
// Entry point for the NEON base-update combine: defers to CombineBaseUpdate,
// but only once legalization has started and not when invoked from the
// legalizer itself.
// NOTE(review): the signature lines are elided in this view.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}
16283
// Target combine for MVE vld2q/vld4q/vst2q/vst4q intrinsics: fold a
// following pointer ADD with a matching constant increment into a
// post-incrementing _UPD node.
// NOTE(review): the signature lines and a few local declarations (the
// Visited/Worklist sets, the CInc constant cast, and the Ops vector) are
// elided in this view.
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Addr = N->getOperand(2);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // For the stores, where there are multiple intrinsics we only actually want
  // to post-inc the last of the them.
  unsigned IntNo = N->getConstantOperandVal(1);
  if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
    return SDValue();
  if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
    return SDValue();

  // Search for a use of the address operand that is an increment.
  for (SDUse &Use : Addr->uses()) {
    SDNode *User = Use.getUser();
    if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store. Otherwise, folding
    // it would create a cycle. We can avoid searching through Addr as it's a
    // predecessor to both.
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    const unsigned MaxSteps = 1024;
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    switch (IntNo) {
    default:
      llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
    case Intrinsic::arm_mve_vld2q:
      NewOpc = ARMISD::VLD2_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_mve_vld4q:
      NewOpc = ARMISD::VLD4_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_mve_vst2q:
      NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      break;
    case Intrinsic::arm_mve_vst4q:
      NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      break;
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else {
      VecTy = N->getOperand(3).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (!CInc || CInc->getZExtValue() != NumBytes)
      continue;

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    // The _UPD node also produces the updated pointer and a chain.
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));

    // Then, gather the new node's operands.
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(2)); // ptr
    Ops.push_back(Inc);

    // Copy the remaining intrinsic operands unchanged.
    for (unsigned i = 3; i < N->getNumOperands(); ++i)
      Ops.push_back(N->getOperand(i));

    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
                                           MemN->getMemOperand());

    // Update the uses.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }

  return SDValue();
}
16401
16402/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16403/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16404/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16405/// return true.
16407 SelectionDAG &DAG = DCI.DAG;
16408 EVT VT = N->getValueType(0);
16409 // vldN-dup instructions only support 64-bit vectors for N > 1.
16410 if (!VT.is64BitVector())
16411 return false;
16412
16413 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16414 SDNode *VLD = N->getOperand(0).getNode();
16415 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16416 return false;
16417 unsigned NumVecs = 0;
16418 unsigned NewOpc = 0;
16419 unsigned IntNo = VLD->getConstantOperandVal(1);
16420 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16421 NumVecs = 2;
16422 NewOpc = ARMISD::VLD2DUP;
16423 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16424 NumVecs = 3;
16425 NewOpc = ARMISD::VLD3DUP;
16426 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16427 NumVecs = 4;
16428 NewOpc = ARMISD::VLD4DUP;
16429 } else {
16430 return false;
16431 }
16432
16433 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16434 // numbers match the load.
16435 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16436 for (SDUse &Use : VLD->uses()) {
16437 // Ignore uses of the chain result.
16438 if (Use.getResNo() == NumVecs)
16439 continue;
16440 SDNode *User = Use.getUser();
16441 if (User->getOpcode() != ARMISD::VDUPLANE ||
16442 VLDLaneNo != User->getConstantOperandVal(1))
16443 return false;
16444 }
16445
16446 // Create the vldN-dup node.
16447 EVT Tys[5];
16448 unsigned n;
16449 for (n = 0; n < NumVecs; ++n)
16450 Tys[n] = VT;
16451 Tys[n] = MVT::Other;
16452 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16453 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16455 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16456 Ops, VLDMemInt->getMemoryVT(),
16457 VLDMemInt->getMemOperand());
16458
16459 // Update the uses.
16460 for (SDUse &Use : VLD->uses()) {
16461 unsigned ResNo = Use.getResNo();
16462 // Ignore uses of the chain result.
16463 if (ResNo == NumVecs)
16464 continue;
16465 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16466 }
16467
16468 // Now the vldN-lane intrinsic is dead except for its chain result.
16469 // Update uses of the chain.
16470 std::vector<SDValue> VLDDupResults;
16471 for (unsigned n = 0; n < NumVecs; ++n)
16472 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16473 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16474 DCI.CombineTo(VLD, VLDDupResults);
16475
16476 return true;
16477}
16478
16479/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16480/// ARMISD::VDUPLANE.
16483 const ARMSubtarget *Subtarget) {
16484 SDValue Op = N->getOperand(0);
16485 EVT VT = N->getValueType(0);
16486
16487 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16488 if (Subtarget->hasMVEIntegerOps()) {
16489 EVT ExtractVT = VT.getVectorElementType();
16490 // We need to ensure we are creating a legal type.
16491 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16492 ExtractVT = MVT::i32;
16493 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16494 N->getOperand(0), N->getOperand(1));
16495 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16496 }
16497
16498 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16499 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16500 if (CombineVLDDUP(N, DCI))
16501 return SDValue(N, 0);
16502
16503 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16504 // redundant. Ignore bit_converts for now; element sizes are checked below.
16505 while (Op.getOpcode() == ISD::BITCAST)
16506 Op = Op.getOperand(0);
16507 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16508 return SDValue();
16509
16510 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16511 unsigned EltSize = Op.getScalarValueSizeInBits();
16512 // The canonical VMOV for a zero vector uses a 32-bit element size.
16513 unsigned Imm = Op.getConstantOperandVal(0);
16514 unsigned EltBits;
16515 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16516 EltSize = 8;
16517 if (EltSize > VT.getScalarSizeInBits())
16518 return SDValue();
16519
16520 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16521}
16522
16523/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16525 const ARMSubtarget *Subtarget) {
16526 SDValue Op = N->getOperand(0);
16527 SDLoc dl(N);
16528
16529 if (Subtarget->hasMVEIntegerOps()) {
16530 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16531 // need to come from a GPR.
16532 if (Op.getValueType() == MVT::f32)
16533 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16534 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16535 else if (Op.getValueType() == MVT::f16)
16536 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16537 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16538 }
16539
16540 if (!Subtarget->hasNEON())
16541 return SDValue();
16542
16543 // Match VDUP(LOAD) -> VLD1DUP.
16544 // We match this pattern here rather than waiting for isel because the
16545 // transform is only legal for unindexed loads.
16546 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16547 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16548 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16549 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16550 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16551 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16552 SDValue VLDDup =
16554 LD->getMemoryVT(), LD->getMemOperand());
16555 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16556 return VLDDup;
16557 }
16558
16559 return SDValue();
16560}
16561
16564 const ARMSubtarget *Subtarget) {
16565 EVT VT = N->getValueType(0);
16566
16567 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16568 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16570 return CombineBaseUpdate(N, DCI);
16571
16572 return SDValue();
16573}
16574
16575// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16576// pack all of the elements in one place. Next, store to memory in fewer
16577// chunks.
16579 SelectionDAG &DAG) {
16580 SDValue StVal = St->getValue();
16581 EVT VT = StVal.getValueType();
16582 if (!St->isTruncatingStore() || !VT.isVector())
16583 return SDValue();
16584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16585 EVT StVT = St->getMemoryVT();
16586 unsigned NumElems = VT.getVectorNumElements();
16587 assert(StVT != VT && "Cannot truncate to the same type");
16588 unsigned FromEltSz = VT.getScalarSizeInBits();
16589 unsigned ToEltSz = StVT.getScalarSizeInBits();
16590
16591 // From, To sizes and ElemCount must be pow of two
16592 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16593 return SDValue();
16594
16595 // We are going to use the original vector elt for storing.
16596 // Accumulated smaller vector elements must be a multiple of the store size.
16597 if (0 != (NumElems * FromEltSz) % ToEltSz)
16598 return SDValue();
16599
16600 unsigned SizeRatio = FromEltSz / ToEltSz;
16601 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16602
16603 // Create a type on which we perform the shuffle.
16604 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16605 NumElems * SizeRatio);
16606 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16607
16608 SDLoc DL(St);
16609 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16610 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16611 for (unsigned i = 0; i < NumElems; ++i)
16612 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16613 : i * SizeRatio;
16614
16615 // Can't shuffle using an illegal type.
16616 if (!TLI.isTypeLegal(WideVecVT))
16617 return SDValue();
16618
16619 SDValue Shuff = DAG.getVectorShuffle(
16620 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16621 // At this point all of the data is stored at the bottom of the
16622 // register. We now need to save it to mem.
16623
16624 // Find the largest store unit
16625 MVT StoreType = MVT::i8;
16626 for (MVT Tp : MVT::integer_valuetypes()) {
16627 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16628 StoreType = Tp;
16629 }
16630 // Didn't find a legal store type.
16631 if (!TLI.isTypeLegal(StoreType))
16632 return SDValue();
16633
16634 // Bitcast the original vector into a vector of store-size units
16635 EVT StoreVecVT =
16636 EVT::getVectorVT(*DAG.getContext(), StoreType,
16637 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16638 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16639 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16641 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16642 TLI.getPointerTy(DAG.getDataLayout()));
16643 SDValue BasePtr = St->getBasePtr();
16644
16645 // Perform one or more big stores into memory.
16646 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16647 for (unsigned I = 0; I < E; I++) {
16648 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16649 ShuffWide, DAG.getIntPtrConstant(I, DL));
16650 SDValue Ch =
16651 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16652 St->getAlign(), St->getMemOperand()->getFlags());
16653 BasePtr =
16654 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16655 Chains.push_back(Ch);
16656 }
16657 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16658}
16659
16660// Try taking a single vector store from an fpround (which would otherwise turn
16661// into an expensive buildvector) and splitting it into a series of narrowing
16662// stores.
16664 SelectionDAG &DAG) {
16665 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16666 return SDValue();
16667 SDValue Trunc = St->getValue();
16668 if (Trunc->getOpcode() != ISD::FP_ROUND)
16669 return SDValue();
16670 EVT FromVT = Trunc->getOperand(0).getValueType();
16671 EVT ToVT = Trunc.getValueType();
16672 if (!ToVT.isVector())
16673 return SDValue();
16675 EVT ToEltVT = ToVT.getVectorElementType();
16676 EVT FromEltVT = FromVT.getVectorElementType();
16677
16678 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16679 return SDValue();
16680
16681 unsigned NumElements = 4;
16682 if (FromVT.getVectorNumElements() % NumElements != 0)
16683 return SDValue();
16684
16685 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16686 // use the VMOVN over splitting the store. We are looking for patterns of:
16687 // !rev: 0 N 1 N+1 2 N+2 ...
16688 // rev: N 0 N+1 1 N+2 2 ...
16689 // The shuffle may either be a single source (in which case N = NumElts/2) or
16690 // two inputs extended with concat to the same size (in which case N =
16691 // NumElts).
16692 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16693 ArrayRef<int> M = SVN->getMask();
16694 unsigned NumElts = ToVT.getVectorNumElements();
16695 if (SVN->getOperand(1).isUndef())
16696 NumElts /= 2;
16697
16698 unsigned Off0 = Rev ? NumElts : 0;
16699 unsigned Off1 = Rev ? 0 : NumElts;
16700
16701 for (unsigned I = 0; I < NumElts; I += 2) {
16702 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16703 return false;
16704 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16705 return false;
16706 }
16707
16708 return true;
16709 };
16710
16711 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16712 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16713 return SDValue();
16714
16715 LLVMContext &C = *DAG.getContext();
16716 SDLoc DL(St);
16717 // Details about the old store
16718 SDValue Ch = St->getChain();
16719 SDValue BasePtr = St->getBasePtr();
16720 Align Alignment = St->getBaseAlign();
16722 AAMDNodes AAInfo = St->getAAInfo();
16723
16724 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16725 // and then stored as truncating integer stores.
16726 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16727 EVT NewToVT = EVT::getVectorVT(
16728 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16729
16731 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16732 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16733 SDValue NewPtr =
16734 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16735
16736 SDValue Extract =
16737 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16738 DAG.getConstant(i * NumElements, DL, MVT::i32));
16739
16740 SDValue FPTrunc =
16741 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16742 Extract, DAG.getConstant(0, DL, MVT::i32));
16743 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16744
16745 SDValue Store = DAG.getTruncStore(
16746 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16747 NewToVT, Alignment, MMOFlags, AAInfo);
16748 Stores.push_back(Store);
16749 }
16750 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16751}
16752
16753// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16754// into an expensive buildvector) and splitting it into a series of narrowing
16755// stores.
16757 SelectionDAG &DAG) {
16758 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16759 return SDValue();
16760 SDValue Trunc = St->getValue();
16761 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16762 return SDValue();
16763 EVT FromVT = Trunc->getOperand(0).getValueType();
16764 EVT ToVT = Trunc.getValueType();
16765
16766 LLVMContext &C = *DAG.getContext();
16767 SDLoc DL(St);
16768 // Details about the old store
16769 SDValue Ch = St->getChain();
16770 SDValue BasePtr = St->getBasePtr();
16771 Align Alignment = St->getBaseAlign();
16773 AAMDNodes AAInfo = St->getAAInfo();
16774
16775 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16776 FromVT.getVectorNumElements());
16777
16779 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16780 unsigned NewOffset =
16781 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16782 SDValue NewPtr =
16783 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16784
16785 SDValue Extract = Trunc.getOperand(i);
16786 SDValue Store = DAG.getTruncStore(
16787 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16788 NewToVT, Alignment, MMOFlags, AAInfo);
16789 Stores.push_back(Store);
16790 }
16791 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16792}
16793
16794// Given a floating point store from an extracted vector, with an integer
16795// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16796// help reduce fp register pressure, doesn't require the fp extract and allows
16797// use of more integer post-inc stores not available with vstr.
16799 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16800 return SDValue();
16801 SDValue Extract = St->getValue();
16802 EVT VT = Extract.getValueType();
16803 // For now only uses f16. This may be useful for f32 too, but that will
16804 // be bitcast(extract), not the VGETLANEu we currently check here.
16805 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16806 return SDValue();
16807
16808 SDNode *GetLane =
16809 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16810 {Extract.getOperand(0), Extract.getOperand(1)});
16811 if (!GetLane)
16812 return SDValue();
16813
16814 LLVMContext &C = *DAG.getContext();
16815 SDLoc DL(St);
16816 // Create a new integer store to replace the existing floating point version.
16817 SDValue Ch = St->getChain();
16818 SDValue BasePtr = St->getBasePtr();
16819 Align Alignment = St->getBaseAlign();
16821 AAMDNodes AAInfo = St->getAAInfo();
16822 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16823 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16824 St->getPointerInfo(), NewToVT, Alignment,
16825 MMOFlags, AAInfo);
16826
16827 return Store;
16828}
16829
16830/// PerformSTORECombine - Target-specific dag combine xforms for
16831/// ISD::STORE.
16834 const ARMSubtarget *Subtarget) {
16836 if (St->isVolatile())
16837 return SDValue();
16838 SDValue StVal = St->getValue();
16839 EVT VT = StVal.getValueType();
16840
16841 if (Subtarget->hasNEON())
16842 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16843 return Store;
16844
16845 if (Subtarget->hasMVEFloatOps())
16846 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16847 return NewToken;
16848
16849 if (Subtarget->hasMVEIntegerOps()) {
16850 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16851 return NewChain;
16852 if (SDValue NewToken =
16854 return NewToken;
16855 }
16856
16857 if (!ISD::isNormalStore(St))
16858 return SDValue();
16859
16860 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16861 // ARM stores of arguments in the same cache line.
16862 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16863 StVal.getNode()->hasOneUse()) {
16864 SelectionDAG &DAG = DCI.DAG;
16865 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16866 SDLoc DL(St);
16867 SDValue BasePtr = St->getBasePtr();
16868 SDValue NewST1 = DAG.getStore(
16869 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16870 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16871 St->getMemOperand()->getFlags());
16872
16873 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16874 DAG.getConstant(4, DL, MVT::i32));
16875 return DAG.getStore(NewST1.getValue(0), DL,
16876 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16877 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16878 St->getBaseAlign(), St->getMemOperand()->getFlags());
16879 }
16880
16881 if (StVal.getValueType() == MVT::i64 &&
16883
16884 // Bitcast an i64 store extracted from a vector to f64.
16885 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16886 SelectionDAG &DAG = DCI.DAG;
16887 SDLoc dl(StVal);
16888 SDValue IntVec = StVal.getOperand(0);
16889 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16891 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16892 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16893 Vec, StVal.getOperand(1));
16894 dl = SDLoc(N);
16895 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16896 // Make the DAGCombiner fold the bitcasts.
16897 DCI.AddToWorklist(Vec.getNode());
16898 DCI.AddToWorklist(ExtElt.getNode());
16899 DCI.AddToWorklist(V.getNode());
16900 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16901 St->getPointerInfo(), St->getAlign(),
16902 St->getMemOperand()->getFlags(), St->getAAInfo());
16903 }
16904
16905 // If this is a legal vector store, try to combine it into a VST1_UPD.
16906 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16908 return CombineBaseUpdate(N, DCI);
16909
16910 return SDValue();
16911}
16912
16913/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16914/// can replace combinations of VMUL and VCVT (floating-point to integer)
16915/// when the VMUL has a constant operand that is a power of 2.
16916///
16917/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16918/// vmul.f32 d16, d17, d16
16919/// vcvt.s32.f32 d16, d16
16920/// becomes:
16921/// vcvt.s32.f32 d16, d16, #3
16923 const ARMSubtarget *Subtarget) {
16924 if (!Subtarget->hasNEON())
16925 return SDValue();
16926
16927 SDValue Op = N->getOperand(0);
16928 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16929 Op.getOpcode() != ISD::FMUL)
16930 return SDValue();
16931
16932 SDValue ConstVec = Op->getOperand(1);
16933 if (!isa<BuildVectorSDNode>(ConstVec))
16934 return SDValue();
16935
16936 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16937 uint32_t FloatBits = FloatTy.getSizeInBits();
16938 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16939 uint32_t IntBits = IntTy.getSizeInBits();
16940 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16941 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16942 // These instructions only exist converting from f32 to i32. We can handle
16943 // smaller integers by generating an extra truncate, but larger ones would
16944 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16945 // these instructions only support v2i32/v4i32 types.
16946 return SDValue();
16947 }
16948
16949 BitVector UndefElements;
16951 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16952 if (C == -1 || C == 0 || C > 32)
16953 return SDValue();
16954
16955 SDLoc dl(N);
16956 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16957 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16958 Intrinsic::arm_neon_vcvtfp2fxu;
16959 SDValue FixConv = DAG.getNode(
16960 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16961 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16962 DAG.getConstant(C, dl, MVT::i32));
16963
16964 if (IntBits < FloatBits)
16965 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16966
16967 return FixConv;
16968}
16969
16971 const ARMSubtarget *Subtarget) {
16972 if (!Subtarget->hasMVEFloatOps())
16973 return SDValue();
16974
16975 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16976 // The second form can be more easily turned into a predicated vadd, and
16977 // possibly combined into a fma to become a predicated vfma.
16978 SDValue Op0 = N->getOperand(0);
16979 SDValue Op1 = N->getOperand(1);
16980 EVT VT = N->getValueType(0);
16981 SDLoc DL(N);
16982
16983 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16984 // which these VMOV's represent.
16985 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16986 if (Op.getOpcode() != ISD::BITCAST ||
16987 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16988 return false;
16989 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16990 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16991 return true;
16992 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16993 return true;
16994 return false;
16995 };
16996
16997 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16998 std::swap(Op0, Op1);
16999
17000 if (Op1.getOpcode() != ISD::VSELECT)
17001 return SDValue();
17002
17003 SDNodeFlags FaddFlags = N->getFlags();
17004 bool NSZ = FaddFlags.hasNoSignedZeros();
17005 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
17006 return SDValue();
17007
17008 SDValue FAdd =
17009 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
17010 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
17011}
17012
17014 SDValue LHS = N->getOperand(0);
17015 SDValue RHS = N->getOperand(1);
17016 EVT VT = N->getValueType(0);
17017 SDLoc DL(N);
17018
17019 if (!N->getFlags().hasAllowReassociation())
17020 return SDValue();
17021
17022 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), b, c)
17023 auto ReassocComplex = [&](SDValue A, SDValue B) {
17024 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17025 return SDValue();
17026 unsigned Opc = A.getConstantOperandVal(0);
17027 if (Opc != Intrinsic::arm_mve_vcmlaq)
17028 return SDValue();
17029 SDValue VCMLA = DAG.getNode(
17030 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
17031 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
17032 A.getOperand(3), A.getOperand(4));
17033 VCMLA->setFlags(A->getFlags());
17034 return VCMLA;
17035 };
17036 if (SDValue R = ReassocComplex(LHS, RHS))
17037 return R;
17038 if (SDValue R = ReassocComplex(RHS, LHS))
17039 return R;
17040
17041 return SDValue();
17042}
17043
17045 const ARMSubtarget *Subtarget) {
17046 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17047 return S;
17048 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17049 return S;
17050 return SDValue();
17051}
17052
17053/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17054/// can replace combinations of VCVT (integer to floating-point) and VMUL
17055/// when the VMUL has a constant operand that is a power of 2.
17056///
17057/// Example (assume d17 = <float 0.125, float 0.125>):
17058/// vcvt.f32.s32 d16, d16
17059/// vmul.f32 d16, d16, d17
17060/// becomes:
17061/// vcvt.f32.s32 d16, d16, #3
17063 const ARMSubtarget *Subtarget) {
17064 if (!Subtarget->hasNEON())
17065 return SDValue();
17066
17067 SDValue Op = N->getOperand(0);
17068 unsigned OpOpcode = Op.getNode()->getOpcode();
17069 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17070 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17071 return SDValue();
17072
17073 SDValue ConstVec = N->getOperand(1);
17074 if (!isa<BuildVectorSDNode>(ConstVec))
17075 return SDValue();
17076
17077 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17078 uint32_t FloatBits = FloatTy.getSizeInBits();
17079 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17080 uint32_t IntBits = IntTy.getSizeInBits();
17081 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17082 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17083 // These instructions only exist converting from i32 to f32. We can handle
17084 // smaller integers by generating an extra extend, but larger ones would
17085 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17086 // these instructions only support v2i32/v4i32 types.
17087 return SDValue();
17088 }
17089
17090 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17091 APFloat Recip(0.0f);
17092 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17093 return SDValue();
17094
17095 bool IsExact;
17096 APSInt IntVal(33);
17097 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17098 APFloat::opOK ||
17099 !IsExact)
17100 return SDValue();
17101
17102 int32_t C = IntVal.exactLogBase2();
17103 if (C == -1 || C == 0 || C > 32)
17104 return SDValue();
17105
17106 SDLoc DL(N);
17107 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17108 SDValue ConvInput = Op.getOperand(0);
17109 if (IntBits < FloatBits)
17110 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17111 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17112
17113 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17114 : Intrinsic::arm_neon_vcvtfxu2fp;
17115 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17116 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17117 DAG.getConstant(C, DL, MVT::i32));
17118}
17119
17121 const ARMSubtarget *ST) {
17122 if (!ST->hasMVEIntegerOps())
17123 return SDValue();
17124
17125 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17126 EVT ResVT = N->getValueType(0);
17127 SDValue N0 = N->getOperand(0);
17128 SDLoc dl(N);
17129
17130 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17131 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17132 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17133 N0.getValueType() == MVT::v16i8)) {
17134 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17135 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17136 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17137 }
17138
17139 // We are looking for something that will have illegal types if left alone,
17140 // but that we can convert to a single instruction under MVE. For example
17141 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17142 // or
17143 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17144
17145 // The legal cases are:
17146 // VADDV u/s 8/16/32
17147 // VMLAV u/s 8/16/32
17148 // VADDLV u/s 32
17149 // VMLALV u/s 16/32
17150
17151 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17152 // extend it and use v4i32 instead.
17153 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17154 EVT AVT = A.getValueType();
17155 return any_of(ExtTypes, [&](MVT Ty) {
17156 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17157 AVT.bitsLE(Ty);
17158 });
17159 };
17160 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17161 EVT AVT = A.getValueType();
17162 if (!AVT.is128BitVector())
17163 A = DAG.getNode(
17164 ExtendCode, dl,
17166 *DAG.getContext(),
17168 A);
17169 return A;
17170 };
17171 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17172 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17173 return SDValue();
17174 SDValue A = N0->getOperand(0);
17175 if (ExtTypeMatches(A, ExtTypes))
17176 return ExtendIfNeeded(A, ExtendCode);
17177 return SDValue();
17178 };
17179 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17180 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17181 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17183 return SDValue();
17184 Mask = N0->getOperand(0);
17185 SDValue Ext = N0->getOperand(1);
17186 if (Ext->getOpcode() != ExtendCode)
17187 return SDValue();
17188 SDValue A = Ext->getOperand(0);
17189 if (ExtTypeMatches(A, ExtTypes))
17190 return ExtendIfNeeded(A, ExtendCode);
17191 return SDValue();
17192 };
17193 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17194 SDValue &A, SDValue &B) {
17195 // For a vmla we are trying to match a larger pattern:
17196 // ExtA = sext/zext A
17197 // ExtB = sext/zext B
17198 // Mul = mul ExtA, ExtB
17199 // vecreduce.add Mul
17200 // There might also be en extra extend between the mul and the addreduce, so
17201 // long as the bitwidth is high enough to make them equivalent (for example
17202 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17203 if (ResVT != RetTy)
17204 return false;
17205 SDValue Mul = N0;
17206 if (Mul->getOpcode() == ExtendCode &&
17207 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17208 ResVT.getScalarSizeInBits())
17209 Mul = Mul->getOperand(0);
17210 if (Mul->getOpcode() != ISD::MUL)
17211 return false;
17212 SDValue ExtA = Mul->getOperand(0);
17213 SDValue ExtB = Mul->getOperand(1);
17214 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17215 return false;
17216 A = ExtA->getOperand(0);
17217 B = ExtB->getOperand(0);
17218 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17219 A = ExtendIfNeeded(A, ExtendCode);
17220 B = ExtendIfNeeded(B, ExtendCode);
17221 return true;
17222 }
17223 return false;
17224 };
17225 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17226 SDValue &A, SDValue &B, SDValue &Mask) {
17227 // Same as the pattern above with a select for the zero predicated lanes
17228 // ExtA = sext/zext A
17229 // ExtB = sext/zext B
17230 // Mul = mul ExtA, ExtB
17231 // N0 = select Mask, Mul, 0
17232 // vecreduce.add N0
17233 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17235 return false;
17236 Mask = N0->getOperand(0);
17237 SDValue Mul = N0->getOperand(1);
17238 if (Mul->getOpcode() == ExtendCode &&
17239 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17240 ResVT.getScalarSizeInBits())
17241 Mul = Mul->getOperand(0);
17242 if (Mul->getOpcode() != ISD::MUL)
17243 return false;
17244 SDValue ExtA = Mul->getOperand(0);
17245 SDValue ExtB = Mul->getOperand(1);
17246 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17247 return false;
17248 A = ExtA->getOperand(0);
17249 B = ExtB->getOperand(0);
17250 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17251 A = ExtendIfNeeded(A, ExtendCode);
17252 B = ExtendIfNeeded(B, ExtendCode);
17253 return true;
17254 }
17255 return false;
17256 };
17257 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17258 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17259 // reductions. The operands are extended with MVEEXT, but as they are
17260 // reductions the lane orders do not matter. MVEEXT may be combined with
17261 // loads to produce two extending loads, or else they will be expanded to
17262 // VREV/VMOVL.
17263 EVT VT = Ops[0].getValueType();
17264 if (VT == MVT::v16i8) {
17265 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17266 "Unexpected illegal long reduction opcode");
17267 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17268
17269 SDValue Ext0 =
17270 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17271 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17272 SDValue Ext1 =
17273 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17274 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17275
17276 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17277 Ext0, Ext1);
17278 SDValue MLA1 =
17279 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17280 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17281 Ext0.getValue(1), Ext1.getValue(1));
17282 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17283 }
17284 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17285 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17286 SDValue(Node.getNode(), 1));
17287 };
17288
17289 SDValue A, B;
17290 SDValue Mask;
17291 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17292 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17293 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17294 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17295 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17296 A, B))
17297 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17298 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17299 A, B))
17300 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17301 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17302 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17303 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17304 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17305 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17306 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17307
17308 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17309 Mask))
17310 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17311 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17312 Mask))
17313 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17314 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17315 Mask))
17316 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17317 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17318 Mask))
17319 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17320 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17321 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17322 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17323 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17324 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17325 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17326
17327 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17328 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17329 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17330 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17331 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17332 return Create64bitNode(ARMISD::VADDLVs, {A});
17333 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17334 return Create64bitNode(ARMISD::VADDLVu, {A});
17335 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17336 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17337 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17338 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17339 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17340 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17341
17342 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17343 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17344 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17345 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17346 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17347 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17348 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17349 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17350 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17351 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17352 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17353 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17354 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17355 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17356
17357 // Some complications. We can get a case where the two inputs of the mul are
17358 // the same, then the output sext will have been helpfully converted to a
17359 // zext. Turn it back.
17360 SDValue Op = N0;
17361 if (Op->getOpcode() == ISD::VSELECT)
17362 Op = Op->getOperand(1);
17363 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17364 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17365 SDValue Mul = Op->getOperand(0);
17366 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17367 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17368 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17369 if (Op != N0)
17370 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17371 N0->getOperand(0), Ext, N0->getOperand(2));
17372 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17373 }
17374 }
17375
17376 return SDValue();
17377}
17378
17379// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17380// the lanes are used. Due to the reduction being commutative the shuffle can be
17381// removed.
// NOTE(review): the defining line (doxygen 17382) was dropped by the extraction;
// presumably `static SDValue PerformReduceShuffleCombine(SDNode *N,
// SelectionDAG &DAG) {` -- confirm against the full source.
// VADDV-style reductions carry the vector as operand 0; accumulating VMLAV
// forms have scalar operands first, so their vector inputs start at operand 2.
17383 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17384 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17385 if (!Shuf || !Shuf->getOperand(1).isUndef())
17386 return SDValue();
17387
17388 // Check all elements are used once in the mask.
17389 ArrayRef<int> Mask = Shuf->getMask();
17390 APInt SetElts(Mask.size(), 0);
17391 for (int E : Mask) {
// Reject undef (-1) or out-of-range mask entries.
17392 if (E < 0 || E >= (int)Mask.size())
17393 return SDValue();
17394 SetElts.setBit(E);
17395 }
// All bits set means the mask is a permutation hitting every lane exactly
// once, so a commutative reduction is unchanged by dropping the shuffle.
17396 if (!SetElts.isAllOnes())
17397 return SDValue();
17398
// For two-vector reductions (e.g. vmlav) the second input must be shuffled by
// the identical mask so the lanes still pair up after both shuffles go away.
17399 if (N->getNumOperands() != VecOp + 1) {
17400 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17401 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17402 return SDValue();
17403 }
17404
// NOTE(review): doxygen line 17405 (presumably a `SmallVector<SDValue, N>
// Ops;` declaration) is missing from this extract.
// Rebuild the node, replacing each shuffled vector operand with the shuffle's
// input; scalar operands pass through unchanged.
17406 for (SDValue Op : N->ops()) {
17407 if (Op.getValueType().isVector())
17408 Ops.push_back(Op.getOperand(0));
17409 else
17410 Ops.push_back(Op);
17411 }
17412 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17413}
17414
// PerformVMOVNCombine: folds for ARMISD::VMOVN (narrow Op1 into the top or
// bottom lanes of Op0; operand 2 selects top (1) vs bottom (0)).
// NOTE(review): the signature lines (doxygen 17415-17416) are missing from
// this extract.
17417 SDValue Op0 = N->getOperand(0);
17418 SDValue Op1 = N->getOperand(1);
17419 unsigned IsTop = N->getConstantOperandVal(2);
17420
17421 // VMOVNT a undef -> a
17422 // VMOVNB a undef -> a
17423 // VMOVNB undef a -> a
17424 if (Op1->isUndef())
17425 return Op0;
17426 if (Op0->isUndef() && !IsTop)
17427 return Op1;
17428
17429 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17430 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
// The inner VQMOVN wrote only the bottom lanes (its operand 2 is 0), so its
// first operand is dead here and can be replaced by Op0 directly.
17431 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17432 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17433 Op1->getConstantOperandVal(2) == 0)
17434 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17435 Op0, Op1->getOperand(1), N->getOperand(2));
17436
17437 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17438 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17439 // into the top or bottom lanes.
// Splatting the 2-bit pattern 0b01 marks every even (bottom) lane demanded;
// 0b10 marks every odd (top) lane.
17440 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17441 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17442 APInt Op0DemandedElts =
17443 IsTop ? Op1DemandedElts
17444 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17445
17446 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17447 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17448 return SDValue(N, 0);
17449 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17450 return SDValue(N, 0);
17451
17452 return SDValue();
17453}
17454
// PerformVQMOVNCombine: a VQMOVN only reads the lanes of Qd (Op0) that it does
// not overwrite -- the bottom lanes when inserting into the top (IsTop) and
// vice versa -- so the other lanes can be simplified away.
// NOTE(review): the signature lines (doxygen 17455-17456) are missing from
// this extract.
17457 SDValue Op0 = N->getOperand(0);
17458 unsigned IsTop = N->getConstantOperandVal(2);
17459
// Splat of 0b01 demands the even (bottom) lanes, 0b10 the odd (top) lanes.
17460 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17461 APInt Op0DemandedElts =
17462 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17463 : APInt::getHighBitsSet(2, 1));
17464
17465 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17466 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17467 return SDValue(N, 0);
17468 return SDValue();
17469}
17470
// PerformVQDMULHCombine: pushes a common shuffle through a lane-wise VQDMULH.
// NOTE(review): the signature lines (doxygen 17471-17472) are missing from
// this extract.
17473 EVT VT = N->getValueType(0);
17474 SDValue LHS = N->getOperand(0);
17475 SDValue RHS = N->getOperand(1);
17476
17477 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17478 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17479 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
// Both shuffles must use the same single-input mask (a lane-wise op commutes
// with a common permutation); the one-use/equality check ensures at least one
// of the original shuffles becomes dead.
17480 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17481 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17482 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17483 SDLoc DL(N);
17484 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17485 LHS.getOperand(0), RHS.getOperand(0));
17486 SDValue UndefV = LHS.getOperand(1);
17487 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17488 }
17489 return SDValue();
17490}
17491
// PerformLongShiftCombine: combines for the two-register long shifts
// (ARMISD::LSLL / ARMISD::LSRL), which take the low/high halves as Op0/Op1 and
// a shift amount as operand 2.
// NOTE(review): the signature line (doxygen 17492) is missing from this
// extract.
17493 SDLoc DL(N);
17494 SDValue Op0 = N->getOperand(0);
17495 SDValue Op1 = N->getOperand(1);
17496
17497 // Turn X << -C -> X >> C and viceversa. The negative shifts can come up from
17498 // uses of the intrinsics.
17499 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17500 int ShiftAmt = C->getSExtValue();
// A shift of zero is the identity: forward both halves through unchanged.
17501 if (ShiftAmt == 0) {
17502 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17503 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17504 return SDValue();
17505 }
17506
// Negative amounts in [-32, -1]: flip LSLL <-> LSRL and negate the amount.
17507 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17508 unsigned NewOpcode =
17509 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17510 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17511 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17512 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17513 return NewShift;
17514 }
17515 }
17516
17517 return SDValue();
17518}
17519
17520/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
// NOTE(review): the first signature line (doxygen 17521, presumably
// `SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,`) is missing
// from this extract.
17522 DAGCombinerInfo &DCI) const {
17523 SelectionDAG &DAG = DCI.DAG;
// Operand 0 of an INTRINSIC_WO_CHAIN node is the intrinsic ID.
17524 unsigned IntNo = N->getConstantOperandVal(0);
17525 switch (IntNo) {
17526 default:
17527 // Don't do anything for most intrinsics.
17528 break;
17529
17530 // Vector shifts: check for immediate versions and lower them.
17531 // Note: This is done during DAG combining instead of DAG legalizing because
17532 // the build_vectors for 64-bit vector element shift counts are generally
17533 // not legal, and it is hard to see their values after they get legalized to
17534 // loads from a constant pool.
17535 case Intrinsic::arm_neon_vshifts:
17536 case Intrinsic::arm_neon_vshiftu:
17537 case Intrinsic::arm_neon_vrshifts:
17538 case Intrinsic::arm_neon_vrshiftu:
17539 case Intrinsic::arm_neon_vrshiftn:
17540 case Intrinsic::arm_neon_vqshifts:
17541 case Intrinsic::arm_neon_vqshiftu:
17542 case Intrinsic::arm_neon_vqshiftsu:
17543 case Intrinsic::arm_neon_vqshiftns:
17544 case Intrinsic::arm_neon_vqshiftnu:
17545 case Intrinsic::arm_neon_vqshiftnsu:
17546 case Intrinsic::arm_neon_vqrshiftns:
17547 case Intrinsic::arm_neon_vqrshiftnu:
17548 case Intrinsic::arm_neon_vqrshiftnsu: {
17549 EVT VT = N->getOperand(1).getValueType();
17550 int64_t Cnt;
17551 unsigned VShiftOpc = 0;
17552
// First pass: validate the shift-count operand as a left/right immediate and
// bail out (or assert) if it is not one; VSHL/VSHR pick their opcode here.
17553 switch (IntNo) {
17554 case Intrinsic::arm_neon_vshifts:
17555 case Intrinsic::arm_neon_vshiftu:
17556 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17557 VShiftOpc = ARMISD::VSHLIMM;
17558 break;
17559 }
17560 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17561 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17562 : ARMISD::VSHRuIMM);
17563 break;
17564 }
17565 return SDValue();
17566
17567 case Intrinsic::arm_neon_vrshifts:
17568 case Intrinsic::arm_neon_vrshiftu:
17569 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17570 break;
17571 return SDValue();
17572
17573 case Intrinsic::arm_neon_vqshifts:
17574 case Intrinsic::arm_neon_vqshiftu:
17575 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17576 break;
17577 return SDValue();
17578
// An out-of-range vqshlu immediate is a front-end/verifier error, hence the
// unreachable instead of a graceful bail-out.
17579 case Intrinsic::arm_neon_vqshiftsu:
17580 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17581 break;
17582 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17583
17584 case Intrinsic::arm_neon_vrshiftn:
17585 case Intrinsic::arm_neon_vqshiftns:
17586 case Intrinsic::arm_neon_vqshiftnu:
17587 case Intrinsic::arm_neon_vqshiftnsu:
17588 case Intrinsic::arm_neon_vqrshiftns:
17589 case Intrinsic::arm_neon_vqrshiftnu:
17590 case Intrinsic::arm_neon_vqrshiftnsu:
17591 // Narrowing shifts require an immediate right shift.
17592 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17593 break;
17594 llvm_unreachable("invalid shift count for narrowing vector shift "
17595 "intrinsic");
17596
17597 default:
17598 llvm_unreachable("unhandled vector shift");
17599 }
17600
// Second pass: pick the target opcode now that the immediate is known valid.
17601 switch (IntNo) {
17602 case Intrinsic::arm_neon_vshifts:
17603 case Intrinsic::arm_neon_vshiftu:
17604 // Opcode already set above.
17605 break;
17606 case Intrinsic::arm_neon_vrshifts:
17607 VShiftOpc = ARMISD::VRSHRsIMM;
17608 break;
17609 case Intrinsic::arm_neon_vrshiftu:
17610 VShiftOpc = ARMISD::VRSHRuIMM;
17611 break;
17612 case Intrinsic::arm_neon_vrshiftn:
17613 VShiftOpc = ARMISD::VRSHRNIMM;
17614 break;
17615 case Intrinsic::arm_neon_vqshifts:
17616 VShiftOpc = ARMISD::VQSHLsIMM;
17617 break;
17618 case Intrinsic::arm_neon_vqshiftu:
17619 VShiftOpc = ARMISD::VQSHLuIMM;
17620 break;
17621 case Intrinsic::arm_neon_vqshiftsu:
17622 VShiftOpc = ARMISD::VQSHLsuIMM;
17623 break;
17624 case Intrinsic::arm_neon_vqshiftns:
17625 VShiftOpc = ARMISD::VQSHRNsIMM;
17626 break;
17627 case Intrinsic::arm_neon_vqshiftnu:
17628 VShiftOpc = ARMISD::VQSHRNuIMM;
17629 break;
17630 case Intrinsic::arm_neon_vqshiftnsu:
17631 VShiftOpc = ARMISD::VQSHRNsuIMM;
17632 break;
17633 case Intrinsic::arm_neon_vqrshiftns:
17634 VShiftOpc = ARMISD::VQRSHRNsIMM;
17635 break;
17636 case Intrinsic::arm_neon_vqrshiftnu:
17637 VShiftOpc = ARMISD::VQRSHRNuIMM;
17638 break;
17639 case Intrinsic::arm_neon_vqrshiftnsu:
17640 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17641 break;
17642 }
17643
17644 SDLoc dl(N);
17645 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17646 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17647 }
17648
// Shift-and-insert: three operands (dest, src, amount); the sign of the
// immediate picks VSLI vs VSRI.
17649 case Intrinsic::arm_neon_vshiftins: {
17650 EVT VT = N->getOperand(1).getValueType();
17651 int64_t Cnt;
17652 unsigned VShiftOpc = 0;
17653
17654 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17655 VShiftOpc = ARMISD::VSLIIMM;
17656 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17657 VShiftOpc = ARMISD::VSRIIMM;
17658 else {
17659 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17660 }
17661
17662 SDLoc dl(N);
17663 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17664 N->getOperand(1), N->getOperand(2),
17665 DAG.getConstant(Cnt, dl, MVT::i32));
17666 }
17667
17668 case Intrinsic::arm_neon_vqrshifts:
17669 case Intrinsic::arm_neon_vqrshiftu:
17670 // No immediate versions of these to check for.
17671 break;
17672
// Bitwise select is lowered to the pseudo VBSP so later passes can pick
// VBSL/VBIT/VBIF based on register allocation.
17673 case Intrinsic::arm_neon_vbsl: {
17674 SDLoc dl(N);
17675 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17676 N->getOperand(2), N->getOperand(3));
17677 }
17678 case Intrinsic::arm_mve_vqdmlah:
17679 case Intrinsic::arm_mve_vqdmlash:
17680 case Intrinsic::arm_mve_vqrdmlah:
17681 case Intrinsic::arm_mve_vqrdmlash:
17682 case Intrinsic::arm_mve_vmla_n_predicated:
17683 case Intrinsic::arm_mve_vmlas_n_predicated:
17684 case Intrinsic::arm_mve_vqdmlah_predicated:
17685 case Intrinsic::arm_mve_vqdmlash_predicated:
17686 case Intrinsic::arm_mve_vqrdmlah_predicated:
17687 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17688 // These intrinsics all take an i32 scalar operand which is narrowed to the
17689 // size of a single lane of the vector type they return. So we don't need
17690 // any bits of that operand above that point, which allows us to eliminate
17691 // uxth/sxth.
17692 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17693 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17694 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17695 return SDValue();
17696 break;
17697 }
17698
17699 case Intrinsic::arm_mve_minv:
17700 case Intrinsic::arm_mve_maxv:
17701 case Intrinsic::arm_mve_minav:
17702 case Intrinsic::arm_mve_maxav:
17703 case Intrinsic::arm_mve_minv_predicated:
17704 case Intrinsic::arm_mve_maxv_predicated:
17705 case Intrinsic::arm_mve_minav_predicated:
17706 case Intrinsic::arm_mve_maxav_predicated: {
17707 // These intrinsics all take an i32 scalar operand which is narrowed to the
17708 // size of a single lane of the vector type they take as the other input.
17709 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17710 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17711 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17712 return SDValue();
17713 break;
17714 }
17715
17716 case Intrinsic::arm_mve_addv: {
17717 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17718 // which allow PerformADDVecReduce to turn it into VADDLV when possible.
17719 bool Unsigned = N->getConstantOperandVal(2);
17720 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17721 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17722 }
17723
17724 case Intrinsic::arm_mve_addlv:
17725 case Intrinsic::arm_mve_addlv_predicated: {
17726 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17727 // which recombines the two outputs into an i64
17728 bool Unsigned = N->getConstantOperandVal(2);
17729 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17730 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17731 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17732
// NOTE(review): doxygen line 17733 (presumably a `SmallVector<SDValue, N>
// Ops;` declaration) is missing from this extract.
17734 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17735 if (i != 2) // skip the unsigned flag
17736 Ops.push_back(N->getOperand(i));
17737
17738 SDLoc dl(N);
17739 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17740 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17741 val.getValue(1));
17742 }
17743 }
17744
17745 return SDValue();
17746}
17747
17748/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17749/// lowers them. As with the vector shift intrinsics, this is done during DAG
17750/// combining instead of DAG legalizing because the build_vectors for 64-bit
17751/// vector element shift counts are generally not legal, and it is hard to see
17752/// their values after they get legalized to loads from a constant pool.
// NOTE(review): the first signature lines (doxygen 17753-17754, presumably
// `static SDValue PerformShiftCombine(SDNode *N,
// TargetLowering::DAGCombinerInfo &DCI,`) are missing from this extract.
17755 const ARMSubtarget *ST) {
17756 SelectionDAG &DAG = DCI.DAG;
17757 EVT VT = N->getValueType(0);
17758
// Thumb1-only scalar combine: rewrite (shl (and x, Mask), C) as a shl/srl
// pair so no separate mask constant has to be materialized.
17759 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17760 N->getOperand(0)->getOpcode() == ISD::AND &&
17761 N->getOperand(0)->hasOneUse()) {
17762 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17763 return SDValue();
17764 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17765 // usually show up because instcombine prefers to canonicalize it to
17766 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17767 // out of GEP lowering in some cases.
17768 SDValue N0 = N->getOperand(0);
17769 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17770 if (!ShiftAmtNode)
17771 return SDValue();
17772 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17773 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17774 if (!AndMaskNode)
17775 return SDValue();
17776 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17777 // Don't transform uxtb/uxth.
17778 if (AndMask == 255 || AndMask == 65535)
17779 return SDValue();
// For a contiguous low mask, masking then shifting left is equivalent to
// shifting left by the number of cleared high bits and then right by the
// difference.
17780 if (isMask_32(AndMask)) {
17781 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17782 if (MaskedBits > ShiftAmt) {
17783 SDLoc DL(N);
17784 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17785 DAG.getConstant(MaskedBits, DL, MVT::i32));
17786 return DAG.getNode(
17787 ISD::SRL, DL, MVT::i32, SHL,
17788 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17789 }
17790 }
17791 }
17792
17793 // Nothing to be done for scalar shifts.
17794 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17795 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17796 return SDValue();
// MVE subtargets do not use this combine for vector shifts.
17797 if (ST->hasMVEIntegerOps())
17798 return SDValue();
17799
17800 int64_t Cnt;
17801
17802 switch (N->getOpcode()) {
17803 default: llvm_unreachable("unexpected shift opcode");
17804
17805 case ISD::SHL:
17806 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17807 SDLoc dl(N);
17808 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17809 DAG.getConstant(Cnt, dl, MVT::i32));
17810 }
17811 break;
17812
17813 case ISD::SRA:
17814 case ISD::SRL:
17815 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17816 unsigned VShiftOpc =
17817 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17818 SDLoc dl(N);
17819 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17820 DAG.getConstant(Cnt, dl, MVT::i32));
17821 }
17822 }
17823 return SDValue();
17824}
17825
17826// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
17827// split into multiple extending loads, which are simpler to deal with than an
17828// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17829// to convert the type to an f32.
// NOTE(review): the signature line (doxygen 17830, presumably `static SDValue
// PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {`) is missing
// from this extract.
17831 SDValue N0 = N->getOperand(0);
17832 if (N0.getOpcode() != ISD::LOAD)
17833 return SDValue();
// NOTE(review): doxygen line 17834 (presumably the `LoadSDNode *LD = ...`
// cast of N0) is missing from this extract.
// Only split simple, single-use, unindexed, non-extending loads.
17835 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17836 LD->getExtensionType() != ISD::NON_EXTLOAD)
17837 return SDValue();
17838 EVT FromVT = LD->getValueType(0);
17839 EVT ToVT = N->getValueType(0);
17840 if (!ToVT.isVector())
17841 return SDValue();
// NOTE(review): doxygen line 17842 is missing from this extract -- verify
// against the full source what check or declaration it contained.
17843 EVT ToEltVT = ToVT.getVectorElementType();
17844 EVT FromEltVT = FromVT.getVectorElementType();
17845
// Only i8->i32 and f16->f32 extends are handled; each split piece carries 4
// elements, and the source element count must be a multiple of that.
17846 unsigned NumElements = 0;
17847 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17848 NumElements = 4;
17849 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17850 NumElements = 4;
17851 if (NumElements == 0 ||
17852 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17853 FromVT.getVectorNumElements() % NumElements != 0 ||
17854 !isPowerOf2_32(NumElements))
17855 return SDValue();
17856
17857 LLVMContext &C = *DAG.getContext();
17858 SDLoc DL(LD);
17859 // Details about the old load
17860 SDValue Ch = LD->getChain();
17861 SDValue BasePtr = LD->getBasePtr();
17862 Align Alignment = LD->getBaseAlign();
17863 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17864 AAMDNodes AAInfo = LD->getAAInfo();
17865
// The fp case also goes through an *integer* extending load; the VCVTL below
// performs the f16->f32 conversion afterwards.
17866 ISD::LoadExtType NewExtType =
17867 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17868 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17869 EVT NewFromVT = EVT::getVectorVT(
17870 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17871 EVT NewToVT = EVT::getVectorVT(
17872 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17873
// NOTE(review): doxygen lines 17874-17875 (presumably the SmallVector
// declarations for `Loads` and `Chains`) are missing from this extract.
// Emit one extending load per NumElements-sized slice at increasing offsets.
17876 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17877 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17878 SDValue NewPtr =
17879 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17880
17881 SDValue NewLoad =
17882 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17883 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17884 Alignment, MMOFlags, AAInfo);
17885 Loads.push_back(NewLoad);
// Value 1 of each load is its output chain.
17886 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17887 }
17888
17889 // Float truncs need to extended with VCVTB's into their floating point types.
17890 if (FromEltVT == MVT::f16) {
// NOTE(review): doxygen line 17891 (presumably the SmallVector declaration
// for `Extends`) is missing from this extract.
17892 for (unsigned i = 0; i < Loads.size(); i++) {
17893 SDValue LoadBC =
17894 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17895 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17896 DAG.getConstant(0, DL, MVT::i32));
17897 Extends.push_back(FPExt);
17898 }
17899
17900 Loads = Extends;
17901 }
17902
17903
// Merge the new chains, redirect users of the old load's chain, and glue the
// pieces back together into the full ToVT vector.
17904 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17905 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17906 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17907}
17908
17909/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17910/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
// NOTE(review): the first signature line (doxygen 17911, presumably
// `static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,`) is
// missing from this extract.
17912 const ARMSubtarget *ST) {
17913 SDValue N0 = N->getOperand(0);
17914
17915 // Check for sign- and zero-extensions of vector extract operations of 8- and
17916 // 16-bit vector elements. NEON and MVE support these directly. They are
17917 // handled during DAG combining because type legalization will promote them
17918 // to 32-bit types and it is messy to recognize the operations after that.
17919 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
// NOTE(review): doxygen line 17920 (presumably the check that N0 is an
// ISD::EXTRACT_VECTOR_ELT) is missing from this extract.
17921 SDValue Vec = N0.getOperand(0);
17922 SDValue Lane = N0.getOperand(1);
17923 EVT VT = N->getValueType(0);
17924 EVT EltVT = N0.getValueType();
17925 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17926
// VGETLANE needs a legal vector type and a constant lane index.
17927 if (VT == MVT::i32 &&
17928 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17929 TLI.isTypeLegal(Vec.getValueType()) &&
17930 isa<ConstantSDNode>(Lane)) {
17931
17932 unsigned Opc = 0;
17933 switch (N->getOpcode()) {
17934 default: llvm_unreachable("unexpected opcode");
17935 case ISD::SIGN_EXTEND:
17936 Opc = ARMISD::VGETLANEs;
17937 break;
// ANY_EXTEND can use either form; the unsigned lane-get is chosen here.
17938 case ISD::ZERO_EXTEND:
17939 case ISD::ANY_EXTEND:
17940 Opc = ARMISD::VGETLANEu;
17941 break;
17942 }
17943 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17944 }
17945 }
17946
// MVE: split extends of larger-than-legal loads into several extending loads.
17947 if (ST->hasMVEIntegerOps())
17948 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17949 return NewLoad;
17950
17951 return SDValue();
17952}
17953
// PerformFPExtendCombine - Target-specific combining for ISD::FP_EXTEND.
// NOTE(review): the first signature line (doxygen 17954) is missing from this
// extract.
17955 const ARMSubtarget *ST) {
// On MVE float subtargets, split a larger-than-legal fpext(load) into integer
// extending loads plus VCVTL (see PerformSplittingToWideningLoad above).
17956 if (ST->hasMVEFloatOps())
17957 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17958 return NewLoad;
17959
17960 return SDValue();
17961}
17962
17963// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17964// constant bounds.
// NOTE(review): the first signature line (doxygen 17965, presumably
// `static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,`)
// is missing from this extract.
17966 const ARMSubtarget *Subtarget) {
// SSAT/USAT require ARM-state v6 or Thumb2.
17967 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17968 !Subtarget->isThumb2())
17969 return SDValue();
17970
17971 EVT VT = Op.getValueType();
17972 SDValue Op0 = Op.getOperand(0);
17973
17974 if (VT != MVT::i32 ||
17975 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17976 !isa<ConstantSDNode>(Op.getOperand(1)) ||
// NOTE(review): doxygen line 17977 (presumably the matching
// isa<ConstantSDNode> check on Op0's constant operand) is missing from this
// extract.
17978 return SDValue();
17979
// Normalize so Min holds the smin and Max the smax, whichever nesting order
// the two appear in.
17980 SDValue Min = Op;
17981 SDValue Max = Op0;
17982 SDValue Input = Op0.getOperand(0);
17983 if (Min.getOpcode() == ISD::SMAX)
17984 std::swap(Min, Max);
17985
17986 APInt MinC = Min.getConstantOperandAPInt(1);
17987 APInt MaxC = Max.getConstantOperandAPInt(1);
17988
// The upper bound must be of the form 2^k - 1 for a k-bit saturate.
17989 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17990 !(MinC + 1).isPowerOf2())
17991 return SDValue();
17992
17993 SDLoc DL(Op);
// Symmetric bounds [-2^k, 2^k - 1] -> SSAT; a lower bound of 0 -> USAT.
// countr_one(MinC) yields the saturate bit-width operand.
17994 if (MinC == ~MaxC)
17995 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17996 DAG.getConstant(MinC.countr_one(), DL, VT));
17997 if (MaxC == 0)
17998 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17999 DAG.getConstant(MinC.countr_one(), DL, VT));
18000
18001 return SDValue();
18002}
18003
18004/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
18005/// saturates.
// NOTE(review): the first signature line (doxygen 18006, presumably
// `static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,`) is
// missing from this extract.
18007 const ARMSubtarget *ST) {
18008 EVT VT = N->getValueType(0);
18009 SDValue N0 = N->getOperand(0);
18010
// Scalar i32 min/max pairs may form SSAT/USAT instead.
18011 if (VT == MVT::i32)
18012 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
18013
18014 if (!ST->hasMVEIntegerOps())
18015 return SDValue();
18016
18017 if (SDValue V = PerformVQDMULHCombine(N, DAG))
18018 return V;
18019
18020 if (VT != MVT::v4i32 && VT != MVT::v8i16)
18021 return SDValue();
18022
18023 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
18024 // Check one is a smin and the other is a smax
18025 if (Min->getOpcode() != ISD::SMIN)
18026 std::swap(Min, Max);
18027 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
18028 return false;
18029
// Bounds for saturating to the half-width signed element type: the smin
// constant must splat 2^(k-1)-1 and the smax constant its bitwise-not.
18030 APInt SaturateC;
18031 if (VT == MVT::v4i32)
18032 SaturateC = APInt(32, (1 << 15) - 1, true);
18033 else //if (VT == MVT::v8i16)
18034 SaturateC = APInt(16, (1 << 7) - 1, true);
18035
18036 APInt MinC, MaxC;
18037 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18038 MinC != SaturateC)
18039 return false;
18040 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18041 MaxC != ~SaturateC)
18042 return false;
18043 return true;
18044 };
18045
18046 if (IsSignedSaturate(N, N0.getNode())) {
18047 SDLoc DL(N);
18048 MVT ExtVT, HalfVT;
18049 if (VT == MVT::v4i32) {
18050 HalfVT = MVT::v8i16;
18051 ExtVT = MVT::v4i16;
18052 } else { // if (VT == MVT::v8i16)
18053 HalfVT = MVT::v16i8;
18054 ExtVT = MVT::v8i8;
18055 }
18056
18057 // Create a VQMOVNB with undef top lanes, then signed extended into the top
18058 // half. That extend will hopefully be removed if only the bottom bits are
18059 // demanded (though a truncating store, for example).
18060 SDValue VQMOVN =
18061 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18062 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18063 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18064 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18065 DAG.getValueType(ExtVT));
18066 }
18067
18068 auto IsUnsignedSaturate = [&](SDNode *Min) {
18069 // For unsigned, we just need to check for <= 0xffff
18070 if (Min->getOpcode() != ISD::UMIN)
18071 return false;
18072
// The umin constant must splat 2^k - 1 for the half-width element type.
18073 APInt SaturateC;
18074 if (VT == MVT::v4i32)
18075 SaturateC = APInt(32, (1 << 16) - 1, true);
18076 else //if (VT == MVT::v8i16)
18077 SaturateC = APInt(16, (1 << 8) - 1, true);
18078
18079 APInt MinC;
18080 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18081 MinC != SaturateC)
18082 return false;
18083 return true;
18084 };
18085
18086 if (IsUnsignedSaturate(N)) {
18087 SDLoc DL(N);
18088 MVT HalfVT;
18089 unsigned ExtConst;
18090 if (VT == MVT::v4i32) {
18091 HalfVT = MVT::v8i16;
18092 ExtConst = 0x0000FFFF;
18093 } else { //if (VT == MVT::v8i16)
18094 HalfVT = MVT::v16i8;
18095 ExtConst = 0x00FF;
18096 }
18097
18098 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18099 // an AND. That extend will hopefully be removed if only the bottom bits are
18100 // demanded (though a truncating store, for example).
18101 SDValue VQMOVN =
18102 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18103 DAG.getConstant(0, DL, MVT::i32));
18104 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18105 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18106 DAG.getConstant(ExtConst, DL, VT));
18107 }
18108
18109 return SDValue();
18110}
18111
  // Not a ConstantSDNode at all -> no constant to inspect.
  if (!C)
    return nullptr;
  // Expose the underlying APInt only when the constant has exactly one set
  // bit; callers use the pointer both as a boolean and to read the bit index.
  const APInt *CV = &C->getAPIntValue();
  return CV->isPowerOf2() ? CV : nullptr;
}
18119
  // If we have a CMOV, OR and AND combination such as:
  //   if (x & CN)
  //     y |= CM;
  //
  // And:
  //   * CN is a single bit;
  //   * All bits covered by CM are known zero in y
  //
  // Then we can convert this into a sequence of BFI instructions. This will
  // always be a win if CM is a single bit, will always be no worse than the
  // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
  // three bits (due to the extra IT instruction).

  SDValue Op0 = CMOV->getOperand(0);
  SDValue Op1 = CMOV->getOperand(1);
  auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
  SDValue CmpZ = CMOV->getOperand(3);

  // The compare must be against zero.
  if (!isNullConstant(CmpZ->getOperand(1)))
    return SDValue();

  assert(CmpZ->getOpcode() == ARMISD::CMPZ);
  SDValue And = CmpZ->getOperand(0);
  if (And->getOpcode() != ISD::AND)
    return SDValue();
  // CN must be a single bit (power of two) for the transform to apply.
  const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
  if (!AndC)
    return SDValue();
  SDValue X = And->getOperand(0);

  if (CC == ARMCC::EQ) {
    // We're performing an "equal to zero" compare. Swap the operands so we
    // canonicalize on a "not equal to zero" compare.
    std::swap(Op0, Op1);
  } else {
    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
  }

  if (Op1->getOpcode() != ISD::OR)
    return SDValue();

  if (!OrC)
    return SDValue();
  SDValue Y = Op1->getOperand(0);

  // The untaken value of the CMOV must be the OR's first operand ("y").
  if (Op0 != Y)
    return SDValue();

  // Now, is it profitable to continue?
  APInt OrCI = OrC->getAPIntValue();
  // Thumb tolerates one extra set bit because the TST&OR alternative needs an
  // additional IT instruction there.
  unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
  if (OrCI.popcount() > Heuristic)
    return SDValue();

  // Lastly, can we determine that the bits defined by OrCI
  // are zero in Y?
  KnownBits Known = DAG.computeKnownBits(Y);
  if ((OrCI & Known.Zero) != OrCI)
    return SDValue();

  // OK, we can do the combine.
  SDValue V = Y;
  SDLoc dl(X);
  EVT VT = X.getValueType();
  unsigned BitInX = AndC->logBase2();

  if (BitInX != 0) {
    // We must shift X first.
    X = DAG.getNode(ISD::SRL, dl, VT, X,
                    DAG.getConstant(BitInX, dl, VT));
  }

  // Insert the tested bit of X at every set-bit position of OrCI using BFI.
  for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
       BitInY < NumActiveBits; ++BitInY) {
    if (OrCI[BitInY] == 0)
      continue;
    APInt Mask(VT.getSizeInBits(), 0);
    Mask.setBit(BitInY);
    V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
                    // Confusingly, the operand is an *inverted* mask.
                    DAG.getConstant(~Mask, dl, VT));
  }

  return V;
}
18208
// Given N, the value controlling the conditional branch, search for the loop
// intrinsic, returning it, along with how the value is used. We need to handle
// patterns such as the following:
// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
// (brcond (setcc (loop.decrement), 0, eq), exit)
// (brcond (setcc (loop.decrement), 0, ne), header)
// CC and Imm record the comparison found on the way down; Negate flips each
// time an (xor x, 1) is looked through.
                                   bool &Negate) {
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::XOR: {
    // Look through (xor x, 1): it logically negates the condition.
    if (!isa<ConstantSDNode>(N.getOperand(1)))
      return SDValue();
    if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
      return SDValue();
    Negate = !Negate;
    return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
  }
  case ISD::SETCC: {
    // Record the compared-against constant (only 0 or 1 supported) and the
    // condition code, then keep searching through the setcc's LHS.
    auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
    if (!Const)
      return SDValue();
    if (Const->isZero())
      Imm = 0;
    else if (Const->isOne())
      Imm = 1;
    else
      return SDValue();
    CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
    return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
  }
    // Only the hardware-loop intrinsics are of interest here.
    unsigned IntOp = N.getConstantOperandVal(1);
    if (IntOp != Intrinsic::test_start_loop_iterations &&
        IntOp != Intrinsic::loop_decrement_reg)
      return SDValue();
    return N;
  }
  }
  return SDValue();
}
18251
                                  const ARMSubtarget *ST) {

  // The hwloop intrinsics that we're interested are used for control-flow,
  // either for entering or exiting the loop:
  // - test.start.loop.iterations will test whether its operand is zero. If it
  //   is zero, the proceeding branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the proceeding branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check that how the brcond is using the result of each
  // of the intrinsics to ensure that we're branching to the right place at the
  // right time.

  ISD::CondCode CC;
  SDValue Cond;
  int Imm = 1;
  bool Negate = false;
  SDValue Chain = N->getOperand(0);
  SDValue Dest;

  // Extract condition, destination and (for BR_CC) the compared-against
  // constant from either form of conditional branch.
  if (N->getOpcode() == ISD::BRCOND) {
    CC = ISD::SETEQ;
    Cond = N->getOperand(1);
    Dest = N->getOperand(2);
  } else {
    assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
    CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    Cond = N->getOperand(2);
    Dest = N->getOperand(4);
    if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
      if (!Const->isOne() && !Const->isZero())
        return SDValue();
      Imm = Const->getZExtValue();
    } else
      return SDValue();
  }

  SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
  if (!Int)
    return SDValue();

  // An (xor x, 1) was looked through on the way down: invert the condition.
  if (Negate)
    CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);

  // Does the branch fire when the intrinsic result is zero...
  auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 0) ||
           (CC == ISD::SETNE && Imm == 1) ||
           (CC == ISD::SETLT && Imm == 1) ||
           (CC == ISD::SETULT && Imm == 1);
  };

  // ...or when it is non-zero?
  auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 1) ||
           (CC == ISD::SETNE && Imm == 0) ||
           (CC == ISD::SETGT && Imm == 0) ||
           (CC == ISD::SETUGT && Imm == 0) ||
           (CC == ISD::SETGE && Imm == 1) ||
           (CC == ISD::SETUGE && Imm == 1);
  };

  assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
         "unsupported condition");

  SDLoc dl(Int);
  SelectionDAG &DAG = DCI.DAG;
  SDValue Elements = Int.getOperand(2);
  unsigned IntOp = Int->getConstantOperandVal(1);
  assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
         "expected single br user");
  SDNode *Br = *N->user_begin();
  SDValue OtherTarget = Br->getOperand(1);

  // Update the unconditional branch to branch to the given Dest.
  auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
    SDValue NewBrOps[] = { Br->getOperand(0), Dest };
    SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
  };

  if (IntOp == Intrinsic::test_start_loop_iterations) {
    SDValue Res;
    SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
    // We expect this 'instruction' to branch when the counter is zero.
    if (IsTrueIfZero(CC, Imm)) {
      SDValue Ops[] = {Chain, Setup, Dest};
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the proceeding br.
      UpdateUncondBr(Br, Dest, DAG);

      SDValue Ops[] = {Chain, Setup, OtherTarget};
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    }
    // Update LR count to the new value
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
    // Update chain
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
    return Res;
  } else {
    // loop.decrement.reg: build a LOOP_DEC feeding a LE (loop-end) branch.
    SDValue Size =
        DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
    SDValue Args[] = { Int.getOperand(0), Elements, Size, };
    SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
                                  DAG.getVTList(MVT::i32, MVT::Other), Args);
    DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());

    // We expect this instruction to branch when the count is not zero.
    SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;

    // Update the unconditional branch to target the loop preheader if we've
    // found the condition has been reversed.
    if (Target == OtherTarget)
      UpdateUncondBr(Br, Dest, DAG);

    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        SDValue(LoopDec.getNode(), 1), Chain);

    SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
    return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
  }
  return SDValue();
}
18377
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
  SDValue Cmp = N->getOperand(3);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at NE cases.
    return SDValue();

  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue Chain = N->getOperand(0);
  SDValue BB = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);

  // Look through a test of the materialized boolean of an inner CMOV and
  // branch on the inner condition directly:
  // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
  // -> (brcond Chain BB CC Flags)
  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
      LHS->getOperand(0)->hasOneUse() &&
      isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
      isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
      isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
                       LHS->getOperand(0)->getOperand(2),
                       LHS->getOperand(0)->getOperand(3));
  }

  return SDValue();
}
18409
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
SDValue
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  SDValue Cmp = N->getOperand(3);

  // Try to form CSINV etc.
  unsigned Opcode;
  bool InvertCond;
  if (SDValue CSetOp =
          matchCSET(Opcode, InvertCond, TrueVal, FalseVal, Subtarget)) {
    // matchCSET may require the condition to be inverted to fit the CS* form.
    if (InvertCond) {
      ARMCC::CondCodes CondCode =
          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
      CondCode = ARMCC::getOppositeCondition(CondCode);
      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    }
    return DAG.getNode(Opcode, dl, VT, CSetOp, CSetOp, ARMcc, Cmp);
  }

  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
  }

  // Look through a test of an inner CMOV's materialized boolean:
  // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
  // -> (cmov F T CC Flags)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
      isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
      isNullConstant(RHS)) {
    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                       LHS->getOperand(2), LHS->getOperand(3));
  }

  if (!VT.isInteger())
    return SDValue();

  // Fold away an unnecessary CMPZ/CMOV
  // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
  // if C1==EQ -> CMOV A, B, C2, D
  // if C1==NE -> CMOV A, B, NOT(C2), D
  if (N->getConstantOperandVal(2) == ARMCC::EQ ||
      N->getConstantOperandVal(2) == ARMCC::NE) {
    if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
      if (N->getConstantOperandVal(2) == ARMCC::NE)
      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
                         N->getOperand(1),
                         DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
    }
  }

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        // (UADDO_CARRY (SUB x, y), t:0, t:1)
        // where t = (USUBO_CARRY 0, (SUB x, y), 0)
        //
        // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final UADDO_CARRY computes
        // x - y + (0 - (x - y)) + C == C
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // ISD::USUBO_CARRY returns a borrow but we want the carry here
        // actually.
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        Sub.getValue(1));
      FalseVal = Sub;
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        Sub.getValue(1));
      FalseVal = Sub;
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
  // t1 = (USUBO (SUB x, y), 1)
  // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  //
  // This also handles the special case of comparing against zero; it's
  // essentially, the same pattern, except there's no SUBC:
  // CMOV x, z, !=, (CMPZ x, 0) ->
  // t1 = (USUBO x, 1)
  // t2 = (USUBO_CARRY x, t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
        FalseVal.getOperand(1) == RHS) ||
       (FalseVal == LHS && isNullConstant(RHS))) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    if (ShiftAmount)
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
                      Subc.getValue(1));

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
    // Capture demanded bits information that would be otherwise lost.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}
18606
                                     const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);

  // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
  if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
    EVT SrcVT = Src.getValueType();
    // Only valid when the lane width is unchanged by the cast.
    if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
      return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
  }

  // We may have a bitcast of something that has already had this bitcast
  // combine performed on it, so skip past any VECTOR_REG_CASTs.
  if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
      Src.getOperand(0).getValueType().getScalarSizeInBits() <=
          Src.getValueType().getScalarSizeInBits())
    Src = Src.getOperand(0);

  // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
  // would be generated is at least the width of the element type.
  EVT SrcVT = Src.getValueType();
  if ((Src.getOpcode() == ARMISD::VMOVIMM ||
       Src.getOpcode() == ARMISD::VMVNIMM ||
       Src.getOpcode() == ARMISD::VMOVFPIMM) &&
      SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
      DAG.getDataLayout().isBigEndian())
    return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);

  // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
  if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
    return R;

  return SDValue();
}
18644
// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
// node into stack operations after legalizeOps.
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // MVETrunc(Undef, Undef) -> Undef
  if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
    return DAG.getUNDEF(VT);

  // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
  if (N->getNumOperands() == 2 &&
      N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
      N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
    return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
                       N->getOperand(0).getOperand(1),
                       N->getOperand(1).getOperand(0),
                       N->getOperand(1).getOperand(1));

  // MVETrunc(shuffle, shuffle) -> VMOVN
  if (N->getNumOperands() == 2 &&
      N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
      N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
    auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
    auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());

    // Both shuffles must act on the same pair of source vectors.
    if (S0->getOperand(0) == S1->getOperand(0) &&
        S0->getOperand(1) == S1->getOperand(1)) {
      // Construct complete shuffle mask
      SmallVector<int, 8> Mask(S0->getMask());
      Mask.append(S1->getMask().begin(), S1->getMask().end());

      if (isVMOVNTruncMask(Mask, VT, false))
        return DAG.getNode(
            ARMISD::VMOVN, DL, VT,
            DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
            DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
            DAG.getConstant(1, DL, MVT::i32));
      if (isVMOVNTruncMask(Mask, VT, true))
        return DAG.getNode(
            ARMISD::VMOVN, DL, VT,
            DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
            DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
            DAG.getConstant(1, DL, MVT::i32));
    }
  }

  // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
  // truncate to a buildvector to allow the generic optimisations to kick in.
  if (all_of(N->ops(), [](SDValue Op) {
        return Op.getOpcode() == ISD::BUILD_VECTOR ||
               Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
               (Op.getOpcode() == ISD::BITCAST &&
                Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
      })) {
    // Extract every source lane as an i32 and rebuild as one BUILD_VECTOR.
    SmallVector<SDValue, 8> Extracts;
    for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
      SDValue O = N->getOperand(Op);
      for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
        SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
                                  DAG.getConstant(i, DL, MVT::i32));
        Extracts.push_back(Ext);
      }
    }
    return DAG.getBuildVector(VT, DL, Extracts);
  }

  // If we are late in the legalization process and nothing has optimised
  // the trunc to anything better, lower it to a stack store and reload,
  // performing the truncation whilst keeping the lanes in the correct order:
  // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  int NumIns = N->getNumOperands();
  assert((NumIns == 2 || NumIns == 4) &&
         "Expected 2 or 4 inputs to an MVETrunc");
  EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  if (N->getNumOperands() == 4)
    StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());

  // Truncating-store each input into its slice of the 16-byte slot.
  SmallVector<SDValue> Chains;
  for (int I = 0; I < NumIns; I++) {
    SDValue Ptr = DAG.getNode(
        ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
        DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
        DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
    SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
                                   Ptr, MPI, StoreVT, Align(4));
    Chains.push_back(Ch);
  }

  // Reload the whole vector in one go, lanes now in truncated order.
  SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  MachinePointerInfo MPI =
  return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
}
18747
// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
                                        SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
    return SDValue();

  EVT FromVT = LD->getMemoryVT();
  EVT ToVT = N->getValueType(0);
  if (!ToVT.isVector())
    return SDValue();
  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
  EVT ToEltVT = ToVT.getVectorElementType();
  EVT FromEltVT = FromVT.getVectorElementType();

  // Per-load lane count: i8/i16 -> i32 uses 4 lanes, i8 -> i16 uses 8.
  unsigned NumElements = 0;
  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
    NumElements = 4;
  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
    NumElements = 8;
  assert(NumElements != 0);

  // MVESEXT becomes sextload, MVEZEXT zextload; the original load's own
  // extension (if any) must not conflict.
  ISD::LoadExtType NewExtType =
      N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
      LD->getExtensionType() != ISD::EXTLOAD &&
      LD->getExtensionType() != NewExtType)
    return SDValue();

  LLVMContext &C = *DAG.getContext();
  SDLoc DL(LD);
  // Details about the old load
  SDValue Ch = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  Align Alignment = LD->getBaseAlign();
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();

  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
  EVT NewFromVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
  EVT NewToVT = EVT::getVectorVT(
      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);

  // Emit one extending load per NumElements-wide chunk of the original load.
  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
    SDValue NewPtr =
        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));

    SDValue NewLoad =
        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
                    Alignment, MMOFlags, AAInfo);
    Loads.push_back(NewLoad);
    Chains.push_back(SDValue(NewLoad.getNode(), 1));
  }

  // Tie the new load chains together and retire the old load's chain value.
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
  return DAG.getMergeValues(Loads, DL);
}
18812
// Perform combines for MVEEXT. If it has not be optimized to anything better
// before lowering, it gets converted to stack store and extloads performing the
// extend whilst still keeping the same lane ordering.
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
  assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");

  EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
      *DAG.getContext());
  // Extend the bottom lanes of V in-register: sign-extend-inreg for MVESEXT,
  // zero-extend-inreg otherwise.
  auto Extend = [&](SDValue V) {
    SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
    return N->getOpcode() == ARMISD::MVESEXT
               ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
                             DAG.getValueType(ExtVT))
               : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
  };

  // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
  if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
    SDValue Ext = Extend(N->getOperand(0));
    return DAG.getMergeValues({Ext, Ext}, DL);
  }

  // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
  if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
    ArrayRef<int> Mask = SVN->getMask();
    assert(Mask.size() == 2 * VT.getVectorNumElements());
    assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
    unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
    SDValue Op0 = SVN->getOperand(0);
    SDValue Op1 = SVN->getOperand(1);

    // True if the mask chunk starting at Start selects every second lane
    // (offset 0 = even lanes, 1 = odd lanes) of one source vector.
    auto CheckInregMask = [&](int Start, int Offset) {
      for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
        if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
          return false;
      return true;
    };
    SDValue V0 = SDValue(N, 0);
    SDValue V1 = SDValue(N, 1);
    if (CheckInregMask(0, 0))
      V0 = Extend(Op0);
    else if (CheckInregMask(0, 1))
      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
    else if (CheckInregMask(0, Mask.size()))
      V0 = Extend(Op1);
    else if (CheckInregMask(0, Mask.size() + 1))
      V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));

    if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
      V1 = Extend(Op1);
    else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
    else if (CheckInregMask(VT.getVectorNumElements(), 0))
      V1 = Extend(Op0);
    else if (CheckInregMask(VT.getVectorNumElements(), 1))
      V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));

    // Only rewrite if at least one of the two results was matched above.
    if (V0.getNode() != N || V1.getNode() != N)
      return DAG.getMergeValues({V0, V1}, DL);
  }

  // MVEEXT(load) -> extload, extload
  if (N->getOperand(0)->getOpcode() == ISD::LOAD)
    return L;

  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // Lower to a stack store and reload:
  // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
  SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  int NumOuts = N->getNumValues();
  assert((NumOuts == 2 || NumOuts == 4) &&
         "Expected 2 or 4 outputs to an MVEEXT");
  EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
      *DAG.getContext());
  if (N->getNumOperands() == 4)
    LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());

  MachinePointerInfo MPI =
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
                               StackPtr, MPI, Align(4));

  // Extending-load each output from its slice of the 16-byte slot.
  for (int I = 0; I < NumOuts; I++) {
    SDValue Ptr = DAG.getNode(
        ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
        DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
        DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
    SDValue Load = DAG.getExtLoad(
        N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
        VT, Chain, Ptr, MPI, LoadVT, Align(4));
    Loads.push_back(Load);
  }

  return DAG.getMergeValues(Loads, DL);
}
18919
// Target DAG-combine dispatcher (tail; the opening line
// "SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N," is elided from
// this listing, as are several case labels where the embedded numbering
// skips). Routes each opcode to its Perform*Combine helper above; an empty
// SDValue() return means "no change was made".
18921                                              DAGCombinerInfo &DCI) const {
18922   switch (N->getOpcode()) {
18923   default: break;
18924   case ISD::SELECT_CC:
18925   case ISD::SELECT:     return PerformSELECTCombine(N, DCI, Subtarget);
18926   case ISD::VSELECT:    return PerformVSELECTCombine(N, DCI, Subtarget);
18927   case ISD::SETCC:      return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18928   case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
18929   case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18930   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
18931   case ISD::SUB:        return PerformSUBCombine(N, DCI, Subtarget);
18932   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
18933   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
18934   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
18935   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
18936   case ISD::BRCOND:
18937   case ISD::BR_CC:      return PerformHWLoopCombine(N, DCI, Subtarget);
18938   case ARMISD::ADDC:
18939   case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
18940   case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
18941   case ARMISD::BFI:     return PerformBFICombine(N, DCI.DAG);
18942   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18943   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18944   case ARMISD::VMOVhr:  return PerformVMOVhrCombine(N, DCI);
18945   case ARMISD::VMOVrh:  return PerformVMOVrhCombine(N, DCI.DAG);
18946   case ISD::STORE:      return PerformSTORECombine(N, DCI, Subtarget);
18947   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
// NOTE(review): the case label(s) for the next handler are elided in this
// listing (embedded numbering skips 18948-18949).
18950     return PerformExtractEltCombine(N, DCI, Subtarget);
// NOTE(review): embedded numbering skips 18951-18953 here (case labels elided).
18954   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18955   case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18956   case ISD::FP_TO_SINT:
18957   case ISD::FP_TO_UINT:
18958     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18959   case ISD::FADD:
18960     return PerformFADDCombine(N, DCI.DAG, Subtarget);
18961   case ISD::FMUL:
18962     return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
// NOTE(review): case label elided here (numbering skips 18963) — presumably
// the intrinsic opcode handled by PerformIntrinsicCombine; confirm upstream.
18964     return PerformIntrinsicCombine(N, DCI);
18965   case ISD::SHL:
18966   case ISD::SRA:
18967   case ISD::SRL:
18968     return PerformShiftCombine(N, DCI, Subtarget);
18969   case ISD::SIGN_EXTEND:
18970   case ISD::ZERO_EXTEND:
18971   case ISD::ANY_EXTEND:
18972     return PerformExtendCombine(N, DCI.DAG, Subtarget);
18973   case ISD::FP_EXTEND:
18974     return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18975   case ISD::SMIN:
18976   case ISD::UMIN:
18977   case ISD::SMAX:
18978   case ISD::UMAX:
18979     return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18980   case ARMISD::CMOV:
18981     return PerformCMOVCombine(N, DCI.DAG);
18982   case ARMISD::BRCOND:
18983     return PerformBRCONDCombine(N, DCI.DAG);
18984   case ARMISD::CMPZ:
18985     return PerformCMPZCombine(N, DCI.DAG);
18986   case ARMISD::CSINC:
18987   case ARMISD::CSINV:
18988   case ARMISD::CSNEG:
18989     return PerformCSETCombine(N, DCI.DAG);
18990   case ISD::LOAD:
18991     return PerformLOADCombine(N, DCI, Subtarget);
18992   case ARMISD::VLD1DUP:
18993   case ARMISD::VLD2DUP:
18994   case ARMISD::VLD3DUP:
18995   case ARMISD::VLD4DUP:
18996     return PerformVLDCombine(N, DCI);
// NOTE(review): case label elided here (numbering skips 18997).
18998     return PerformARMBUILD_VECTORCombine(N, DCI);
18999   case ISD::BITCAST:
19000     return PerformBITCASTCombine(N, DCI, Subtarget);
19001   case ARMISD::PREDICATE_CAST:
19002     return PerformPREDICATE_CASTCombine(N, DCI);
19003   case ARMISD::VECTOR_REG_CAST:
19004     return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
19005   case ARMISD::MVETRUNC:
19006     return PerformMVETruncCombine(N, DCI);
19007   case ARMISD::MVESEXT:
19008   case ARMISD::MVEZEXT:
19009     return PerformMVEExtCombine(N, DCI);
19010   case ARMISD::VCMP:
19011     return PerformVCMPCombine(N, DCI.DAG, Subtarget);
19012   case ISD::VECREDUCE_ADD:
19013     return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
19014   case ARMISD::VADDVs:
19015   case ARMISD::VADDVu:
19016   case ARMISD::VADDLVs:
19017   case ARMISD::VADDLVu:
19018   case ARMISD::VADDLVAs:
19019   case ARMISD::VADDLVAu:
19020   case ARMISD::VMLAVs:
19021   case ARMISD::VMLAVu:
19022   case ARMISD::VMLALVs:
19023   case ARMISD::VMLALVu:
19024   case ARMISD::VMLALVAs:
19025   case ARMISD::VMLALVAu:
19026     return PerformReduceShuffleCombine(N, DCI.DAG);
19027   case ARMISD::VMOVN:
19028     return PerformVMOVNCombine(N, DCI);
19029   case ARMISD::VQMOVNs:
19030   case ARMISD::VQMOVNu:
19031     return PerformVQMOVNCombine(N, DCI);
19032   case ARMISD::VQDMULH:
19033     return PerformVQDMULHCombine(N, DCI);
19034   case ARMISD::ASRL:
19035   case ARMISD::LSRL:
19036   case ARMISD::LSLL:
19037     return PerformLongShiftCombine(N, DCI.DAG);
// The cases below do not rewrite N themselves; they narrow the demanded bits
// of the operands (SMULWB reads only the low 16 bits of operand 1, SMULWT
// only the high 16, and so on) and return an empty SDValue so the combiner
// does not revisit a node SimplifyDemandedBits may have replaced.
19038   case ARMISD::SMULWB: {
19039     unsigned BitWidth = N->getValueType(0).getSizeInBits();
19040     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19041     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19042       return SDValue();
19043     break;
19044   }
19045   case ARMISD::SMULWT: {
19046     unsigned BitWidth = N->getValueType(0).getSizeInBits();
19047     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19048     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19049       return SDValue();
19050     break;
19051   }
19052   case ARMISD::SMLALBB:
19053   case ARMISD::QADD16b:
19054   case ARMISD::QSUB16b:
19055   case ARMISD::UQADD16b:
19056   case ARMISD::UQSUB16b: {
19057     unsigned BitWidth = N->getValueType(0).getSizeInBits();
19058     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19059     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19060         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19061       return SDValue();
19062     break;
19063   }
19064   case ARMISD::SMLALBT: {
19065     unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19066     APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19067     unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19068     APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19069     if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19070         (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19071       return SDValue();
19072     break;
19073   }
19074   case ARMISD::SMLALTB: {
19075     unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19076     APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19077     unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19078     APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19079     if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19080         (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19081       return SDValue();
19082     break;
19083   }
19084   case ARMISD::SMLALTT: {
19085     unsigned BitWidth = N->getValueType(0).getSizeInBits();
19086     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19087     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19088         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19089       return SDValue();
19090     break;
19091   }
19092   case ARMISD::QADD8b:
19093   case ARMISD::QSUB8b:
19094   case ARMISD::UQADD8b:
19095   case ARMISD::UQSUB8b: {
19096     unsigned BitWidth = N->getValueType(0).getSizeInBits();
19097     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19098     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19099         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19100       return SDValue();
19101     break;
19102   }
19103   case ARMISD::VBSP:
// VBSP with identical true/false inputs selects the same value either way.
19104     if (N->getOperand(1) == N->getOperand(2))
19105       return N->getOperand(1);
19106     return SDValue();
// NOTE(review): the case label(s) for this intrinsic switch are elided in
// this listing (numbering skips 19107-19108) — presumably
// ISD::INTRINSIC_VOID / ISD::INTRINSIC_W_CHAIN; confirm upstream. The switch
// dispatches on the intrinsic ID stored in operand 1.
19109     switch (N->getConstantOperandVal(1)) {
19110     case Intrinsic::arm_neon_vld1:
19111     case Intrinsic::arm_neon_vld1x2:
19112     case Intrinsic::arm_neon_vld1x3:
19113     case Intrinsic::arm_neon_vld1x4:
19114     case Intrinsic::arm_neon_vld2:
19115     case Intrinsic::arm_neon_vld3:
19116     case Intrinsic::arm_neon_vld4:
19117     case Intrinsic::arm_neon_vld2lane:
19118     case Intrinsic::arm_neon_vld3lane:
19119     case Intrinsic::arm_neon_vld4lane:
19120     case Intrinsic::arm_neon_vld2dup:
19121     case Intrinsic::arm_neon_vld3dup:
19122     case Intrinsic::arm_neon_vld4dup:
19123     case Intrinsic::arm_neon_vst1:
19124     case Intrinsic::arm_neon_vst1x2:
19125     case Intrinsic::arm_neon_vst1x3:
19126     case Intrinsic::arm_neon_vst1x4:
19127     case Intrinsic::arm_neon_vst2:
19128     case Intrinsic::arm_neon_vst3:
19129     case Intrinsic::arm_neon_vst4:
19130     case Intrinsic::arm_neon_vst2lane:
19131     case Intrinsic::arm_neon_vst3lane:
19132     case Intrinsic::arm_neon_vst4lane:
19133       return PerformVLDCombine(N, DCI);
19134     case Intrinsic::arm_mve_vld2q:
19135     case Intrinsic::arm_mve_vld4q:
19136     case Intrinsic::arm_mve_vst2q:
19137     case Intrinsic::arm_mve_vst4q:
19138       return PerformMVEVLDCombine(N, DCI);
19139     default: break;
19140     }
19141     break;
19142   }
19143   return SDValue();
19144 }
19145
// Continuation of a TargetLowering predicate (signature line elided above;
// presumably isDesirableToTransformToIntegerOp — TODO confirm upstream).
// Allows only f32 loads/stores to be rewritten as integer operations.
19147                                                        EVT VT) const {
19148   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19149 }
19150
// Continuation of ARMTargetLowering::allowsMisalignedMemoryAccesses (the
// opening signature lines are elided in this listing). Returns true when an
// unaligned access of VT is legal on this subtarget; when Fast is non-null it
// additionally reports whether that access is expected to be fast.
19152                                                       Align Alignment,
19154                                                       unsigned *Fast) const {
19155   // Depends what it gets converted into if the type is weird.
19156   if (!VT.isSimple())
19157     return false;
19158
19159   // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19160   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19161   auto Ty = VT.getSimpleVT().SimpleTy;
19162
19163   if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19164     // Unaligned access can use (for example) LRDB, LRDH, LDR
19165     if (AllowsUnaligned) {
19166       if (Fast)
// Unaligned scalar accesses are only considered fast from ARMv7 onwards.
19167         *Fast = Subtarget->hasV7Ops();
19168       return true;
19169     }
19170   }
19171
19172   if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19173     // For any little-endian targets with neon, we can support unaligned ld/st
19174     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19175     // A big-endian target may also explicitly support unaligned accesses
19176     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19177       if (Fast)
19178         *Fast = 1;
19179       return true;
19180     }
19181   }
19182
// Everything below is MVE-specific; without MVE no further cases apply.
19183   if (!Subtarget->hasMVEIntegerOps())
19184     return false;
19185
19186   // These are for predicates
19187   if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19188        Ty == MVT::v2i1)) {
19189     if (Fast)
19190       *Fast = 1;
19191     return true;
19192   }
19193
19194   // These are for truncated stores/narrowing loads. They are fine so long as
19195   // the alignment is at least the size of the item being loaded
19196   if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19197       Alignment >= VT.getScalarSizeInBits() / 8) {
19198     if (Fast)
19199       *Fast = true;
19200     return true;
19201   }
19202
19203   // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19204   // VSTRW.U32 all store the vector register in exactly the same format, and
19205   // differ only in the range of their immediate offset field and the required
19206   // alignment. So there is always a store that can be used, regardless of
19207   // actual type.
19208   //
19209   // For big endian, that is not the case. But can still emit a (VSTRB.U8;
19210   // VREV64.8) pair and get the same effect. This will likely be better than
19211   // aligning the vector through the stack.
19212   if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19213       Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19214       Ty == MVT::v2f64) {
19215     if (Fast)
19216       *Fast = 1;
19217     return true;
19218   }
19219
19220   return false;
19221 }
19222
// Continuation of ARMTargetLowering::getOptimalMemOpType (signature line
// elided above). Chooses a wide FP/vector type for expanding memcpy /
// zero-memset when NEON is usable; MVT::Other defers to generic lowering.
// NOTE(review): this listing elides parts of the two conditions (numbering
// skips 19233 and 19238) — presumably the allowsMisalignedMemoryAccesses
// call heads; confirm upstream.
19224     LLVMContext &Context, const MemOp &Op,
19225     const AttributeList &FuncAttributes) const {
19226   // See if we can use NEON instructions for this...
19227   if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19228       !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19229     unsigned Fast;
19230     if (Op.size() >= 16 &&
19231         (Op.isAligned(Align(16)) ||
19232          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19234                                          Fast))) {
19235       return MVT::v2f64;
19236     } else if (Op.size() >= 8 &&
19237                (Op.isAligned(Align(8)) ||
19239                     MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19240                 Fast))) {
19241       return MVT::f64;
19242     }
19243   }
19244
19245   // Let the target-independent logic figure it out.
19246   return MVT::Other;
19247 }
19248
19249// 64-bit integers are split into their high and low parts and held in two
19250// different registers, so the trunc is free since the low register can just
19251// be used.
19252bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19253 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19254 return false;
19255 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19256 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19257 return (SrcBits == 64 && DestBits == 32);
19258}
19259
// EVT overload of isTruncateFree (signature line elided above): only scalar
// integer i64 -> i32 truncation is free, matching the Type* overload — the
// low half of the register pair is used directly.
19261   if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19262       !DstVT.isInteger())
19263     return false;
19264   unsigned SrcBits = SrcVT.getSizeInBits();
19265   unsigned DestBits = DstVT.getSizeInBits();
19266   return (SrcBits == 64 && DestBits == 32);
19267 }
19268
// Continuation of a zero-extension-free predicate (signature line elided
// above; presumably ARMTargetLowering::isZExtFree — TODO confirm). Returns
// true only for small integer loads, which zero-extend as a side effect.
19270   if (Val.getOpcode() != ISD::LOAD)
19271     return false;
19272
19273   EVT VT1 = Val.getValueType();
19274   if (!VT1.isSimple() || !VT1.isInteger() ||
19275       !VT2.isSimple() || !VT2.isInteger())
19276     return false;
19277
19278   switch (VT1.getSimpleVT().SimpleTy) {
19279   default: break;
19280   case MVT::i1:
19281   case MVT::i8:
19282   case MVT::i16:
19283     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19284     return true;
19285   }
19286
19287   return false;
19288 }
19289
// Continuation of a predicate whose signature line is elided above
// (presumably ARMTargetLowering::isFNegFree — TODO confirm). Keeps the DAG
// combiner from rewriting fneg, since FP16 instructions negate for free.
19291   if (!VT.isSimple())
19292     return false;
19293
19294   // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19295   // negate values directly (fneg is free). So, we don't want to let the DAG
19296   // combiner rewrite fneg into xors and some other instructions. For f16 and
19297   // FullFP16 argument passing, some bitcast nodes may be introduced,
19298   // triggering this DAG combine rewrite, so we are avoiding that with this.
19299   switch (VT.getSimpleVT().SimpleTy) {
19300   default: break;
19301   case MVT::f16:
19302     return Subtarget->hasFullFP16();
19303   }
19304
19305   return false;
19306 }
19307
// Continuation of a shuffle/splat type-conversion hook (signature elided
// above; presumably ARMTargetLowering::shouldConvertSplatType — TODO
// confirm). For MVE, maps FP splat element types to the same-width integer
// type; nullptr means "do not convert".
19309   if (!Subtarget->hasMVEIntegerOps())
19310     return nullptr;
19311   Type *SVIType = SVI->getType();
19312   Type *ScalarType = SVIType->getScalarType();
19313
// f32 splats become i32, f16 splats become i16 — same bit width either way.
19314   if (ScalarType->isFloatTy())
19315     return Type::getInt32Ty(SVIType->getContext());
19316   if (ScalarType->isHalfTy())
19317     return Type::getInt16Ty(SVIType->getContext());
19318   return nullptr;
19319 }
19320
// Continuation of a load-extension desirability hook (signature elided above;
// presumably ARMTargetLowering::isVectorLoadExtDesirable — TODO confirm).
// Decides whether folding an extend into a load (extload) is profitable.
19322   EVT VT = ExtVal.getValueType();
19323
19324   if (!isTypeLegal(VT))
19325     return false;
19326
// Expanding masked loads cannot be turned into extending loads.
19327   if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19328     if (Ld->isExpandingLoad())
19329       return false;
19330   }
19331
19332   if (Subtarget->hasMVEIntegerOps())
19333     return true;
19334
19335   // Don't create a loadext if we can fold the extension into a wide/long
19336   // instruction.
19337   // If there's more than one user instruction, the loadext is desirable no
19338   // matter what. There can be two uses by the same instruction.
19339   if (ExtVal->use_empty() ||
19340       !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19341     return true;
19342
// A single user that is an add/sub/shift can absorb the extension itself
// (e.g. via a widening/long instruction), so avoid the extload.
19343   SDNode *U = *ExtVal->user_begin();
19344   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19345        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19346     return false;
19347
19348   return true;
19349 }
19350
// Continuation of a truncate-for-tail-call predicate (signature elided above;
// presumably ARMTargetLowering::allowTruncateForTailCall — TODO confirm).
// Integer truncation of a legal type never blocks tail-call optimization.
19352   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19353     return false;
19354
19355   if (!isTypeLegal(EVT::getEVT(Ty1)))
19356     return false;
19357
19358   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19359
19360   // Assuming the caller doesn't have a zeroext or signext return parameter,
19361   // truncation all the way down to i1 is valid.
19362   return true;
19363 }
19364
19365/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19366/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19367/// expanded to FMAs when this method returns true, otherwise fmuladd is
19368/// expanded to fmul + fadd.
19369///
19370/// ARM supports both fused and unfused multiply-add operations; we already
19371/// lower a pair of fmul and fadd to the latter so it's not clear that there
19372/// would be a gain or that the gain would be worthwhile enough to risk
19373/// correctness bugs.
19374///
19375/// For MVE, we set this to true as it helps simplify the need for some
19376/// patterns (and we don't have the non-fused floating point instruction).
19377bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19378 EVT VT) const {
19379 if (Subtarget->useSoftFloat())
19380 return false;
19381
19382 if (!VT.isSimple())
19383 return false;
19384
19385 switch (VT.getSimpleVT().SimpleTy) {
19386 case MVT::v4f32:
19387 case MVT::v8f16:
19388 return Subtarget->hasMVEFloatOps();
19389 case MVT::f16:
19390 return Subtarget->useFPVFMx16();
19391 case MVT::f32:
19392 return Subtarget->useFPVFMx();
19393 case MVT::f64:
19394 return Subtarget->useFPVFMx64();
19395 default:
19396 break;
19397 }
19398
19399 return false;
19400}
19401
19402static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19403 if (V < 0)
19404 return false;
19405
19406 unsigned Scale = 1;
19407 switch (VT.getSimpleVT().SimpleTy) {
19408 case MVT::i1:
19409 case MVT::i8:
19410 // Scale == 1;
19411 break;
19412 case MVT::i16:
19413 // Scale == 2;
19414 Scale = 2;
19415 break;
19416 default:
19417 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19418 // Scale == 4;
19419 Scale = 4;
19420 break;
19421 }
19422
19423 if ((V & (Scale - 1)) != 0)
19424 return false;
19425 return isUInt<5>(V / Scale);
19426}
19427
19428static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19429 const ARMSubtarget *Subtarget) {
19430 if (!VT.isInteger() && !VT.isFloatingPoint())
19431 return false;
19432 if (VT.isVector() && Subtarget->hasNEON())
19433 return false;
19434 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19435 !Subtarget->hasMVEFloatOps())
19436 return false;
19437
19438 bool IsNeg = false;
19439 if (V < 0) {
19440 IsNeg = true;
19441 V = -V;
19442 }
19443
19444 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19445
19446 // MVE: size * imm7
19447 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19448 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19449 case MVT::i32:
19450 case MVT::f32:
19451 return isShiftedUInt<7,2>(V);
19452 case MVT::i16:
19453 case MVT::f16:
19454 return isShiftedUInt<7,1>(V);
19455 case MVT::i8:
19456 return isUInt<7>(V);
19457 default:
19458 return false;
19459 }
19460 }
19461
19462 // half VLDR: 2 * imm8
19463 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19464 return isShiftedUInt<8, 1>(V);
19465 // VLDR and LDRD: 4 * imm8
19466 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19467 return isShiftedUInt<8, 2>(V);
19468
19469 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19470 // + imm12 or - imm8
19471 if (IsNeg)
19472 return isUInt<8>(V);
19473 return isUInt<12>(V);
19474 }
19475
19476 return false;
19477}
19478
19479/// isLegalAddressImmediate - Return true if the integer value can be used
19480/// as the offset of the target addressing mode for load / store of the
19481/// given type.
19482static bool isLegalAddressImmediate(int64_t V, EVT VT,
19483 const ARMSubtarget *Subtarget) {
19484 if (V == 0)
19485 return true;
19486
19487 if (!VT.isSimple())
19488 return false;
19489
19490 if (Subtarget->isThumb1Only())
19491 return isLegalT1AddressImmediate(V, VT);
19492 else if (Subtarget->isThumb2())
19493 return isLegalT2AddressImmediate(V, VT, Subtarget);
19494
19495 // ARM mode.
19496 if (V < 0)
19497 V = - V;
19498 switch (VT.getSimpleVT().SimpleTy) {
19499 default: return false;
19500 case MVT::i1:
19501 case MVT::i8:
19502 case MVT::i32:
19503 // +- imm12
19504 return isUInt<12>(V);
19505 case MVT::i16:
19506 // +- imm8
19507 return isUInt<8>(V);
19508 case MVT::f32:
19509 case MVT::f64:
19510 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19511 return false;
19512 return isShiftedUInt<8, 2>(V);
19513 }
19514}
19515
// Continuation of the Thumb2 scaled-addressing-mode check (signature line
// elided above; presumably
// ARMTargetLowering::isLegalT2ScaledAddressingMode — TODO confirm).
// Validates AM.Scale for an r + r << imm style address of type VT.
19517                                                 EVT VT) const {
19518   int Scale = AM.Scale;
19519   if (Scale < 0)
19520     return false;
19521
19522   switch (VT.getSimpleVT().SimpleTy) {
19523   default: return false;
19524   case MVT::i1:
19525   case MVT::i8:
19526   case MVT::i16:
19527   case MVT::i32:
19528     if (Scale == 1)
19529       return true;
19530     // r + r << imm
// Clearing the low bit before the comparison: only even scales 2/4/8 encode
// as a shift here.
19531     Scale = Scale & ~1;
19532     return Scale == 2 || Scale == 4 || Scale == 8;
19533   case MVT::i64:
19534     // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19535     // version in Thumb mode.
19536     // r + r
19537     if (Scale == 1)
19538       return true;
19539     // r * 2 (this can be lowered to r + r).
19540     if (!AM.HasBaseReg && Scale == 2)
19541       return true;
19542     return false;
19543   case MVT::isVoid:
19544     // Note, we allow "void" uses (basically, uses that aren't loads or
19545     // stores), because arm allows folding a scale into many arithmetic
19546     // operations. This should be made more precise and revisited later.
19547
19548     // Allow r << imm, but the imm has to be a multiple of two.
19549     if (Scale & 1) return false;
19550     return isPowerOf2_32(Scale);
19551   }
19552 }
19553
// Continuation of the Thumb1 scaled-addressing-mode check (signature line
// elided above; presumably
// ARMTargetLowering::isLegalT1ScaledAddressingMode — TODO confirm).
19555                                                 EVT VT) const {
19556   const int Scale = AM.Scale;
19557
19558   // Negative scales are not supported in Thumb1.
19559   if (Scale < 0)
19560     return false;
19561
19562   // Thumb1 addressing modes do not support register scaling excepting the
19563   // following cases:
19564   // 1. Scale == 1 means no scaling.
19565   // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19566   return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19567 }
19568
19569 /// isLegalAddressingMode - Return true if the addressing mode represented
19570 /// by AM is legal for this target, for a load/store of the specified type.
// NOTE(review): the opening signature line is elided in this listing
// (presumably "bool ARMTargetLowering::isLegalAddressingMode(const
// DataLayout &DL," — confirm upstream).
19572                                               const AddrMode &AM, Type *Ty,
19573                                               unsigned AS, Instruction *I) const {
19574   EVT VT = getValueType(DL, Ty, true);
19575   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19576     return false;
19577
19578   // Can never fold addr of global into load/store.
19579   if (AM.BaseGV)
19580     return false;
19581
19582   switch (AM.Scale) {
19583   case 0:  // no scale reg, must be "r+i" or "r", or "i".
19584     break;
19585   default:
19586     // ARM doesn't support any R+R*scale+imm addr modes.
19587     if (AM.BaseOffs)
19588       return false;
19589
19590     if (!VT.isSimple())
19591       return false;
19592
// Thumb variants delegate to their mode-specific scale checks.
19593     if (Subtarget->isThumb1Only())
19594       return isLegalT1ScaledAddressingMode(AM, VT);
19595
19596     if (Subtarget->isThumb2())
19597       return isLegalT2ScaledAddressingMode(AM, VT);
19598
// ARM-mode scaled addressing, per access type.
19599     int Scale = AM.Scale;
19600     switch (VT.getSimpleVT().SimpleTy) {
19601     default: return false;
19602     case MVT::i1:
19603     case MVT::i8:
19604     case MVT::i32:
19605       if (Scale < 0) Scale = -Scale;
19606       if (Scale == 1)
19607         return true;
19608       // r + r << imm
19609       return isPowerOf2_32(Scale & ~1);
19610     case MVT::i16:
19611     case MVT::i64:
19612       // r +/- r
19613       if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19614         return true;
19615       // r * 2 (this can be lowered to r + r).
19616       if (!AM.HasBaseReg && Scale == 2)
19617         return true;
19618       return false;
19619
19620     case MVT::isVoid:
19621       // Note, we allow "void" uses (basically, uses that aren't loads or
19622       // stores), because arm allows folding a scale into many arithmetic
19623       // operations. This should be made more precise and revisited later.
19624
19625       // Allow r << imm, but the imm has to be a multiple of two.
19626       if (Scale & 1) return false;
19627       return isPowerOf2_32(Scale);
19628     }
19629   }
19630   return true;
19631 }
19632
19633 /// isLegalICmpImmediate - Return true if the specified immediate is legal
19634 /// icmp immediate, that is the target has icmp instructions which can compare
19635 /// a register against the immediate without having to materialize the
19636 /// immediate into a register.
// NOTE(review): the signature line is elided in this listing (presumably
// "bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {").
19638   // Thumb2 and ARM modes can use cmn for negative immediates.
19639   if (!Subtarget->isThumb())
// Either CMP #imm or CMN #-imm must be encodable as an ARM-mode modified
// immediate.
19640     return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19641         ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19642   if (Subtarget->isThumb2())
19643     return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19644         ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19645   // Thumb1 doesn't have cmn, and only 8-bit immediates.
19646   return Imm >= 0 && Imm <= 255;
19647 }
19648
19649 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
19650 /// *or sub* immediate, that is the target has add or sub instructions which can
19651 /// add a register with the immediate without having to materialize the
19652 /// immediate into a register.
// NOTE(review): the signature line is elided in this listing (presumably
// "bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {").
19654   // Same encoding for add/sub, just flip the sign.
19655   uint64_t AbsImm = AbsoluteValue(Imm);
19656   if (!Subtarget->isThumb())
19657     return ARM_AM::getSOImmVal(AbsImm) != -1;
19658   if (Subtarget->isThumb2())
19659     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19660   // Thumb1 only has 8-bit unsigned immediate.
19661   return AbsImm <= 255;
19662 }
19663
19664 // Return false to prevent folding
19665 // (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19666 // if the folding leads to worse code.
// NOTE(review): the signature line is elided here (presumably
// "bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,").
19668                                                     SDValue ConstNode) const {
19669   // Let the DAGCombiner decide for vector types and large types.
19670   const EVT VT = AddNode.getValueType();
19671   if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19672     return true;
19673
19674   // It is worse if c0 is legal add immediate, while c1*c0 is not
19675   // and has to be composed by at least two instructions.
19676   const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19677   const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19678   const int64_t C0 = C0Node->getSExtValue();
19679   APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
// NOTE(review): the guarding condition for this early return is elided in
// this listing (numbering skips 19680) — presumably an
// isLegalAddImmediate-based test on C0/CA; confirm upstream.
19681     return true;
19682   if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19683     return false;
19684
19685   // Default to true and let the DAGCombiner decide.
19686   return true;
19687 }
19688
// ARM-mode helper for pre/post-indexed address formation (the opening line
// "static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT," is elided in
// this listing, as are the dyn_cast guards at the elided numbers 19699,
// 19713 and 19727). On success it fills Base/Offset/isInc for Ptr, which
// must be an ADD or SUB of a base and an offset.
19690                                    bool isSEXTLoad, SDValue &Base,
19691                                    SDValue &Offset, bool &isInc,
19692                                    SelectionDAG &DAG) {
19693   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19694     return false;
19695
19696   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19697     // AddressingMode 3
19698     Base = Ptr->getOperand(0);
19700       int RHSC = (int)RHS->getZExtValue();
// Addressing mode 3 has an 8-bit offset; negative constants on an ADD are
// canonicalized to a decrement.
19701       if (RHSC < 0 && RHSC > -256) {
19702         assert(Ptr->getOpcode() == ISD::ADD);
19703         isInc = false;
19704         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19705         return true;
19706       }
19707     }
19708     isInc = (Ptr->getOpcode() == ISD::ADD);
19709     Offset = Ptr->getOperand(1);
19710     return true;
19711   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19712     // AddressingMode 2
19714       int RHSC = (int)RHS->getZExtValue();
// Addressing mode 2 has a 12-bit offset range.
19715       if (RHSC < 0 && RHSC > -0x1000) {
19716         assert(Ptr->getOpcode() == ISD::ADD);
19717         isInc = false;
19718         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19719         Base = Ptr->getOperand(0);
19720         return true;
19721       }
19722     }
19723
19724     if (Ptr->getOpcode() == ISD::ADD) {
19725       isInc = true;
// If one ADD operand is a recognizable shift, make it the offset so the
// shift folds into the addressing mode.
19726       ARM_AM::ShiftOpc ShOpcVal=
19728       if (ShOpcVal != ARM_AM::no_shift) {
19729         Base = Ptr->getOperand(1);
19730         Offset = Ptr->getOperand(0);
19731       } else {
19732         Base = Ptr->getOperand(0);
19733         Offset = Ptr->getOperand(1);
19734       }
19735       return true;
19736     }
19737
19738     isInc = (Ptr->getOpcode() == ISD::ADD);
19739     Base = Ptr->getOperand(0);
19740     Offset = Ptr->getOperand(1);
19741     return true;
19742   }
19743
19744   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19745   return false;
19746 }
19747
// Thumb2 counterpart of getARMIndexedAddressParts (the opening line
// "static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT," and the
// ConstantSDNode dyn_cast guard at elided number 19756 are missing from this
// listing). Thumb2 indexed forms take an 8-bit offset.
19749                                      bool isSEXTLoad, SDValue &Base,
19750                                      SDValue &Offset, bool &isInc,
19751                                      SelectionDAG &DAG) {
19752   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19753     return false;
19754
19755   Base = Ptr->getOperand(0);
19757     int RHSC = (int)RHS->getZExtValue();
19758     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19759       assert(Ptr->getOpcode() == ISD::ADD);
19760       isInc = false;
19761       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19762       return true;
19763     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19764       isInc = Ptr->getOpcode() == ISD::ADD;
19765       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19766       return true;
19767     }
19768   }
19769
19770   return false;
19771 }
19772
// MVE counterpart for forming pre/post-indexed vector load/store addresses.
// NOTE(review): this listing elides one signature line (numbering skips
// 19775 — presumably "SDValue &Base, SDValue &Offset,") and the RHS
// ConstantSDNode cast at 19787; confirm upstream.
19773 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19774                                       bool isSEXTLoad, bool IsMasked, bool isLE,
19776                                       bool &isInc, SelectionDAG &DAG) {
19777   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19778     return false;
19779   if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19780     return false;
19781
19782   // We allow LE non-masked loads to change the type (for example use a vldrb.8
19783   // as opposed to a vldrw.32). This can allow extra addressing modes or
19784   // alignments for what is otherwise an equivalent instruction.
19785   bool CanChangeType = isLE && !IsMasked;
19786
19788   int RHSC = (int)RHS->getZExtValue();
19789
// Checks whether RHSC fits the signed, scaled 7-bit offset range of an MVE
// load/store and, if so, records the increment direction and offset.
19790   auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19791     if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19792       assert(Ptr->getOpcode() == ISD::ADD);
19793       isInc = false;
19794       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19795       return true;
19796     } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19797       isInc = Ptr->getOpcode() == ISD::ADD;
19798       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19799       return true;
19800     }
19801     return false;
19802   };
19803
19804   // Try to find a matching instruction based on s/zext, Alignment, Offset and
19805   // (in BE/masked) type.
19806   Base = Ptr->getOperand(0);
19807   if (VT == MVT::v4i16) {
19808     if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19809       return true;
19810   } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19811     if (IsInRange(RHSC, 0x80, 1))
19812       return true;
19813   } else if (Alignment >= 4 &&
19814              (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19815              IsInRange(RHSC, 0x80, 4))
19816     return true;
19817   else if (Alignment >= 2 &&
19818            (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19819            IsInRange(RHSC, 0x80, 2))
19820     return true;
19821   else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19822     return true;
19823   return false;
19824 }
19825
19826 /// getPreIndexedAddressParts - returns true by value, base pointer and
19827 /// offset pointer and addressing mode by reference if the node's address
19828 /// can be legally represented as pre-indexed load / store address.
19829 bool
// NOTE(review): this listing elides parts of the signature (numbering skips
// 19830 and 19832 — presumably the SDNode*/Base parameters and the
// ISD::MemIndexedMode &AM parameter) and a few statement lines below (19861,
// 19872, 19883); confirm upstream.
19831                                              SDValue &Offset,
19833                                              SelectionDAG &DAG) const {
// Thumb1 has no pre-indexed load/store forms.
19834   if (Subtarget->isThumb1Only())
19835     return false;
19836
19837   EVT VT;
19838   SDValue Ptr;
19839   Align Alignment;
19840   unsigned AS = 0;
19841   bool isSEXTLoad = false;
19842   bool IsMasked = false;
// Extract the memory operand details from whichever load/store node kind
// this is (plain or masked).
19843   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19844     Ptr = LD->getBasePtr();
19845     VT = LD->getMemoryVT();
19846     Alignment = LD->getAlign();
19847     AS = LD->getAddressSpace();
19848     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19849   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19850     Ptr = ST->getBasePtr();
19851     VT = ST->getMemoryVT();
19852     Alignment = ST->getAlign();
19853     AS = ST->getAddressSpace();
19854   } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19855     Ptr = LD->getBasePtr();
19856     VT = LD->getMemoryVT();
19857     Alignment = LD->getAlign();
19858     AS = LD->getAddressSpace();
19859     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19860     IsMasked = true;
19862     Ptr = ST->getBasePtr();
19863     VT = ST->getMemoryVT();
19864     Alignment = ST->getAlign();
19865     AS = ST->getAddressSpace();
19866     IsMasked = true;
19867   } else
19868     return false;
19869
19870   unsigned Fast = 0;
19871   if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment,
19873     // Only generate post-increment or pre-increment forms when a real
19874     // hardware instruction exists for them. Do not emit postinc/preinc
19875     // if the operation will end up as a libcall.
19876     return false;
19877   }
19878
19879   bool isInc;
19880   bool isLegal = false;
// Vector accesses use the MVE helper; scalars use the Thumb2 or ARM helper.
19881   if (VT.isVector())
19882     isLegal = Subtarget->hasMVEIntegerOps() &&
19884                   Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19885                   Subtarget->isLittle(), Base, Offset, isInc, DAG);
19886   else {
19887     if (Subtarget->isThumb2())
19888       isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19889                                          Offset, isInc, DAG);
19890     else
19891       isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19892                                           Offset, isInc, DAG);
19893   }
19894   if (!isLegal)
19895     return false;
19896
19897   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19898   return true;
19899 }
19900
19901/// getPostIndexedAddressParts - returns true by value, base pointer and
19902/// offset pointer and addressing mode by reference if this node can be
19903/// combined with a load / store to form a post-indexed load / store.
// Mirrors getPreIndexedAddressParts, but Op is the pointer-arithmetic node
// that follows the memory access; on success returns POST_INC/POST_DEC.
19905 SDValue &Base,
19906 SDValue &Offset,
19908 SelectionDAG &DAG) const {
19909 EVT VT;
19910 SDValue Ptr;
19911 Align Alignment;
19912 bool isSEXTLoad = false, isNonExt;
19913 bool IsMasked = false;
 // Collect the memory operands; isNonExt records that the access is a plain
 // (non-extending load / non-truncating store) i-sized access.
19914 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19915 VT = LD->getMemoryVT();
19916 Ptr = LD->getBasePtr();
19917 Alignment = LD->getAlign();
19918 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19919 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19920 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19921 VT = ST->getMemoryVT();
19922 Ptr = ST->getBasePtr();
19923 Alignment = ST->getAlign();
19924 isNonExt = !ST->isTruncatingStore();
19925 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19926 VT = LD->getMemoryVT();
19927 Ptr = LD->getBasePtr();
19928 Alignment = LD->getAlign();
19929 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19930 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19931 IsMasked = true;
 // NOTE(review): this arm presumably handles MaskedStoreSDNode — the
 // dyn_cast line appears to be missing from this excerpt; confirm upstream.
19933 VT = ST->getMemoryVT();
19934 Ptr = ST->getBasePtr();
19935 Alignment = ST->getAlign();
19936 isNonExt = !ST->isTruncatingStore();
19937 IsMasked = true;
19938 } else
19939 return false;

19941 if (Subtarget->isThumb1Only()) {
19942 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19943 // must be non-extending/truncating, i32, with an offset of 4.
19944 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19945 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19946 return false;
19947 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19948 if (!RHS || RHS->getZExtValue() != 4)
19949 return false;
19950 if (Alignment < Align(4))
19951 return false;

19953 Offset = Op->getOperand(1);
19954 Base = Op->getOperand(0);
19955 AM = ISD::POST_INC;
19956 return true;
19957 }

19959 bool isInc;
19960 bool isLegal = false;
 // Same per-ISA dispatch as the pre-indexed case, but applied to Op (the
 // add/sub feeding the base pointer) instead of the pointer itself.
19961 if (VT.isVector())
19962 isLegal = Subtarget->hasMVEIntegerOps() &&
19963 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19964 Subtarget->isLittle(), Base, Offset,
19965 isInc, DAG);
19966 else {
19967 if (Subtarget->isThumb2())
19968 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19969 isInc, DAG);
19970 else
19971 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19972 isInc, DAG);
19973 }
19974 if (!isLegal)
19975 return false;

19977 if (Ptr != Base) {
19978 // Swap base ptr and offset to catch more post-index load / store when
19979 // it's legal. In Thumb2 mode, offset must be an immediate.
19980 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19981 !Subtarget->isThumb2())
 // NOTE(review): the swap statement (Base <-> Offset) appears to be
 // missing from this excerpt; confirm upstream.

19984 // Post-indexed load / store update the base pointer.
19985 if (Ptr != Base)
19986 return false;
19987 }

19989 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19990 return true;
19991}
19992
// computeKnownBitsForTargetNode - compute known-zero/known-one bits for
// ARM-specific DAG nodes (carry ops, CMOV, BFI, VGETLANE, VMOVrh, CSEL
// variants, and the VORR/VBIC-with-immediate forms). Known starts reset and
// is only populated for the opcodes handled below.
19994 KnownBits &Known,
19995 const APInt &DemandedElts,
19996 const SelectionDAG &DAG,
19997 unsigned Depth) const {
19998 unsigned BitWidth = Known.getBitWidth();
19999 Known.resetAll();
20000 switch (Op.getOpcode()) {
20001 default: break;
20002 case ARMISD::ADDC:
20003 case ARMISD::ADDE:
20004 case ARMISD::SUBC:
20005 case ARMISD::SUBE:
20006 // Special cases when we convert a carry to a boolean.
20007 if (Op.getResNo() == 0) {
20008 SDValue LHS = Op.getOperand(0);
20009 SDValue RHS = Op.getOperand(1);
20010 // (ADDE 0, 0, C) will give us a single bit.
20011 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
20012 isNullConstant(RHS)) {
 // NOTE(review): the statement clearing all but the low bit appears to
 // be missing from this excerpt; confirm upstream.
20014 return;
20015 }
20016 }
20017 break;
20018 case ARMISD::CMOV: {
20019 // Bits are known zero/one if known on the LHS and RHS.
20020 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
20021 if (Known.isUnknown())
20022 return;

20024 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
 // Keep only bits known in both possible results of the conditional move.
20025 Known = Known.intersectWith(KnownRHS);
20026 return;
20027 }
 // (case for the intrinsic-with-chain opcode; the case label itself is
 // missing from this excerpt.)
20029 Intrinsic::ID IntID =
20030 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
20031 switch (IntID) {
20032 default: return;
20033 case Intrinsic::arm_ldaex:
20034 case Intrinsic::arm_ldrex: {
 // ldrex/ldaex zero-extend from the memory width, so all bits above
 // the loaded width are known zero.
20035 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
20036 unsigned MemBits = VT.getScalarSizeInBits();
20037 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
20038 return;
20039 }
20040 }
20041 }
20042 case ARMISD::BFI: {
20043 // Conservatively, we can recurse down the first operand
20044 // and just mask out all affected bits.
20045 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);

20047 // The operand to BFI is already a mask suitable for removing the bits it
20048 // sets.
20049 const APInt &Mask = Op.getConstantOperandAPInt(2);
20050 Known.Zero &= Mask;
20051 Known.One &= Mask;
20052 return;
20053 }
20054 case ARMISD::VGETLANEs:
20055 case ARMISD::VGETLANEu: {
 // Extract-lane: compute known bits of just the demanded source element,
 // then sign-extend (VGETLANEs) or zero-extend (VGETLANEu) to the scalar
 // result width.
20056 const SDValue &SrcSV = Op.getOperand(0);
20057 EVT VecVT = SrcSV.getValueType();
20058 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20059 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20060 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20061 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20062 "VGETLANE index out of bounds");
20063 unsigned Idx = Pos->getZExtValue();
20064 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20065 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);

20067 EVT VT = Op.getValueType();
20068 const unsigned DstSz = VT.getScalarSizeInBits();
20069 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20070 (void)SrcSz;
20071 assert(SrcSz == Known.getBitWidth());
20072 assert(DstSz > SrcSz);
20073 if (Op.getOpcode() == ARMISD::VGETLANEs)
20074 Known = Known.sext(DstSz);
20075 else {
20076 Known = Known.zext(DstSz);
20077 }
20078 assert(DstSz == Known.getBitWidth());
20079 break;
20080 }
20081 case ARMISD::VMOVrh: {
 // VMOVrh moves a half-precision value into a GPR, zero-filling the top
 // 16 bits.
20082 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20083 assert(KnownOp.getBitWidth() == 16);
20084 Known = KnownOp.zext(32);
20085 break;
20086 }
20087 case ARMISD::CSINC:
20088 case ARMISD::CSINV:
20089 case ARMISD::CSNEG: {
20090 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20091 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);

20093 // The result is either:
20094 // CSINC: KnownOp0 or KnownOp1 + 1
20095 // CSINV: KnownOp0 or ~KnownOp1
20096 // CSNEG: KnownOp0 or KnownOp1 * -1
20097 if (Op.getOpcode() == ARMISD::CSINC)
20098 KnownOp1 =
20099 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20100 else if (Op.getOpcode() == ARMISD::CSINV)
20101 std::swap(KnownOp1.Zero, KnownOp1.One);
20102 else if (Op.getOpcode() == ARMISD::CSNEG)
20103 KnownOp1 = KnownBits::mul(KnownOp1,
 // NOTE(review): the second multiplicand (a constant -1) appears to be
 // missing from this excerpt; confirm upstream.

 // Either operand may be selected, so keep only bits known in both.
20106 Known = KnownOp0.intersectWith(KnownOp1);
20107 break;
20108 }
20109 case ARMISD::VORRIMM:
20110 case ARMISD::VBICIMM: {
20111 unsigned Encoded = Op.getConstantOperandVal(1);
20112 unsigned DecEltBits = 0;
20113 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);

20115 unsigned EltBits = Op.getScalarValueSizeInBits();
20116 if (EltBits != DecEltBits) {
20117 // Be conservative: only update Known when EltBits == DecEltBits.
20118 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20119 // that changes in the future, doing nothing here is safer than risking
20120 // subtle bugs.
20121 break;
20122 }

20124 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20125 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20126 APInt Imm(DecEltBits, DecodedVal);

 // VORR with an immediate forces the Imm bits to one; VBIC forces them
 // to zero. The remaining bits come straight from the LHS.
20128 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20129 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20130 break;
20131 }
20132 }
20133}
20134
// targetShrinkDemandedConstant - try to replace the constant RHS of an
// i32 AND with a cheaper-to-materialize mask (uxtb/uxth pattern, small
// positive, or small negative) that is equivalent on the demanded bits.
// Returns true if a replacement was committed via TLO.CombineTo.
20136 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20137 TargetLoweringOpt &TLO) const {
20138 // Delay optimization, so we don't have to deal with illegal types, or block
20139 // optimizations.
20140 if (!TLO.LegalOps)
20141 return false;

20143 // Only optimize AND for now.
20144 if (Op.getOpcode() != ISD::AND)
20145 return false;

20147 EVT VT = Op.getValueType();

20149 // Ignore vectors.
20150 if (VT.isVector())
20151 return false;

20153 assert(VT == MVT::i32 && "Unexpected integer type");

20155 // Make sure the RHS really is a constant.
20156 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20157 if (!C)
20158 return false;

20160 unsigned Mask = C->getZExtValue();

 // ShrunkMask: smallest mask equivalent on the demanded bits.
 // ExpandedMask: largest mask equivalent on the demanded bits.
 // Any candidate between the two (bitwise) is acceptable.
20162 unsigned Demanded = DemandedBits.getZExtValue();
20163 unsigned ShrunkMask = Mask & Demanded;
20164 unsigned ExpandedMask = Mask | ~Demanded;

20166 // If the mask is all zeros, let the target-independent code replace the
20167 // result with zero.
20168 if (ShrunkMask == 0)
20169 return false;

20171 // If the mask is all ones, erase the AND. (Currently, the target-independent
20172 // code won't do this, so we have to do it explicitly to avoid an infinite
20173 // loop in obscure cases.)
20174 if (ExpandedMask == ~0U)
20175 return TLO.CombineTo(Op, Op.getOperand(0));

 // A candidate is legal iff it lies between ShrunkMask and ExpandedMask:
 // it must cover every required one-bit and set no forbidden bit.
20177 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20178 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20179 };
20180 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
 // Rebuilding an identical AND would loop forever; report success as-is.
20181 if (NewMask == Mask)
20182 return true;
20183 SDLoc DL(Op);
20184 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20185 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20186 return TLO.CombineTo(Op, NewOp);
20187 };

20189 // Prefer uxtb mask.
20190 if (IsLegalMask(0xFF))
20191 return UseMask(0xFF);

20193 // Prefer uxth mask.
20194 if (IsLegalMask(0xFFFF))
20195 return UseMask(0xFFFF);

20197 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20198 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20199 if (ShrunkMask < 256)
20200 return UseMask(ShrunkMask);

20202 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20203 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20204 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20205 return UseMask(ExpandedMask);

20207 // Potential improvements:
20208 //
20209 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20210 // We could try to prefer Thumb1 immediates which can be lowered to a
20211 // two-instruction sequence.
20212 // We could try to recognize more legal ARM/Thumb2 immediates here.

20214 return false;
20215}
20216
// SimplifyDemandedBitsForTargetNode - target hook for demanded-bits
// simplification of ARM-specific nodes (long shifts and VBIC-with-immediate);
// unhandled opcodes fall through to the generic TargetLowering implementation.
20218 SDValue Op, const APInt &OriginalDemandedBits,
20219 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20220 unsigned Depth) const {
20221 unsigned Opc = Op.getOpcode();

20223 switch (Opc) {
20224 case ARMISD::ASRL:
20225 case ARMISD::LSRL: {
20226 // If this is result 0 and the other result is unused, see if the demand
20227 // bits allow us to shrink this long shift into a standard small shift in
20228 // the opposite direction.
20229 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20230 isa<ConstantSDNode>(Op->getOperand(2))) {
20231 unsigned ShAmt = Op->getConstantOperandVal(2);
 // Only the top ShAmt bits are demanded, so they can be produced by
 // shifting the high input word left by (32 - ShAmt).
20232 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20233 << (32 - ShAmt)))
20234 return TLO.CombineTo(
20235 Op, TLO.DAG.getNode(
20236 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20237 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20238 }
20239 break;
20240 }
20241 case ARMISD::VBICIMM: {
 // If none of the bits cleared by the VBIC immediate are demanded, the
 // VBIC is a no-op and can be replaced by its input.
20242 SDValue Op0 = Op.getOperand(0);
20243 unsigned ModImm = Op.getConstantOperandVal(1);
20244 unsigned EltBits = 0;
20245 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20246 if ((OriginalDemandedBits & Mask) == 0)
20247 return TLO.CombineTo(Op, Op0);
20248 }
20249 }

 // Defer everything else to the generic implementation.
20252 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20253}
20254
20255//===----------------------------------------------------------------------===//
20256// ARM Inline Assembly Support
20257//===----------------------------------------------------------------------===//
20258
20259const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20260 // At this point, we have to lower this constraint to something else, so we
20261 // lower it to an "r" or "w". However, by doing this we will force the result
20262 // to be in register, while the X constraint is much more permissive.
20263 //
20264 // Although we are correct (we are free to emit anything, without
20265 // constraints), we might break use cases that would expect us to be more
20266 // efficient and emit something else.
20267 if (!Subtarget->hasVFP2Base())
20268 return "r";
20269 if (ConstraintVT.isFloatingPoint())
20270 return "w";
20271 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20272 (ConstraintVT.getSizeInBits() == 64 ||
20273 ConstraintVT.getSizeInBits() == 128))
20274 return "w";
20275
20276 return "r";
20277}
20278
20279/// getConstraintType - Given a constraint letter, return the type of
20280/// constraint it is for this target.
// Single-letter constraints map to register classes ('l','w','h','x','t'),
// an immediate ('j'), or memory ('Q'); two-letter 'T*' constraints are
// register classes and 'U*' constraints are addresses. Anything else falls
// through to the generic TargetLowering classification.
20283 unsigned S = Constraint.size();
20284 if (S == 1) {
20285 switch (Constraint[0]) {
20286 default: break;
20287 case 'l': return C_RegisterClass;
20288 case 'w': return C_RegisterClass;
20289 case 'h': return C_RegisterClass;
20290 case 'x': return C_RegisterClass;
20291 case 't': return C_RegisterClass;
20292 case 'j': return C_Immediate; // Constant for movw.
20293 // An address with a single base register. Due to the way we
20294 // currently handle addresses it is the same as an 'r' memory constraint.
20295 case 'Q': return C_Memory;
20296 }
20297 } else if (S == 2) {
20298 switch (Constraint[0]) {
20299 default: break;
20300 case 'T': return C_RegisterClass;
20301 // All 'U+' constraints are addresses.
20302 case 'U': return C_Memory;
20303 }
20304 }
20305 return TargetLowering::getConstraintType(Constraint);
20306}
20307
20308/// Examine constraint type and operand type and determine a weight value.
20309/// This object must already have been set up with the operand type
20310/// and the current alternative constraint selected.
// Weights: 'l' on an integer gets CW_SpecificReg on Thumb (where low regs
// are a genuinely restricted set) and CW_Register otherwise; 'w' on a
// floating-point value gets CW_Register; everything else defers to the
// generic TargetLowering weighting.
20313 AsmOperandInfo &info, const char *constraint) const {
20315 Value *CallOperandVal = info.CallOperandVal;
20316 // If we don't have a value, we can't do a match,
20317 // but allow it at the lowest weight.
20318 if (!CallOperandVal)
20319 return CW_Default;
20320 Type *type = CallOperandVal->getType();
20321 // Look at the constraint type.
20322 switch (*constraint) {
20323 default:
20325 break;
20326 case 'l':
20327 if (type->isIntegerTy()) {
20328 if (Subtarget->isThumb())
20329 weight = CW_SpecificReg;
20330 else
20331 weight = CW_Register;
20332 }
20333 break;
20334 case 'w':
20335 if (type->isFloatingPointTy())
20336 weight = CW_Register;
20337 break;
20338 }
20339 return weight;
20340}
20341
20342static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20343 if (PR == 0 || VT == MVT::Other)
20344 return false;
20345 if (ARM::SPRRegClass.contains(PR))
20346 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20347 if (ARM::DPRRegClass.contains(PR))
20348 return VT != MVT::f64 && !VT.is64BitVector();
20349 return false;
20350}
20351
20352using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20353
// getRegForInlineAsmConstraint - resolve an ARM inline-asm register
// constraint (and value type) to a register class, or to a specific
// physical register for the named-register forms {cc} and {r14}.
20355 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20356 switch (Constraint.size()) {
20357 case 1:
20358 // GCC ARM Constraint Letters
20359 switch (Constraint[0]) {
20360 case 'l': // Low regs or general regs.
20361 if (Subtarget->isThumb())
20362 return RCPair(0U, &ARM::tGPRRegClass);
20363 return RCPair(0U, &ARM::GPRRegClass);
20364 case 'h': // High regs or no regs.
20365 if (Subtarget->isThumb())
20366 return RCPair(0U, &ARM::hGPRRegClass);
20367 break;
20368 case 'r':
20369 if (Subtarget->isThumb1Only())
20370 return RCPair(0U, &ARM::tGPRRegClass);
20371 return RCPair(0U, &ARM::GPRRegClass);
 // 'w': any VFP/NEON register, sized by the value type.
20372 case 'w':
20373 if (VT == MVT::Other)
20374 break;
20375 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20376 return RCPair(0U, &ARM::SPRRegClass);
20377 if (VT.getSizeInBits() == 64)
20378 return RCPair(0U, &ARM::DPRRegClass);
20379 if (VT.getSizeInBits() == 128)
20380 return RCPair(0U, &ARM::QPRRegClass);
20381 break;
 // 'x': the low half of the FP register file (d0-d7 / q0-q3).
20382 case 'x':
20383 if (VT == MVT::Other)
20384 break;
20385 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20386 return RCPair(0U, &ARM::SPR_8RegClass);
20387 if (VT.getSizeInBits() == 64)
20388 return RCPair(0U, &ARM::DPR_8RegClass);
20389 if (VT.getSizeInBits() == 128)
20390 return RCPair(0U, &ARM::QPR_8RegClass);
20391 break;
 // 't': VFP2-addressable registers.
20392 case 't':
20393 if (VT == MVT::Other)
20394 break;
20395 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20396 return RCPair(0U, &ARM::SPRRegClass);
20397 if (VT.getSizeInBits() == 64)
20398 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20399 if (VT.getSizeInBits() == 128)
20400 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20401 break;
20402 }
20403 break;

 // 'Te' / 'To': even / odd low Thumb registers.
20405 case 2:
20406 if (Constraint[0] == 'T') {
20407 switch (Constraint[1]) {
20408 default:
20409 break;
20410 case 'e':
20411 return RCPair(0U, &ARM::tGPREvenRegClass);
20412 case 'o':
20413 return RCPair(0U, &ARM::tGPROddRegClass);
20414 }
20415 }
20416 break;

20418 default:
20419 break;
20420 }

20422 if (StringRef("{cc}").equals_insensitive(Constraint))
20423 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);

20425 // r14 is an alias of lr.
20426 if (StringRef("{r14}").equals_insensitive(Constraint))
20427 return std::make_pair(unsigned(ARM::LR), getRegClassFor(MVT::i32));

 // Fall back to the generic resolver, then reject register/type pairings
 // that cannot physically hold the value (see isIncompatibleReg).
20429 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20430 if (isIncompatibleReg(RCP.first, VT))
20431 return {0, nullptr};
20432 return RCP;
20433}
20434
20435/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20436/// vector. If it is invalid, don't add anything to Ops.
// Handles the ARM immediate constraint letters ('j', 'I'-'O'): each case
// validates that the constant operand fits the constraint's range for the
// current subtarget (Thumb1 / Thumb2 / ARM) and, if so, emits it as a
// target constant. Anything unhandled defers to the generic implementation.
20438 StringRef Constraint,
20439 std::vector<SDValue> &Ops,
20440 SelectionDAG &DAG) const {
20441 SDValue Result;

20443 // Currently only support length 1 constraints.
20444 if (Constraint.size() != 1)
20445 return;

20447 char ConstraintLetter = Constraint[0];
20448 switch (ConstraintLetter) {
20449 default: break;
20450 case 'j':
20451 case 'I': case 'J': case 'K': case 'L':
20452 case 'M': case 'N': case 'O':
 // (The dyn_cast of the operand to ConstantSDNode is missing from this
 // excerpt; C below is that constant, or null.)
20454 if (!C)
20455 return;

20457 int64_t CVal64 = C->getSExtValue();
20458 int CVal = (int) CVal64;
20459 // None of these constraints allow values larger than 32 bits. Check
20460 // that the value fits in an int.
20461 if (CVal != CVal64)
20462 return;

20464 switch (ConstraintLetter) {
20465 case 'j':
20466 // Constant suitable for movw, must be between 0 and
20467 // 65535.
20468 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20469 if (CVal >= 0 && CVal <= 65535)
20470 break;
20471 return;
20472 case 'I':
20473 if (Subtarget->isThumb1Only()) {
20474 // This must be a constant between 0 and 255, for ADD
20475 // immediates.
20476 if (CVal >= 0 && CVal <= 255)
20477 break;
20478 } else if (Subtarget->isThumb2()) {
20479 // A constant that can be used as an immediate value in a
20480 // data-processing instruction.
20481 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20482 break;
20483 } else {
20484 // A constant that can be used as an immediate value in a
20485 // data-processing instruction.
20486 if (ARM_AM::getSOImmVal(CVal) != -1)
20487 break;
20488 }
20489 return;

20491 case 'J':
20492 if (Subtarget->isThumb1Only()) {
20493 // This must be a constant between -255 and -1, for negated ADD
20494 // immediates. This can be used in GCC with an "n" modifier that
20495 // prints the negated value, for use with SUB instructions. It is
20496 // not useful otherwise but is implemented for compatibility.
20497 if (CVal >= -255 && CVal <= -1)
20498 break;
20499 } else {
20500 // This must be a constant between -4095 and 4095. This is suitable
20501 // for use as the immediate offset field in LDR and STR instructions
20502 // such as LDR r0,[r1,#offset].
20503 if (CVal >= -4095 && CVal <= 4095)
20504 break;
20505 }
20506 return;

20508 case 'K':
20509 if (Subtarget->isThumb1Only()) {
20510 // A 32-bit value where only one byte has a nonzero value. Exclude
20511 // zero to match GCC. This constraint is used by GCC internally for
20512 // constants that can be loaded with a move/shift combination.
20513 // It is not useful otherwise but is implemented for compatibility.
20514 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20515 break;
20516 } else if (Subtarget->isThumb2()) {
20517 // A constant whose bitwise inverse can be used as an immediate
20518 // value in a data-processing instruction. This can be used in GCC
20519 // with a "B" modifier that prints the inverted value, for use with
20520 // BIC and MVN instructions. It is not useful otherwise but is
20521 // implemented for compatibility.
20522 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20523 break;
20524 } else {
20525 // A constant whose bitwise inverse can be used as an immediate
20526 // value in a data-processing instruction. This can be used in GCC
20527 // with a "B" modifier that prints the inverted value, for use with
20528 // BIC and MVN instructions. It is not useful otherwise but is
20529 // implemented for compatibility.
20530 if (ARM_AM::getSOImmVal(~CVal) != -1)
20531 break;
20532 }
20533 return;

20535 case 'L':
20536 if (Subtarget->isThumb1Only()) {
20537 // This must be a constant between -7 and 7,
20538 // for 3-operand ADD/SUB immediate instructions.
20539 if (CVal >= -7 && CVal < 7)
20540 break;
20541 } else if (Subtarget->isThumb2()) {
20542 // A constant whose negation can be used as an immediate value in a
20543 // data-processing instruction. This can be used in GCC with an "n"
20544 // modifier that prints the negated value, for use with SUB
20545 // instructions. It is not useful otherwise but is implemented for
20546 // compatibility.
20547 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20548 break;
20549 } else {
20550 // A constant whose negation can be used as an immediate value in a
20551 // data-processing instruction. This can be used in GCC with an "n"
20552 // modifier that prints the negated value, for use with SUB
20553 // instructions. It is not useful otherwise but is implemented for
20554 // compatibility.
20555 if (ARM_AM::getSOImmVal(-CVal) != -1)
20556 break;
20557 }
20558 return;

20560 case 'M':
20561 if (Subtarget->isThumb1Only()) {
20562 // This must be a multiple of 4 between 0 and 1020, for
20563 // ADD sp + immediate.
20564 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20565 break;
20566 } else {
20567 // A power of two or a constant between 0 and 32. This is used in
20568 // GCC for the shift amount on shifted register operands, but it is
20569 // useful in general for any shift amounts.
20570 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20571 break;
20572 }
20573 return;

20575 case 'N':
20576 if (Subtarget->isThumb1Only()) {
20577 // This must be a constant between 0 and 31, for shift amounts.
20578 if (CVal >= 0 && CVal <= 31)
20579 break;
20580 }
20581 return;

20583 case 'O':
20584 if (Subtarget->isThumb1Only()) {
20585 // This must be a multiple of 4 between -508 and 508, for
20586 // ADD/SUB sp = sp + immediate.
20587 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20588 break;
20589 }
20590 return;
20591 }
 // Reaching here means the constant passed its constraint check.
20592 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20593 break;
20594 }

20596 if (Result.getNode()) {
20597 Ops.push_back(Result);
20598 return;
20599 }
20600 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20601}
20602
20603static RTLIB::Libcall getDivRemLibcall(
20604 const SDNode *N, MVT::SimpleValueType SVT) {
20605 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20606 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20607 "Unhandled Opcode in getDivRemLibcall");
20608 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20609 N->getOpcode() == ISD::SREM;
20610 RTLIB::Libcall LC;
20611 switch (SVT) {
20612 default: llvm_unreachable("Unexpected request for libcall!");
20613 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20614 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20615 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20616 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20617 }
20618 return LC;
20619}
20620
// getDivRemArgList - build the libcall argument list for a div/rem node.
// Each operand is passed with sign/zero extension matching the operation's
// signedness; on Windows the RTABI divmod helpers take (divisor, dividend),
// so the first two arguments are swapped there.
20622 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20623 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20624 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20625 "Unhandled Opcode in getDivRemArgList");
20626 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20627 N->getOpcode() == ISD::SREM;
20629 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20630 EVT ArgVT = N->getOperand(i).getValueType();
20631 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20632 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20633 Entry.IsSExt = isSigned;
20634 Entry.IsZExt = !isSigned;
20635 Args.push_back(Entry);
20636 }
20637 if (Subtarget->getTargetTriple().isOSWindows() && Args.size() >= 2)
20638 std::swap(Args[0], Args[1]);
20639 return Args;
20640}
20641
// LowerDivRem - lower SDIVREM/UDIVREM. Strategy, in order: expand i64
// division by a constant inline; use hardware SDIV/UDIV + MLS when the
// subtarget has a divider; otherwise emit the AEABI __aeabi_(u)idivmod-style
// libcall, which returns {div, rem} in registers.
20642SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20643 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20644 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20645 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20646 "Register-based DivRem lowering only");
20647 unsigned Opcode = Op->getOpcode();
20648 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20649 "Invalid opcode for Div/Rem lowering");
20650 bool isSigned = (Opcode == ISD::SDIVREM);
20651 EVT VT = Op->getValueType(0);
20652 SDLoc dl(Op);

 // i64 divide by a constant: expand to i32 pieces; Result holds the four
 // halves {div.lo, div.hi, rem.lo, rem.hi}.
20654 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20656 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20657 SDValue Res0 =
20658 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20659 SDValue Res1 =
20660 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20661 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20662 {Res0, Res1});
20663 }
20664 }

20666 Type *Ty = VT.getTypeForEVT(*DAG.getContext());

20668 // If the target has hardware divide, use divide + multiply + subtract:
20669 // div = a / b
20670 // rem = a - b * div
20671 // return {div, rem}
20672 // This should be lowered into UDIV/SDIV + MLS later on.
20673 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20674 : Subtarget->hasDivideInARMMode();
20675 if (hasDivide && Op->getValueType(0).isSimple() &&
20676 Op->getSimpleValueType(0) == MVT::i32) {
20677 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20678 const SDValue Dividend = Op->getOperand(0);
20679 const SDValue Divisor = Op->getOperand(1);
20680 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20681 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20682 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);

20684 SDValue Values[2] = {Div, Rem};
20685 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20686 }

 // Libcall path: select the divrem helper for this type and build the call.
20688 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20689 VT.getSimpleVT().SimpleTy);
20690 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);

20692 SDValue InChain = DAG.getEntryNode();

20695 DAG.getContext(),
20696 Subtarget);

20698 SDValue Callee =
20699 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));

 // The helper returns both quotient and remainder, as a two-field struct.
20701 Type *RetTy = StructType::get(Ty, Ty);

 // Windows requires an explicit divide-by-zero trap before the call.
20703 if (getTM().getTargetTriple().isOSWindows())
20704 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);

20706 TargetLowering::CallLoweringInfo CLI(DAG);
20707 CLI.setDebugLoc(dl)
20708 .setChain(InChain)
20709 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20710 Callee, std::move(Args))
20711 .setInRegister()
20712 .setSExtResult(isSigned)
20713 .setZExtResult(!isSigned);

20715 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20716 return CallInfo.first;
20717}
20718
20719// Lowers REM using divmod helpers
20720// see RTABI section 4.2/4.3
// Like LowerDivRem, but for SREM/UREM: calls the combined divmod helper and
// returns only the remainder (the second result operand).
20721SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20722 EVT VT = N->getValueType(0);

 // i64 remainder by a constant: expand inline to i32 pieces.
20724 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20726 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20727 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20728 Result[0], Result[1]);
20729 }

20731 // Build return types (div and rem)
20732 std::vector<Type*> RetTyParams;
20733 Type *RetTyElement;

20735 switch (VT.getSimpleVT().SimpleTy) {
20736 default: llvm_unreachable("Unexpected request for libcall!");
20737 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20738 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20739 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20740 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20741 }

 // The divmod helper returns a {div, rem} pair of the element type.
20743 RetTyParams.push_back(RetTyElement);
20744 RetTyParams.push_back(RetTyElement);
20745 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20746 Type *RetTy = StructType::get(*DAG.getContext(), ret);

20748 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20749 SimpleTy);
20750 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20751 SDValue InChain = DAG.getEntryNode();
20753 Subtarget);
20754 bool isSigned = N->getOpcode() == ISD::SREM;

20756 SDValue Callee =
20757 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));

 // Windows requires an explicit divide-by-zero trap before the call.
20759 if (getTM().getTargetTriple().isOSWindows())
20760 InChain = WinDBZCheckDenominator(DAG, N, InChain);

20762 // Lower call
20763 CallLoweringInfo CLI(DAG);
20764 CLI.setChain(InChain)
20765 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20766 Callee, std::move(Args))
20767 .setSExtResult(isSigned)
20768 .setZExtResult(!isSigned)
20769 .setDebugLoc(SDLoc(N));
20770 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);

20772 // Return second (rem) result operand (first contains div)
20773 SDNode *ResNode = CallResult.first.getNode();
20774 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20775 return ResNode->getOperand(1);
20776}
20777
// Lower DYNAMIC_STACKALLOC for Windows targets: either adjust SP directly
// (when stack probing is disabled) or route the allocation size (in 4-byte
// words, via R4) through the __chkstk helper so the guard pages are touched.
20778SDValue
20779ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20780 assert(getTM().getTargetTriple().isOSWindows() &&
20781 "unsupported target platform");
20782 SDLoc DL(Op);
20783
20784 // Get the inputs.
20785 SDValue Chain = Op.getOperand(0);
20786 SDValue Size = Op.getOperand(1);
20787
 // Fast path: with "no-stack-arg-probe" we may bump SP ourselves (and
 // re-align it if operand 2 carries an alignment request).
20789 "no-stack-arg-probe")) {
20790 MaybeAlign Align =
20791 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20792 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20793 Chain = SP.getValue(1);
20794 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20795 if (Align)
20796 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20797 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20798 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20799 SDValue Ops[2] = { SP, Chain };
20800 return DAG.getMergeValues(Ops, DL);
20801 }
20802
 // __chkstk takes the byte size divided by 4 (i.e. in words).
20803 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20804 DAG.getConstant(2, DL, MVT::i32));
20805
20806 SDValue Glue;
20807 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20808 Glue = Chain.getValue(1);
20809
20810 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20811 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20812
 // The helper adjusts SP itself; read the new value back as the result.
20813 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20814 Chain = NewSP.getValue(1);
20815
20816 SDValue Ops[2] = { NewSP, Chain };
20817 return DAG.getMergeValues(Ops, DL);
20818}
20819
// Custom-lower (STRICT_)FP_EXTEND: widen f16->f32->f64 one step at a time,
// using a hardware extend when the subtarget supports that step and a
// libcall otherwise. Handles the strict-FP chain alongside the value.
20820SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20821 bool IsStrict = Op->isStrictFPOpcode();
20822 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20823 const unsigned DstSz = Op.getValueType().getSizeInBits();
20824 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20825 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20826 "Unexpected type for custom-lowering FP_EXTEND");
20827
20828 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20829 "With both FP DP and 16, any FP conversion is legal!");
20830
20831 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20832 "With FP16, 16 to 32 conversion is legal!");
20833
20834 // Converting from 32 -> 64 is valid if we have FP64.
20835 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20836 // FIXME: Remove this when we have strict fp instruction selection patterns
20837 if (IsStrict) {
20838 SDLoc Loc(Op);
 // Re-emit as a non-strict extend and thread the incoming chain through.
20840 Loc, Op.getValueType(), SrcVal);
20841 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20842 }
20843 return Op;
20844 }
20845
20846 // Either we are converting from 16 -> 64, without FP16 and/or
20847 // FP.double-precision or without Armv8-fp. So we must do it in two
20848 // steps.
20849 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
20850 // without FP16. So we must do a function call.
20851 SDLoc Loc(Op);
20852 RTLIB::Libcall LC;
20853 MakeLibCallOptions CallOptions;
20854 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
 // Walk up one size class per iteration: 16->32, then 32->64 as needed.
20855 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20856 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20857 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20858 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20859 if (Supported) {
20860 if (IsStrict) {
20861 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20862 {DstVT, MVT::Other}, {Chain, SrcVal});
20863 Chain = SrcVal.getValue(1);
20864 } else {
20865 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20866 }
20867 } else {
20868 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20869 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20870 "Unexpected type for custom-lowering FP_EXTEND");
20871 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20872 Loc, Chain);
20873 }
20874 }
20875
20876 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20877}
20878
// Custom-lower (STRICT_)FP_ROUND: use the hardware 32->16 narrowing when
// FP16 is available, otherwise emit the appropriate FPROUND libcall.
20879SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20880 bool IsStrict = Op->isStrictFPOpcode();
20881
20882 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20883 EVT SrcVT = SrcVal.getValueType();
20884 EVT DstVT = Op.getValueType();
20885 const unsigned DstSz = Op.getValueType().getSizeInBits();
20886 const unsigned SrcSz = SrcVT.getSizeInBits();
20887 (void)DstSz;
20888 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20889 "Unexpected type for custom-lowering FP_ROUND");
20890
20891 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20892 "With both FP DP and 16, any FP conversion is legal!");
20893
20894 SDLoc Loc(Op);
20895
20896 // Instruction from 32 -> 16 if hasFP16 is valid
20897 if (SrcSz == 32 && Subtarget->hasFP16())
20898 return Op;
20899
20900 // Lib call from 32 -> 16 / 64 -> [32, 16]
20901 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20902 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20903 "Unexpected type for custom-lowering FP_ROUND");
20904 MakeLibCallOptions CallOptions;
20905 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
 // makeLibCall returns {result, out-chain}; the chain matters for strict FP.
20907 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20908 Loc, Chain);
20909 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20910}
20911
// Query hook: whether a constant offset can be folded into a global address
// node. Unconditionally false for ARM (see comment below).
20912bool
20914 // The ARM target isn't yet aware of offsets.
20915 return false;
20916}
20917
// Returns true if `v` is the inverse of a contiguous (shifted) bit mask,
// i.e. all zero bits of `v` are contiguous — the pattern BFC/BFI can clear.
// All-ones is rejected explicitly since ~v would have no set bits.
20919 if (v == 0xffffffff)
20920 return false;
20921
20922 // there can be 1's on either or both "outsides", all the "inside"
20923 // bits must be 0's
20924 return isShiftedMask_32(~v);
20925}
20926
20927/// isFPImmLegal - Returns true if the target can instruction select the
20928/// specified FP immediate natively. If false, the legalizer will
20929/// materialize the FP immediate as a load from a constant pool.
20931 bool ForCodeSize) const {
 // VMOV-immediate encodings require at least VFPv3.
20932 if (!Subtarget->hasVFP3Base())
20933 return false;
20934 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20935 return ARM_AM::getFP16Imm(Imm) != -1;
 // An f32 value exactly representable as an f16 immediate is also legal
 // when full FP16 is available.
20936 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20937 ARM_AM::getFP32FP16Imm(Imm) != -1)
20938 return true;
20939 if (VT == MVT::f32)
20940 return ARM_AM::getFP32Imm(Imm) != -1;
20941 if (VT == MVT::f64 && Subtarget->hasFP64())
20942 return ARM_AM::getFP64Imm(Imm) != -1;
20943 return false;
20944}
20945
20946/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20947/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20948/// specified in the intrinsic calls.
20951 MachineFunction &MF, unsigned Intrinsic) const {
20952 IntrinsicInfo Info;
20953 switch (Intrinsic) {
20954 case Intrinsic::arm_neon_vld1:
20955 case Intrinsic::arm_neon_vld2:
20956 case Intrinsic::arm_neon_vld3:
20957 case Intrinsic::arm_neon_vld4:
20958 case Intrinsic::arm_neon_vld2lane:
20959 case Intrinsic::arm_neon_vld3lane:
20960 case Intrinsic::arm_neon_vld4lane:
20961 case Intrinsic::arm_neon_vld2dup:
20962 case Intrinsic::arm_neon_vld3dup:
20963 case Intrinsic::arm_neon_vld4dup: {
20964 Info.opc = ISD::INTRINSIC_W_CHAIN;
20965 // Conservatively set memVT to the entire set of vectors loaded.
20966 auto &DL = I.getDataLayout();
 // Model the total footprint as NumElts x i64 (total bits / 64).
20967 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20968 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20969 Info.ptrVal = I.getArgOperand(0);
20970 Info.offset = 0;
 // These intrinsics carry their alignment as the trailing argument.
20971 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20972 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20973 // volatile loads with NEON intrinsics not supported
20974 Info.flags = MachineMemOperand::MOLoad;
20975 Infos.push_back(Info);
20976 return;
20977 }
20978 case Intrinsic::arm_neon_vld1x2:
20979 case Intrinsic::arm_neon_vld1x3:
20980 case Intrinsic::arm_neon_vld1x4: {
20981 Info.opc = ISD::INTRINSIC_W_CHAIN;
20982 // Conservatively set memVT to the entire set of vectors loaded.
20983 auto &DL = I.getDataLayout();
20984 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20985 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
 // Unlike plain vldN, vld1xN takes its pointer as the last argument and
 // the alignment comes from the parameter attribute, not an operand.
20986 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20987 Info.offset = 0;
20988 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20989 // volatile loads with NEON intrinsics not supported
20990 Info.flags = MachineMemOperand::MOLoad;
20991 Infos.push_back(Info);
20992 return;
20993 }
20994 case Intrinsic::arm_neon_vst1:
20995 case Intrinsic::arm_neon_vst2:
20996 case Intrinsic::arm_neon_vst3:
20997 case Intrinsic::arm_neon_vst4:
20998 case Intrinsic::arm_neon_vst2lane:
20999 case Intrinsic::arm_neon_vst3lane:
21000 case Intrinsic::arm_neon_vst4lane: {
21001 Info.opc = ISD::INTRINSIC_VOID;
21002 // Conservatively set memVT to the entire set of vectors stored.
21003 auto &DL = I.getDataLayout();
21004 unsigned NumElts = 0;
 // Sum the sizes of the vector value operands (args after the pointer);
 // stop at the first non-vector (the trailing alignment argument).
21005 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21006 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21007 if (!ArgTy->isVectorTy())
21008 break;
21009 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21010 }
21011 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21012 Info.ptrVal = I.getArgOperand(0);
21013 Info.offset = 0;
21014 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21015 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21016 // volatile stores with NEON intrinsics not supported
21017 Info.flags = MachineMemOperand::MOStore;
21018 Infos.push_back(Info);
21019 return;
21020 }
21021 case Intrinsic::arm_neon_vst1x2:
21022 case Intrinsic::arm_neon_vst1x3:
21023 case Intrinsic::arm_neon_vst1x4: {
21024 Info.opc = ISD::INTRINSIC_VOID;
21025 // Conservatively set memVT to the entire set of vectors stored.
21026 auto &DL = I.getDataLayout();
21027 unsigned NumElts = 0;
21028 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21029 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21030 if (!ArgTy->isVectorTy())
21031 break;
21032 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21033 }
21034 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21035 Info.ptrVal = I.getArgOperand(0);
21036 Info.offset = 0;
 // Alignment is attached to the pointer parameter (param 0).
21037 Info.align = I.getParamAlign(0).valueOrOne();
21038 // volatile stores with NEON intrinsics not supported
21039 Info.flags = MachineMemOperand::MOStore;
21040 Infos.push_back(Info);
21041 return;
21042 }
21043 case Intrinsic::arm_mve_vld2q:
21044 case Intrinsic::arm_mve_vld4q: {
21045 Info.opc = ISD::INTRINSIC_W_CHAIN;
21046 // Conservatively set memVT to the entire set of vectors loaded.
21047 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
21048 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
 // Factor q-registers of 128 bits each == Factor * 2 i64 elements.
21049 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21050 Info.ptrVal = I.getArgOperand(0);
21051 Info.offset = 0;
21052 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21053 // volatile loads with MVE intrinsics not supported
21054 Info.flags = MachineMemOperand::MOLoad;
21055 Infos.push_back(Info);
21056 return;
21057 }
21058 case Intrinsic::arm_mve_vst2q:
21059 case Intrinsic::arm_mve_vst4q: {
21060 Info.opc = ISD::INTRINSIC_VOID;
21061 // Conservatively set memVT to the entire set of vectors stored.
21062 Type *VecTy = I.getArgOperand(1)->getType();
21063 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21064 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21065 Info.ptrVal = I.getArgOperand(0);
21066 Info.offset = 0;
21067 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21068 // volatile stores with MVE intrinsics not supported
21069 Info.flags = MachineMemOperand::MOStore;
21070 Infos.push_back(Info);
21071 return;
21072 }
 // Gather/scatter forms have no single base pointer to record, so ptrVal
 // stays null and alignment is conservatively 1.
21073 case Intrinsic::arm_mve_vldr_gather_base:
21074 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21075 Info.opc = ISD::INTRINSIC_W_CHAIN;
21076 Info.ptrVal = nullptr;
21077 Info.memVT = MVT::getVT(I.getType());
21078 Info.align = Align(1);
21079 Info.flags |= MachineMemOperand::MOLoad;
21080 Infos.push_back(Info);
21081 return;
21082 }
21083 case Intrinsic::arm_mve_vldr_gather_base_wb:
21084 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21085 Info.opc = ISD::INTRINSIC_W_CHAIN;
21086 Info.ptrVal = nullptr;
 // Writeback forms return {data, updated-base}; the data is member 0.
21087 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21088 Info.align = Align(1);
21089 Info.flags |= MachineMemOperand::MOLoad;
21090 Infos.push_back(Info);
21091 return;
21092 }
21093 case Intrinsic::arm_mve_vldr_gather_offset:
21094 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21095 Info.opc = ISD::INTRINSIC_W_CHAIN;
21096 Info.ptrVal = nullptr;
21097 MVT DataVT = MVT::getVT(I.getType());
 // Argument 2 gives the in-memory element width in bits, which may be
 // narrower than the register element type.
21098 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21099 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21100 DataVT.getVectorNumElements());
21101 Info.align = Align(1);
21102 Info.flags |= MachineMemOperand::MOLoad;
21103 Infos.push_back(Info);
21104 return;
21105 }
21106 case Intrinsic::arm_mve_vstr_scatter_base:
21107 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21108 Info.opc = ISD::INTRINSIC_VOID;
21109 Info.ptrVal = nullptr;
21110 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21111 Info.align = Align(1);
21112 Info.flags |= MachineMemOperand::MOStore;
21113 Infos.push_back(Info);
21114 return;
21115 }
21116 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21117 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
 // W_CHAIN (not VOID) because the writeback form produces a value.
21118 Info.opc = ISD::INTRINSIC_W_CHAIN;
21119 Info.ptrVal = nullptr;
21120 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21121 Info.align = Align(1);
21122 Info.flags |= MachineMemOperand::MOStore;
21123 Infos.push_back(Info);
21124 return;
21125 }
21126 case Intrinsic::arm_mve_vstr_scatter_offset:
21127 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21128 Info.opc = ISD::INTRINSIC_VOID;
21129 Info.ptrVal = nullptr;
21130 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21131 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21132 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21133 DataVT.getVectorNumElements());
21134 Info.align = Align(1);
21135 Info.flags |= MachineMemOperand::MOStore;
21136 Infos.push_back(Info);
21137 return;
21138 }
21139 case Intrinsic::arm_ldaex:
21140 case Intrinsic::arm_ldrex: {
21141 auto &DL = I.getDataLayout();
 // The loaded type is carried by the elementtype parameter attribute.
21142 Type *ValTy = I.getParamElementType(0);
21143 Info.opc = ISD::INTRINSIC_W_CHAIN;
21144 Info.memVT = MVT::getVT(ValTy);
21145 Info.ptrVal = I.getArgOperand(0);
21146 Info.offset = 0;
21147 Info.align = DL.getABITypeAlign(ValTy);
21149 Infos.push_back(Info);
21150 return;
21151 }
21152 case Intrinsic::arm_stlex:
21153 case Intrinsic::arm_strex: {
21154 auto &DL = I.getDataLayout();
21155 Type *ValTy = I.getParamElementType(1);
21156 Info.opc = ISD::INTRINSIC_W_CHAIN;
21157 Info.memVT = MVT::getVT(ValTy);
21158 Info.ptrVal = I.getArgOperand(1);
21159 Info.offset = 0;
21160 Info.align = DL.getABITypeAlign(ValTy);
21162 Infos.push_back(Info);
21163 return;
21164 }
21165 case Intrinsic::arm_stlexd:
21166 case Intrinsic::arm_strexd:
21167 Info.opc = ISD::INTRINSIC_W_CHAIN;
21168 Info.memVT = MVT::i64;
21169 Info.ptrVal = I.getArgOperand(2);
21170 Info.offset = 0;
21171 Info.align = Align(8);
21173 Infos.push_back(Info);
21174 return;
21175
21176 case Intrinsic::arm_ldaexd:
21177 case Intrinsic::arm_ldrexd:
21178 Info.opc = ISD::INTRINSIC_W_CHAIN;
21179 Info.memVT = MVT::i64;
21180 Info.ptrVal = I.getArgOperand(0);
21181 Info.offset = 0;
21182 Info.align = Align(8);
21184 Infos.push_back(Info);
21185 return;
21186
21187 default:
21188 break;
21189 }
21190}
21191
21192/// Returns true if it is beneficial to convert a load of a constant
21193/// to just the constant itself.
21195 Type *Ty) const {
21196 assert(Ty->isIntegerTy());
21197
 // Only integers that fit in a single 32-bit GPR are worth materializing
 // as immediates; anything wider stays a constant-pool load.
21198 unsigned Bits = Ty->getPrimitiveSizeInBits();
21199 if (Bits == 0 || Bits > 32)
21200 return false;
21201 return true;
21202}
21203
// A subvector extract is cheap only at the vector's low or exact-high half
// boundary (index 0 or the result's element count).
21205 unsigned Index) const {
21207 return false;
21208
21209 return (Index == 0 || Index == ResVT.getVectorNumElements());
21210}
21211
// Emit a data memory barrier with the requested domain, falling back to the
// CP15 barrier (mcr p15, 0, r0, c7, c10, 5) on ARMv6 cores without DMB.
21213 ARM_MB::MemBOpt Domain) const {
21214 // First, if the target has no DMB, see what fallback we can use.
21215 if (!Subtarget->hasDataBarrier()) {
21216 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21217 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21218 // here.
21219 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
 // Operands encode: mcr p15, 0, <r>, c7, c10, 5 (CP15DMB).
21220 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21221 Builder.getInt32(0), Builder.getInt32(7),
21222 Builder.getInt32(10), Builder.getInt32(5)};
21223 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21224 } else {
21225 // Instead of using barriers, atomic accesses on these subtargets use
21226 // libcalls.
21227 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21228 }
21229 } else {
21230 // Only a full system barrier exists in the M-class architectures.
21231 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21232 Constant *CDomain = Builder.getInt32(Domain);
21233 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21234 }
21235}
21236
21237// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
// Emit the barrier required *before* an atomic access for the given
// ordering. Returns nullptr when no leading fence is needed.
// NOTE(review): the switch's case labels (AtomicOrdering values) were lost
// in this rendering; bodies below follow the usual acquire/release mapping.
21239 Instruction *Inst,
21240 AtomicOrdering Ord) const {
21241 switch (Ord) {
21244 llvm_unreachable("Invalid fence: unordered/non-atomic");
21247 return nullptr; // Nothing to do
 // Only stores need a leading release barrier.
21249 if (!Inst->hasAtomicStore())
21250 return nullptr; // Nothing to do
21251 [[fallthrough]];
 // ISHST (store-store) suffices on cores that prefer it; otherwise ISH.
21254 if (Subtarget->preferISHSTBarriers())
21255 return makeDMB(Builder, ARM_MB::ISHST);
21256 // FIXME: add a comment with a link to documentation justifying this.
21257 else
21258 return makeDMB(Builder, ARM_MB::ISH);
21259 }
21260 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21261}
21262
// Emit the barrier required *after* an atomic access for the given
// ordering (acquire side); nullptr when none is needed. Case labels for
// the switch were lost in this rendering.
21264 Instruction *Inst,
21265 AtomicOrdering Ord) const {
21266 switch (Ord) {
21269 llvm_unreachable("Invalid fence: unordered/not-atomic");
21272 return nullptr; // Nothing to do
21276 return makeDMB(Builder, ARM_MB::ISH);
21277 }
21278 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21279}
21280
21281// Loads and stores less than 64-bits are already atomic; ones above that
21282// are doomed anyway, so defer to the default libcall and blame the OS when
21283// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21284// anything for those.
21287 bool has64BitAtomicStore;
 // 64-bit exclusive stores (strexd) exist on A/R-profile v6+ (ARM mode)
 // and v7+ (Thumb); never on M-profile.
21288 if (Subtarget->isMClass())
21289 has64BitAtomicStore = false;
21290 else if (Subtarget->isThumb())
21291 has64BitAtomicStore = Subtarget->hasV7Ops();
21292 else
21293 has64BitAtomicStore = Subtarget->hasV6Ops();
21294
21295 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21296 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21298}
21299
21300// Loads and stores less than 64-bits are already atomic; ones above that
21301// are doomed anyway, so defer to the default libcall and blame the OS when
21302// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21303// anything for those.
21304// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21305// guarantee, see DDI0406C ARM architecture reference manual,
21306// sections A8.8.72-74 LDRD)
21309 bool has64BitAtomicLoad;
 // Mirrors shouldExpandAtomicStoreInIR: ldrexd availability by profile.
21310 if (Subtarget->isMClass())
21311 has64BitAtomicLoad = false;
21312 else if (Subtarget->isThumb())
21313 has64BitAtomicLoad = Subtarget->hasV7Ops();
21314 else
21315 has64BitAtomicLoad = Subtarget->hasV6Ops();
21316
21317 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21318 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21320}
21321
21322// For the real atomic operations, we have ldrex/strex up to 32 bits,
21323// and up to 64 bits on the non-M profiles
21326 if (AI->isFloatingPointOperation())
21328
21329 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21330 bool hasAtomicRMW;
 // ldrex/strex availability: v8-M baseline on M-profile, v7 in Thumb,
 // v6 in ARM mode.
21331 if (Subtarget->isMClass())
21332 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21333 else if (Subtarget->isThumb())
21334 hasAtomicRMW = Subtarget->hasV7Ops();
21335 else
21336 hasAtomicRMW = Subtarget->hasV6Ops();
21337 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21338 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21339 // implement atomicrmw without spilling. If the target address is also on
21340 // the stack and close enough to the spill slot, this can lead to a
21341 // situation where the monitor always gets cleared and the atomic operation
21342 // can never succeed. So at -O0 lower this operation to a CAS loop.
21343 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21346 }
21348}
21349
21350// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21351// bits, and up to 64 bits on the non-M profiles.
21354 const AtomicCmpXchgInst *AI) const {
21355 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21356 // implement cmpxchg without spilling. If the address being exchanged is also
21357 // on the stack and close enough to the spill slot, this can lead to a
21358 // situation where the monitor always gets cleared and the atomic operation
21359 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
 // Operand 1 is the compare value; its width is the access width.
21360 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21361 bool HasAtomicCmpXchg;
21362 if (Subtarget->isMClass())
21363 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21364 else if (Subtarget->isThumb())
21365 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21366 else
21367 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21368 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21369 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21372}
21373
// Hook: whether AtomicExpand should surround atomics with explicit fences;
// simply forwards the target's InsertFencesForAtomic setting.
21375 const Instruction *I) const {
21376 return InsertFencesForAtomic;
21377}
21378
// Position-independent-data modes cannot use the LOAD_STACK_GUARD node.
21380 // ROPI/RWPI are not supported currently.
21381 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21382}
21383
// Declare the MSVC CRT stack-protector symbols (__security_cookie and the
// check function) in the module when both libcalls are available.
21385 Module &M, const LibcallLoweringInfo &Libcalls) const {
21386 // MSVC CRT provides functionalities for stack protection.
21387 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21388 Libcalls.getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21389
21390 RTLIB::LibcallImpl SecurityCookieVar =
21391 Libcalls.getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21392 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21393 SecurityCookieVar != RTLIB::Unsupported) {
21394 // MSVC CRT has a global variable holding security cookie.
21395 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21396 PointerType::getUnqual(M.getContext()));
21397
21398 // MSVC CRT has a function to validate security cookie.
21399 FunctionCallee SecurityCheckCookie =
21400 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21401 Type::getVoidTy(M.getContext()),
21402 PointerType::getUnqual(M.getContext()));
 // The cookie is passed in a register per the MSVC ARM convention.
21403 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21404 F->addParamAttr(0, Attribute::AttrKind::InReg);
21405 }
21406
21408}
21409
// Returns true (with Cost = 0) when a store of a single extracted lane can
// be combined into one store-lane instruction: NEON only, integer vectors
// only, constant lane index, and a D- or Q-register-sized vector.
21411 unsigned &Cost) const {
21412 // If we do not have NEON, vector types are not natively supported.
21413 if (!Subtarget->hasNEON())
21414 return false;
21415
21416 // Floating point values and vector values map to the same register file.
21417 // Therefore, although we could do a store extract of a vector type, this is
21418 // better to leave at float as we have more freedom in the addressing mode for
21419 // those.
21420 if (VectorTy->isFPOrFPVectorTy())
21421 return false;
21422
21423 // If the index is unknown at compile time, this is very expensive to lower
21424 // and it is not possible to combine the store with the extract.
21425 if (!isa<ConstantInt>(Idx))
21426 return false;
21427
21428 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21429 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21430 // We can do a store + vector extract on any vector that fits perfectly in a D
21431 // or Q register.
21432 if (BitWidth == 64 || BitWidth == 128) {
21433 Cost = 0;
21434 return true;
21435 }
21436 return false;
21437}
21438
// Poison-analysis hook: the ARM immediate or/bic nodes never introduce
// undef/poison; everything else defers to the generic implementation.
21440 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21441 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21442 unsigned Opcode = Op.getOpcode();
21443 switch (Opcode) {
21444 case ARMISD::VORRIMM:
21445 case ARMISD::VBICIMM:
21446 return false;
21447 }
21449 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21450}
21451
 // cttz is cheap when CLZ exists (v5T+) outside Thumb1.
21453 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21454}
21455
 // ctlz maps directly to CLZ (v5T+) outside Thumb1.
21457 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21458}
21459
// Folding (and x, mask); cmp 0 is beneficial only when the mask can be
// encoded as a modified immediate (TST/ANDS operand) on v7+.
21461 const Instruction &AndI) const {
21462 if (!Subtarget->hasV7Ops())
21463 return false;
21464
21465 // Sink the `and` instruction only if the mask would fit into a modified
21466 // immediate operand.
21468 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21469 return false;
21470 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21471 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21472 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21473}
21474
// Under minsize (except on Windows), prefer the compact strategy for wide
// shifts; otherwise defer to the generic TargetLowering choice.
21477 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21478 if (Subtarget->hasMinSize() && !getTM().getTargetTriple().isOSWindows())
21481 ExpansionFactor);
21482}
21483
// Emit an exclusive load (ldrex/ldaex) of ValueTy from Addr. 64-bit values
// use ldrexd/ldaexd, which return {i32, i32} pairs that must be recombined.
21485 Value *Addr,
21486 AtomicOrdering Ord) const {
21487 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21488 bool IsAcquire = isAcquireOrStronger(Ord);
21489
21490 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21491 // intrinsic must return {i32, i32} and we have to recombine them into a
21492 // single i64 here.
21493 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21495 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21496
21497 Value *LoHi =
21498 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21499
21500 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21501 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
 // Big-endian targets keep the halves in the opposite order.
21502 if (!Subtarget->isLittle())
21503 std::swap (Lo, Hi);
21504 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21505 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21506 return Builder.CreateOr(
21507 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21508 }
21509
21510 Type *Tys[] = { Addr->getType() };
21511 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21512 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21513
 // Record the accessed type via the elementtype attribute for isel.
21514 CI->addParamAttr(
21515 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21516 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21517}
21518
// Clear the exclusive monitor (clrex) on the no-store path of a cmpxchg so
// a dangling ldrex reservation is not left open; needs v7+.
21520 IRBuilderBase &Builder) const {
21521 if (!Subtarget->hasV7Ops())
21522 return;
21523 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21524}
21525
// Emit an exclusive store (strex/stlex) of Val to Addr, returning the i32
// success flag. 64-bit values are split into two i32 halves for strexd.
21527 Value *Val, Value *Addr,
21528 AtomicOrdering Ord) const {
21529 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21530 bool IsRelease = isReleaseOrStronger(Ord);
21531
21532 // Since the intrinsics must have legal type, the i64 intrinsics take two
21533 // parameters: "i32, i32". We must marshal Val into the appropriate form
21534 // before the call.
21535 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21537 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21538 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21539
21540 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21541 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
 // Big-endian targets pass the halves in the opposite order.
21542 if (!Subtarget->isLittle())
21543 std::swap(Lo, Hi);
21544 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21545 }
21546
21547 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21548 Type *Tys[] = { Addr->getType() };
21550
21551 CallInst *CI = Builder.CreateCall(
21552 Strex, {Builder.CreateZExtOrBitCast(
21553 Val, Strex->getFunctionType()->getParamType(0)),
21554 Addr});
 // Record the stored type on the pointer argument for isel.
21555 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21556 Val->getType()));
21557 return CI;
21558}
21559
21560
 // M-class cores still benefit from loop alignment even when optimizing
 // for size.
21562 return Subtarget->isMClass();
21563}
21564
21565/// A helper function for determining the number of interleaved accesses we
21566/// will generate when lowering accesses of the given type.
21567unsigned
21569 const DataLayout &DL) const {
 // One access per 128 bits, rounding up.
21570 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21571}
21572
// Returns true when an interleaved (vldN/vstN or MVE vld2q/vld4q) access of
// VecTy with the given Factor and Alignment can be lowered natively.
21574 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21575 const DataLayout &DL) const {
21576
21577 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21578 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21579
21580 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21581 return false;
21582
21583 // Ensure the vector doesn't have f16 elements. Even though we could do an
21584 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21585 // f32.
21586 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21587 return false;
 // MVE has vld2q/vld4q but no 3-way interleave instruction.
21588 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21589 return false;
21590
21591 // Ensure the number of vector elements is greater than 1.
21592 if (VecTy->getNumElements() < 2)
21593 return false;
21594
21595 // Ensure the element type is legal.
21596 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21597 return false;
21598 // And the alignment if high enough under MVE.
21599 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21600 return false;
21601
21602 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21603 // 128 will be split into multiple interleaved accesses.
21604 if (Subtarget->hasNEON() && VecSize == 64)
21605 return true;
21606 return VecSize % 128 == 0;
21607}
21608
 // NEON supports vld2/vld3/vld4 (factor up to 4); MVE's limit is handled
 // in the elided branch below.
21610 if (Subtarget->hasNEON())
21611 return 4;
21612 if (Subtarget->hasMVEIntegerOps())
21615}
21616
21617/// Lower an interleaved load into a vldN intrinsic.
21618///
21619/// E.g. Lower an interleaved load (Factor = 2):
21620/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21621/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21622/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21623///
21624/// Into:
21625/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21626/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21627/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
 // NOTE(review): the defining signature line (doxygen line 21628, presumably
 // "bool ARMTargetLowering::lowerInterleavedLoad(") was lost in extraction.
21629 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21630 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21631 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21632 "Invalid interleave factor");
21633 assert(!Shuffles.empty() && "Empty shufflevector input");
21634 assert(Shuffles.size() == Indices.size() &&
21635 "Unmatched number of shufflevectors and indices");
21636
 // Only ordinary LoadInsts are handled here; any other kind of interleaved
 // access bails out so that generic lowering handles it.
21637 auto *LI = dyn_cast<LoadInst>(Load);
21638 if (!LI)
21639 return false;
 // Plain loads must arrive without a lane mask and with a full gap mask.
21640 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21641
21642 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21643 Type *EltTy = VecTy->getElementType();
21644
21645 const DataLayout &DL = LI->getDataLayout();
21646 Align Alignment = LI->getAlign();
21647
21648 // Skip if we do not have NEON and skip illegal vector types. We can
21649 // "legalize" wide vector types into multiple interleaved accesses as long as
21650 // the vector types are divisible by 128.
21651 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21652 return false;
21653
 // Number of vldN operations needed once the wide vector is split into
 // legal-width pieces.
21654 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21655
21656 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21657 // load integer vectors first and then convert to pointer vectors.
21658 if (EltTy->isPointerTy())
21659 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21660
21661 IRBuilder<> Builder(LI);
21662
21663 // The base address of the load.
21664 Value *BaseAddr = LI->getPointerOperand();
21665
21666 if (NumLoads > 1) {
21667 // If we're going to generate more than one load, reset the sub-vector type
21668 // to something legal.
21669 VecTy = FixedVectorType::get(VecTy->getElementType(),
21670 VecTy->getNumElements() / NumLoads);
21671 }
21672
21673 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21674
 // Emit a single vldN (NEON) or vld2q/vld4q (MVE) intrinsic call reading one
 // legal-width sub-vector starting at BaseAddr.
21675 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21676 if (Subtarget->hasNEON()) {
21677 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21678 Type *Tys[] = {VecTy, PtrTy};
21679 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21680 Intrinsic::arm_neon_vld3,
21681 Intrinsic::arm_neon_vld4};
21682
 // NOTE(review): the declaration of Ops (doxygen line 21683, presumably a
 // SmallVector<Value *, N>) was lost in extraction.
21684 Ops.push_back(BaseAddr);
 // NEON vldN takes the alignment as an explicit i32 trailing operand.
21685 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21686
21687 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21688 /*FMFSource=*/nullptr, "vldN");
21689 } else {
21690 assert((Factor == 2 || Factor == 4) &&
21691 "expected interleave factor of 2 or 4 for MVE");
21692 Intrinsic::ID LoadInts =
21693 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21694 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21695 Type *Tys[] = {VecTy, PtrTy};
21696
 // NOTE(review): the declaration of Ops (doxygen line 21697) was lost in
 // extraction.
21698 Ops.push_back(BaseAddr);
21699 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21700 "vldN");
21701 }
21702 };
21703
21704 // Holds sub-vectors extracted from the load intrinsic return values. The
21705 // sub-vectors are associated with the shufflevector instructions they will
21706 // replace.
 // NOTE(review): the declaration of the SubVecs map (doxygen line 21707,
 // presumably a DenseMap from ShuffleVectorInst* to a vector of Value*) was
 // lost in extraction.
21708
21709 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21710 // If we're generating more than one load, compute the base address of
21711 // subsequent loads as an offset from the previous.
21712 if (LoadCount > 0)
21713 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21714 VecTy->getNumElements() * Factor);
21715
21716 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21717
21718 // Replace uses of each shufflevector with the corresponding vector loaded
21719 // by ldN.
21720 for (unsigned i = 0; i < Shuffles.size(); i++) {
21721 ShuffleVectorInst *SV = Shuffles[i];
21722 unsigned Index = Indices[i];
21723
21724 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21725
21726 // Convert the integer vector to pointer vector if the element is pointer.
21727 if (EltTy->isPointerTy())
21728 SubVec = Builder.CreateIntToPtr(
21729 SubVec,
 // NOTE(review): the destination pointer-vector type argument of
 // CreateIntToPtr (doxygen line 21730) was lost in extraction.
21731
21732 SubVecs[SV].push_back(SubVec);
21733 }
21734 }
21735
21736 // Replace uses of the shufflevector instructions with the sub-vectors
21737 // returned by the load intrinsic. If a shufflevector instruction is
21738 // associated with more than one sub-vector, those sub-vectors will be
21739 // concatenated into a single wide vector.
21740 for (ShuffleVectorInst *SVI : Shuffles) {
21741 auto &SubVec = SubVecs[SVI];
21742 auto *WideVec =
21743 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21744 SVI->replaceAllUsesWith(WideVec);
21745 }
21746
21747 return true;
21748}
21749
21750/// Lower an interleaved store into a vstN intrinsic.
21751///
21752/// E.g. Lower an interleaved store (Factor = 3):
21753/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21754/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21755/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21756///
21757/// Into:
21758/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21759/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21760/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21761/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21762///
21763/// Note that the new shufflevectors will be removed and we'll only generate one
21764/// vst3 instruction in CodeGen.
21765///
21766/// Example for a more general valid mask (Factor 3). Lower:
21767/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21768/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21769/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21770///
21771/// Into:
21772/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21773/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21774/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21775/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
 // NOTE(review): the defining signature line (doxygen line 21776, presumably
 // "bool ARMTargetLowering::lowerInterleavedStore(Instruction *Store,") was
 // lost in extraction.
21777 Value *LaneMask,
21778 ShuffleVectorInst *SVI,
21779 unsigned Factor,
21780 const APInt &GapMask) const {
21781 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21782 "Invalid interleave factor");
 // Only ordinary StoreInsts are handled here; anything else bails out so
 // generic lowering handles it.
21783 auto *SI = dyn_cast<StoreInst>(Store);
21784 if (!SI)
21785 return false;
21786 assert(!LaneMask && GapMask.popcount() == Factor &&
21787 "Unexpected mask on store");
21788
21789 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21790 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21791
 // LaneLen is the number of elements stored per interleaved field.
21792 unsigned LaneLen = VecTy->getNumElements() / Factor;
21793 Type *EltTy = VecTy->getElementType();
21794 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21795
21796 const DataLayout &DL = SI->getDataLayout();
21797 Align Alignment = SI->getAlign();
21798
21799 // Skip if we do not have NEON and skip illegal vector types. We can
21800 // "legalize" wide vector types into multiple interleaved accesses as long as
21801 // the vector types are divisible by 128.
21802 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21803 return false;
21804
21805 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21806
21807 Value *Op0 = SVI->getOperand(0);
21808 Value *Op1 = SVI->getOperand(1);
21809 IRBuilder<> Builder(SI);
21810
21811 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21812 // vectors to integer vectors.
21813 if (EltTy->isPointerTy()) {
21814 Type *IntTy = DL.getIntPtrType(EltTy);
21815
21816 // Convert to the corresponding integer vector.
21817 auto *IntVecTy =
 // NOTE(review): the initializer of IntVecTy (doxygen line 21818, a
 // FixedVectorType::get call) was lost in extraction.
21819 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21820 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21821
21822 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21823 }
21824
21825 // The base address of the store.
21826 Value *BaseAddr = SI->getPointerOperand();
21827
21828 if (NumStores > 1) {
21829 // If we're going to generate more than one store, reset the lane length
21830 // and sub-vector type to something legal.
21831 LaneLen /= NumStores;
21832 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21833 }
21834
21835 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21836
21837 auto Mask = SVI->getShuffleMask();
21838
 // Emit one vstN (NEON) call with all fields, or for MVE one vst2q/vst4q
 // call per field index, for a single legal-width group of sub-vectors.
21839 auto createStoreIntrinsic = [&](Value *BaseAddr,
21840 SmallVectorImpl<Value *> &Shuffles) {
21841 if (Subtarget->hasNEON()) {
21842 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21843 Intrinsic::arm_neon_vst3,
21844 Intrinsic::arm_neon_vst4};
21845 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21846 Type *Tys[] = {PtrTy, SubVecTy};
21847
 // NOTE(review): the declaration of Ops (doxygen line 21848, presumably a
 // SmallVector<Value *, N>) was lost in extraction.
21849 Ops.push_back(BaseAddr);
21850 append_range(Ops, Shuffles);
 // NEON vstN takes the alignment as a trailing i32 operand.
21851 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21852 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21853 } else {
21854 assert((Factor == 2 || Factor == 4) &&
21855 "expected interleave factor of 2 or 4 for MVE");
21856 Intrinsic::ID StoreInts =
21857 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21858 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21859 Type *Tys[] = {PtrTy, SubVecTy};
21860
 // NOTE(review): the declaration of Ops (doxygen line 21861) was lost in
 // extraction.
21862 Ops.push_back(BaseAddr);
21863 append_range(Ops, Shuffles);
 // MVE vstNq takes the field index as its final operand; emit one call
 // per field, reusing the same leading operands.
21864 for (unsigned F = 0; F < Factor; F++) {
21865 Ops.push_back(Builder.getInt32(F));
21866 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21867 Ops.pop_back();
21868 }
21869 }
21870 };
21871
21872 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21873 // If we generating more than one store, we compute the base address of
21874 // subsequent stores as an offset from the previous.
21875 if (StoreCount > 0)
21876 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21877 BaseAddr, LaneLen * Factor);
21878
21879 SmallVector<Value *, 4> Shuffles;
21880
21881 // Split the shufflevector operands into sub vectors for the new vstN call.
21882 for (unsigned i = 0; i < Factor; i++) {
21883 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21884 if (Mask[IdxI] >= 0) {
21885 Shuffles.push_back(Builder.CreateShuffleVector(
21886 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21887 } else {
 // First mask element for this field is undef: scan later lanes of the
 // same field for a defined element to derive a starting index.
21888 unsigned StartMask = 0;
21889 for (unsigned j = 1; j < LaneLen; j++) {
21890 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21891 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21892 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21893 break;
21894 }
21895 }
21896 // Note: If all elements in a chunk are undefs, StartMask=0!
21897 // Note: Filling undef gaps with random elements is ok, since
21898 // those elements were being written anyway (with undefs).
21899 // In the case of all undefs we're defaulting to using elems from 0
21900 // Note: StartMask cannot be negative, it's checked in
21901 // isReInterleaveMask
21902 Shuffles.push_back(Builder.CreateShuffleVector(
21903 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21904 }
21905 }
21906
21907 createStoreIntrinsic(BaseAddr, Shuffles);
21908 }
21909 return true;
21910}
21911
21919
 // Recursively classify Ty as an AAPCS-VFP homogeneous aggregate: every leaf
 // must share a single base type (float, double, or a uniformly 64-bit or
 // 128-bit vector), accumulating the total leaf count into Members.
 // Returns true only for 1..4 members of a consistent base type.
 // NOTE(review): the function header (doxygen line 21920, presumably
 // "static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,") was lost
 // in extraction, along with the preceding base-type enum definition.
21921 uint64_t &Members) {
21922 if (auto *ST = dyn_cast<StructType>(Ty)) {
 // Struct: every field must itself be homogeneous with the same base.
21923 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21924 uint64_t SubMembers = 0;
21925 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21926 return false;
21927 Members += SubMembers;
21928 }
21929 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
 // Array: element type must be homogeneous; count scales by array length.
21930 uint64_t SubMembers = 0;
21931 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21932 return false;
21933 Members += SubMembers * AT->getNumElements();
21934 } else if (Ty->isFloatTy()) {
21935 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21936 return false;
21937 Members = 1;
21938 Base = HA_FLOAT;
21939 } else if (Ty->isDoubleTy()) {
21940 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21941 return false;
21942 Members = 1;
21943 Base = HA_DOUBLE;
21944 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
 // Vectors: all must be uniformly 64-bit or uniformly 128-bit wide, and
 // cannot mix with scalar float/double bases.
21945 Members = 1;
21946 switch (Base) {
21947 case HA_FLOAT:
21948 case HA_DOUBLE:
21949 return false;
21950 case HA_VECT64:
21951 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21952 case HA_VECT128:
21953 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21954 case HA_UNKNOWN:
21955 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21956 case 64:
21957 Base = HA_VECT64;
21958 return true;
21959 case 128:
21960 Base = HA_VECT128;
21961 return true;
21962 default:
21963 return false;
21964 }
21965 }
21966 }
21967
 // AAPCS limits a homogeneous aggregate to at most four members.
21968 return (Members > 0 && Members <= 4);
21969}
21970
21971/// Return the correct alignment for the current calling convention.
 // NOTE(review): the signature line (doxygen line 21972, presumably
 // "Align ARMTargetLowering::getABIAlignmentForCallingConv(") was lost in
 // extraction.
21973 Type *ArgTy, const DataLayout &DL) const {
21974 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
 // Non-vector arguments keep their natural ABI alignment.
21975 if (!ArgTy->isVectorTy())
21976 return ABITypeAlign;
21977
21978 // Avoid over-aligning vector parameters. It would require realigning the
21979 // stack and waste space for no real benefit.
21980 MaybeAlign StackAlign = DL.getStackAlignment();
21981 assert(StackAlign && "data layout string is missing stack alignment");
 // Cap the vector's alignment at the stack alignment.
21982 return std::min(ABITypeAlign, *StackAlign);
21983}
21984
21985/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21986/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21987/// passing according to AAPCS rules.
 // NOTE(review): the signature line (doxygen line 21988, presumably
 // "bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(") was
 // lost in extraction.
21989 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21990 const DataLayout &DL) const {
 // Homogeneous aggregates only matter under the AAPCS-VFP convention.
21991 if (getEffectiveCallingConv(CallConv, isVarArg) !=
 // NOTE(review): the right-hand side of this comparison (doxygen line 21992,
 // presumably CallingConv::ARM_AAPCS_VFP) was lost in extraction.
21993 return false;
21994
 // NOTE(review): the declaration of Base (doxygen line 21995, presumably the
 // homogeneous-aggregate base type initialized to HA_UNKNOWN) was lost in
 // extraction.
21996 uint64_t Members = 0;
21997 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21998 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21999
 // Integer arrays also need consecutive registers per the doc comment above.
22000 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
22001 return IsHA || IsIntArray;
22002}
22003
 // NOTE(review): the signature line (doxygen line 22004, presumably
 // "Register ARMTargetLowering::getExceptionPointerRegister(") was lost in
 // extraction.
22005 const Constant *PersonalityFn) const {
22006 // Platforms which do not use SjLj EH may return values in these registers
22007 // via the personality function.
 // NOTE(review): the definition of EM (doxygen line 22008, presumably the
 // exception-handling model queried from the target's MCAsmInfo) was lost in
 // extraction.
22009 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
22010}
22011
 // NOTE(review): the signature line (doxygen line 22012, presumably
 // "Register ARMTargetLowering::getExceptionSelectorRegister(") was lost in
 // extraction.
22013 const Constant *PersonalityFn) const {
22014 // Platforms which do not use SjLj EH may return values in these registers
22015 // via the personality function.
 // NOTE(review): the definition of EM (doxygen line 22016, presumably the
 // exception-handling model queried from the target's MCAsmInfo) was lost in
 // extraction.
22017 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
22018}
22019
22020void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
22021 // Update IsSplitCSR in ARMFunctionInfo.
22022 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
22023 AFI->setIsSplitCSR(true);
22024}
22025
22026void ARMTargetLowering::insertCopiesSplitCSR(
22027 MachineBasicBlock *Entry,
22028 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
22029 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
22030 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
22031 if (!IStart)
22032 return;
22033
22034 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22035 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
22036 MachineBasicBlock::iterator MBBI = Entry->begin();
22037 for (const MCPhysReg *I = IStart; *I; ++I) {
22038 const TargetRegisterClass *RC = nullptr;
22039 if (ARM::GPRRegClass.contains(*I))
22040 RC = &ARM::GPRRegClass;
22041 else if (ARM::DPRRegClass.contains(*I))
22042 RC = &ARM::DPRRegClass;
22043 else
22044 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
22045
22046 Register NewVR = MRI->createVirtualRegister(RC);
22047 // Create copy from CSR to a virtual register.
22048 // FIXME: this currently does not emit CFI pseudo-instructions, it works
22049 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
22050 // nounwind. If we want to generalize this later, we may need to emit
22051 // CFI pseudo-instructions.
22052 assert(Entry->getParent()->getFunction().hasFnAttribute(
22053 Attribute::NoUnwind) &&
22054 "Function should be nounwind in insertCopiesSplitCSR!");
22055 Entry->addLiveIn(*I);
22056 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
22057 .addReg(*I);
22058
22059 // Insert the copy-back instructions right before the terminator.
22060 for (auto *Exit : Exits)
22061 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
22062 TII->get(TargetOpcode::COPY), *I)
22063 .addReg(NewVR);
22064 }
22065}
22066
22071
 // NOTE(review): the enclosing function's signature (doxygen line 22072) was
 // lost in extraction; the visible body simply reports whether the subtarget
 // provides MVE integer operations.
22073 return Subtarget->hasMVEIntegerOps();
22074}
22075
 // NOTE(review): the function signature (doxygen lines 22076-22077, presumably
 // "bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
 //     ComplexDeinterleavingOperation Operation, Type *Ty) const {") was lost
 // in extraction.
22078 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22079 if (!VTy)
22080 return false;
22081
22082 auto *ScalarTy = VTy->getScalarType();
22083 unsigned NumElements = VTy->getNumElements();
22084
 // Require a power-of-two total width of at least 128 bits (an MVE vector
 // register, or a multiple thereof).
22085 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22086 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22087 return false;
22088
22089 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22090 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22091 return Subtarget->hasMVEFloatOps();
22092
 // NOTE(review): the guard condition preceding this "return false;" (doxygen
 // line 22093, presumably restricting the integer path to a particular
 // Operation kind) was lost in extraction.
22094 return false;
22095
 // Integer complex operations are available for 8/16/32-bit elements with
 // MVE integer support.
22096 return Subtarget->hasMVEIntegerOps() &&
22097 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22098 ScalarTy->isIntegerTy(32));
22099}
22100
 // NOTE(review): the signature line (doxygen line 22101, presumably
 // "ArrayRef<MCPhysReg> ARMTargetLowering::getRoundingControlRegisters()
 // const {") was lost in extraction.
 // The rounding-mode field of FPSCR is the only rounding-control register
 // reported for ARM.
22102 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
22103 return RCRegs;
22104}
22105
 // NOTE(review): the first signature lines (doxygen lines 22106-22107,
 // presumably "Value *ARMTargetLowering::createComplexDeinterleavingIR(
 //     IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,") were
 // lost in extraction.
22108 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22109 Value *Accumulator) const {
22110
 // NOTE(review): the definition of Ty (doxygen line 22111, presumably the
 // FixedVectorType of InputA) was lost in extraction.
22112
22113 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22114
22115 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22116
 // Vectors wider than one 128-bit MVE register are split in half, each half
 // lowered recursively, and the results re-joined with a shuffle.
22117 if (TyWidth > 128) {
22118 int Stride = Ty->getNumElements() / 2;
22119 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22120 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22121 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22122 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22123
22124 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22125 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22126 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22127 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22128 Value *LowerSplitAcc = nullptr;
22129 Value *UpperSplitAcc = nullptr;
22130
22131 if (Accumulator) {
22132 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22133 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22134 }
22135
22136 auto *LowerSplitInt = createComplexDeinterleavingIR(
22137 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22138 auto *UpperSplitInt = createComplexDeinterleavingIR(
22139 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22140
 // JoinMask is the identity sequence over the full element count, which
 // concatenates the two half results back into the original width.
22141 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22142 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22143 }
22144
22145 auto *IntTy = Type::getInt32Ty(B.getContext());
22146
22147 ConstantInt *ConstRotation = nullptr;
 // Partial complex multiply lowers to MVE VCMLA (with accumulator) or VCMUL.
22148 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22149 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22150
22151 if (Accumulator)
22152 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22153 {ConstRotation, Accumulator, InputB, InputA});
22154 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22155 {ConstRotation, InputB, InputA});
22156 }
22157
 // Complex add lowers to MVE VCADD; only two rotations are encodable.
22158 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22159 // 1 means the value is not halved.
22160 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22161
 // NOTE(review): the guard condition for this assignment (doxygen line
 // 22162, presumably testing Rotation against one supported value) was lost
 // in extraction.
22163 ConstRotation = ConstantInt::get(IntTy, 0);
 // NOTE(review): the guard condition for this assignment (doxygen line
 // 22164, presumably testing Rotation against the other supported value)
 // was lost in extraction.
22165 ConstRotation = ConstantInt::get(IntTy, 1);
22166
22167 if (!ConstRotation)
22168 return nullptr; // Invalid rotation for arm_mve_vcaddq
22169
22170 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22171 {ConstHalving, ConstRotation, InputA, InputB});
22172 }
22173
 // Unsupported operation kind.
22174 return nullptr;
22175}
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static const MCPhysReg GPRArgRegs[]
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static SDValue matchCSET(unsigned &Opcode, bool &InvertCond, SDValue TrueVal, SDValue FalseVal, const ARMSubtarget *Subtarget)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getInvertedARMCondCode(SDValue ARMcc, SelectionDAG &DAG)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static int getNegationCost(SDValue Op)
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformORCombineToShiftInsert(SelectionDAG &DAG, SDValue AndOp, SDValue ShiftOp, EVT VT, SDLoc dl)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
This file provides a TargetTransformInfoImplBase-conforming object specific to the ARM target machine.
Function Alias Analysis false
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, dxil::ResourceTypeInfo &RTI)
static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI, dxil::ResourceTypeInfo &RTI)
This file defines the DenseMap class.
#define Check(C,...)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
nvptx lower args
uint64_t High
uint64_t IntrinsicInst * II
PowerPC Reduce CR logical Operation
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
SI Lower i1 Copies
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static bool isIntrinsic(const CallBase &Call, Intrinsic::ID ID)
The Input class is used to parse a yaml document into in-memory structs and vectors.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
LLVM_ABI bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5832
APInt bitcastToAPInt() const
Definition APFloat.h:1408
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1387
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1685
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1064
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1527
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:956
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1208
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1118
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1613
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1776
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:865
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:858
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1671
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
const ARMBaseRegisterInfo & getRegisterInfo() const
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:1065
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:859
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
MachineConstantPoolValue * getMachineCPVal() const
const Constant * getConstVal() const
LLVM_ABI Type * getType() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:215
bool isBigEndian() const
Definition DataLayout.h:216
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:248
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
StringRef getInternalSymbolPrefix() const
Definition DataLayout.h:306
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:123
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
arg_iterator arg_begin()
Definition Function.h:868
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:695
const Argument * const_arg_iterator
Definition Function.h:74
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:229
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:728
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2812
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
Tracks which library functions to use for a particular subtarget.
LLVM_ABI CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const
Get the CallingConv that should be used for the specified libcall.
LLVM_ABI RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Return the lowering's selection of implementation call for Call.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
const LibcallLoweringInfo & getLibcalls() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:124
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:137
const unsigned char * bytes_begin() const
Definition StringRef.h:121
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
virtual void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::LibcallImpl LibcallImpl, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
void setTypeIdForCallsiteInfo(const CallBase *CB, MachineFunction &MF, MachineFunction::CallSiteInfo &CSInfo) const
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:453
bool isOSWindows() const
Tests whether the OS is Windows.
Definition Triple.h:716
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:312
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS)
@ SBREL
Section Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering)
const unsigned FPReservedBits
const unsigned RoundingBitsPos
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:511
@ SET_FPENV
Sets the current floating-point environment.
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:168
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:538
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ RESET_FPENV
Set floating-point environment to default state.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:172
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:438
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:975
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:715
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ BR
Control flow instructions. These all have token chains.
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:827
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:792
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:970
@ STRICT_FP_TO_FP16
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ STRICT_FP16_TO_FP
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:139
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:464
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:735
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:710
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:427
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:458
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:162
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:722
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Define
Register definition.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:313
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:323
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1530
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:261
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if a unsigned integer is an N bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:90
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:308
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
ElementCount getVectorElementCount() const
Definition ValueTypes.h:358
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:479
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:367
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:486
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
EVT changeVectorElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:98
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:215
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:389
bool isFixedLengthVector() const
Definition ValueTypes.h:189
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:55
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:316
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:469
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:210
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:317
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:178
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:327
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:138
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...