LLVM 23.0.0git
ARMISelLowering.cpp
Go to the documentation of this file.
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
122STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
123STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
124STATISTIC(NumConstpoolPromoted,
125 "Number of constants with their storage promoted into constant pools");
126
127static cl::opt<bool>
128ARMInterworking("arm-interworking", cl::Hidden,
129 cl::desc("Enable / disable ARM interworking (for debugging only)"),
130 cl::init(true));
131
133 "arm-promote-constant", cl::Hidden,
134 cl::desc("Enable / disable promotion of unnamed_addr constants into "
135 "constant pools"),
136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
138 "arm-promote-constant-max-size", cl::Hidden,
139 cl::desc("Maximum size of constant to promote into a constant pool"),
140 cl::init(64));
142 "arm-promote-constant-max-total", cl::Hidden,
143 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
144 cl::init(128));
145
147MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
148 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
149 cl::init(2));
150
152 "arm-max-base-updates-to-check", cl::Hidden,
153 cl::desc("Maximum number of base-updates to check generating postindex."),
154 cl::init(64));
155
156/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
157constexpr MVT FlagsVT = MVT::i32;
158
159// The APCS parameter registers.
160static const MCPhysReg GPRArgRegs[] = {
161 ARM::R0, ARM::R1, ARM::R2, ARM::R3
162};
163
165 SelectionDAG &DAG, const SDLoc &DL) {
167 assert(Arg.ArgVT.bitsLT(MVT::i32));
168 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
169 SDValue Ext =
171 MVT::i32, Trunc);
172 return Ext;
173}
174
175void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
176 if (VT != PromotedLdStVT) {
178 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
179
181 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
182 }
183
184 MVT ElemTy = VT.getVectorElementType();
185 if (ElemTy != MVT::f64)
189 if (ElemTy == MVT::i32) {
194 } else {
199 }
208 if (VT.isInteger()) {
212 }
213
214 // Neon does not support vector divide/remainder operations.
223
224 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
225 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
227 setOperationAction(Opcode, VT, Legal);
228 if (!VT.isFloatingPoint())
229 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
230 setOperationAction(Opcode, VT, Legal);
231}
232
233void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
234 addRegisterClass(VT, &ARM::DPRRegClass);
235 addTypeForNEON(VT, MVT::f64);
236}
237
238void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
239 addRegisterClass(VT, &ARM::DPairRegClass);
240 addTypeForNEON(VT, MVT::v2f64);
241}
242
243void ARMTargetLowering::setAllExpand(MVT VT) {
244 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
246
247 // We support these really simple operations even on types where all
248 // the actual arithmetic has to be broken down into simpler
249 // operations or turned into library calls.
254}
255
256void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
257 LegalizeAction Action) {
258 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
260 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
261}
262
263void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
264 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
265
266 for (auto VT : IntTypes) {
267 addRegisterClass(VT, &ARM::MQPRRegClass);
298
299 // No native support for these.
309
310 // Vector reductions
320
321 if (!HasMVEFP) {
326 } else {
329 }
330
331 // Pre and Post inc are supported on loads and stores
332 for (unsigned im = (unsigned)ISD::PRE_INC;
333 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
338 }
339 }
340
341 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
342 for (auto VT : FloatTypes) {
343 addRegisterClass(VT, &ARM::MQPRRegClass);
344 if (!HasMVEFP)
345 setAllExpand(VT);
346
347 // These are legal or custom whether we have MVE.fp or not
360
361 // Pre and Post inc are supported on loads and stores
362 for (unsigned im = (unsigned)ISD::PRE_INC;
363 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
368 }
369
370 if (HasMVEFP) {
378 }
383
384 // No native support for these.
399 }
400 }
401
402 // Custom Expand smaller than legal vector reductions to prevent false zero
403 // items being added.
412
413 // We 'support' these types up to bitcast/load/store level, regardless of
414 // MVE integer-only / float support. Only doing FP data processing on the FP
415 // vector types is inhibited at integer-only level.
416 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
417 for (auto VT : LongTypes) {
418 addRegisterClass(VT, &ARM::MQPRRegClass);
419 setAllExpand(VT);
425 }
427
428 // We can do bitwise operations on v2i64 vectors
429 setOperationAction(ISD::AND, MVT::v2i64, Legal);
430 setOperationAction(ISD::OR, MVT::v2i64, Legal);
431 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
432
433 // It is legal to extload from v8i8 to v8i16 and from v4i8/v4i16 to v4i32.
434 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
436 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
437
438 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
444
445 // Some truncating stores are legal too.
446 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
447 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
448 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
449
450 // Pre and Post inc on these are legal, given the correct extends
451 for (unsigned im = (unsigned)ISD::PRE_INC;
452 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
453 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
458 }
459 }
460
461 // Predicate types
462 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
463 for (auto VT : pTypes) {
464 addRegisterClass(VT, &ARM::VCCRRegClass);
479
480 if (!HasMVEFP) {
485 }
486 }
490 setOperationAction(ISD::OR, MVT::v2i1, Expand);
496
505}
506
508 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
509}
510
512 const ARMSubtarget &STI)
513 : TargetLowering(TM_, STI), Subtarget(&STI),
514 RegInfo(Subtarget->getRegisterInfo()),
515 Itins(Subtarget->getInstrItineraryData()) {
516 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
517
520
521 const Triple &TT = TM.getTargetTriple();
522
523 if (Subtarget->isThumb1Only())
524 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
525 else
526 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
527
528 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
529 Subtarget->hasFPRegs()) {
530 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
531 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
532
533 if (!Subtarget->hasVFP2Base()) {
534 setAllExpand(MVT::f32);
535 } else {
538
541 setOperationAction(Op, MVT::f32, Legal);
542 }
543 if (!Subtarget->hasFP64()) {
544 setAllExpand(MVT::f64);
545 } else {
548 setOperationAction(Op, MVT::f64, Legal);
549
551 }
552 }
553
554 if (Subtarget->hasFullFP16()) {
557 setOperationAction(Op, MVT::f16, Legal);
558
559 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
562
567 }
568
569 if (Subtarget->hasBF16()) {
570 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
571 setAllExpand(MVT::bf16);
572 if (!Subtarget->hasFullFP16())
574 } else {
579 }
580
582 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
583 setTruncStoreAction(VT, InnerVT, Expand);
584 addAllExtLoads(VT, InnerVT, Expand);
585 }
586
589
591 }
592
593 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
595
596 if (!Subtarget->hasV8_1MMainlineOps())
598
599 if (!Subtarget->isThumb1Only())
601
604
607
608 if (Subtarget->hasMVEIntegerOps())
609 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
610
611 // Combine low-overhead loop intrinsics so that we can lower i1 types.
612 if (Subtarget->hasLOB()) {
614 }
615
616 if (Subtarget->hasNEON()) {
617 addDRTypeForNEON(MVT::v2f32);
618 addDRTypeForNEON(MVT::v8i8);
619 addDRTypeForNEON(MVT::v4i16);
620 addDRTypeForNEON(MVT::v2i32);
621 addDRTypeForNEON(MVT::v1i64);
622
623 addQRTypeForNEON(MVT::v4f32);
624 addQRTypeForNEON(MVT::v2f64);
625 addQRTypeForNEON(MVT::v16i8);
626 addQRTypeForNEON(MVT::v8i16);
627 addQRTypeForNEON(MVT::v4i32);
628 addQRTypeForNEON(MVT::v2i64);
629
630 if (Subtarget->hasFullFP16()) {
631 addQRTypeForNEON(MVT::v8f16);
632 addDRTypeForNEON(MVT::v4f16);
633 }
634
635 if (Subtarget->hasBF16()) {
636 addQRTypeForNEON(MVT::v8bf16);
637 addDRTypeForNEON(MVT::v4bf16);
638 }
639 }
640
641 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
642 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
643 // none of Neon, MVE or VFP supports any arithmetic operations on it.
644 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
645 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
646 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
647 // FIXME: Code duplication: FDIV and FREM are expanded always, see
648 // ARMTargetLowering::addTypeForNEON method for details.
649 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
650 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
651 // FIXME: Create unittest.
652 // In other words, find a way when "copysign" appears in DAG with vector
653 // operands.
655 // FIXME: Code duplication: SETCC has custom operation action, see
656 // ARMTargetLowering::addTypeForNEON method for details.
658 // FIXME: Create unittest for FNEG and for FABS.
659 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
660 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
662 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
663 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
664 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
665 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
666 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
669 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
678 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
679 }
680
681 if (Subtarget->hasNEON()) {
682 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
683 // supported for v4f32.
685 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
686 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
687 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
688 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
689 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
692 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
701
702 // Mark v2f32 intrinsics.
704 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
705 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
706 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
707 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
708 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
711 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
720
723 setOperationAction(Op, MVT::v4f16, Expand);
724 setOperationAction(Op, MVT::v8f16, Expand);
725 }
726
727 // Neon does not support some operations on v1i64 and v2i64 types.
728 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
729 // Custom handling for some quad-vector types to detect VMULL.
730 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
731 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
732 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
733 // Custom handling for some vector types to avoid expensive expansions
734 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
736 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
738 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
739 // a destination type that is wider than the source, and nor does
740 // it have a FP_TO_[SU]INT instruction with a narrower destination than
741 // source.
750
753
754 // NEON does not have single instruction CTPOP for vectors with element
755 // types wider than 8-bits. However, custom lowering can leverage the
756 // v8i8/v16i8 vcnt instruction.
763
764 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
765 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
766
767 // NEON does not have single instruction CTTZ for vectors.
769 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
770 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
771 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
772
773 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
774 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
775 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
776 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
777
782
787
791 }
792
793 // NEON only has FMA instructions as of VFP4.
794 if (!Subtarget->hasVFP4Base()) {
795 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
796 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
797 }
798
801
802 // It is legal to extload from v4i8 to v4i16 or v4i32.
803 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
804 MVT::v2i32}) {
809 }
810 }
811
812 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
813 MVT::v4i32}) {
818 }
819 }
820
821 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
828 }
829 if (Subtarget->hasMVEIntegerOps()) {
832 ISD::SETCC});
833 }
834 if (Subtarget->hasMVEFloatOps()) {
836 }
837
838 if (!Subtarget->hasFP64()) {
839 // When targeting a floating-point unit with only single-precision
840 // operations, f64 is legal for the few double-precision instructions which
841 // are present. However, no double-precision operations other than moves,
842 // loads and stores are provided by the hardware.
879 }
880
883
884 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
887 if (Subtarget->hasFullFP16()) {
890 }
891 } else {
893 }
894
895 if (!Subtarget->hasFP16()) {
898 } else {
901 }
902
903 computeRegisterProperties(Subtarget->getRegisterInfo());
904
905 // ARM does not have floating-point extending loads.
906 for (MVT VT : MVT::fp_valuetypes()) {
907 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
908 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
909 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
910 }
911
912 // ... or truncating stores
913 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
914 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
915 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
916 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
917 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
918
919 // ARM does not have i1 sign extending load.
920 for (MVT VT : MVT::integer_valuetypes())
922
923 // ARM supports all 4 flavors of integer indexed load / store.
924 if (!Subtarget->isThumb1Only()) {
925 for (unsigned im = (unsigned)ISD::PRE_INC;
927 setIndexedLoadAction(im, MVT::i1, Legal);
928 setIndexedLoadAction(im, MVT::i8, Legal);
929 setIndexedLoadAction(im, MVT::i16, Legal);
930 setIndexedLoadAction(im, MVT::i32, Legal);
931 setIndexedStoreAction(im, MVT::i1, Legal);
932 setIndexedStoreAction(im, MVT::i8, Legal);
933 setIndexedStoreAction(im, MVT::i16, Legal);
934 setIndexedStoreAction(im, MVT::i32, Legal);
935 }
936 } else {
937 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
940 }
941
942 // Custom loads/stores to possibly use __aeabi_uread/write*
943 if (TT.isTargetAEABI() && !Subtarget->allowsUnalignedMem()) {
948 }
949
954
955 if (!Subtarget->isThumb1Only()) {
958 }
959
964 if (Subtarget->hasDSP()) {
973 }
974 if (Subtarget->hasBaseDSP()) {
977 }
978
979 // i64 operation support.
982 if (Subtarget->isThumb1Only()) {
985 }
986 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
987 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
989
999
1000 // MVE lowers 64 bit shifts to lsll and lsrl
1001 // assuming that ISD::SRL and SRA of i64 are already marked custom
1002 if (Subtarget->hasMVEIntegerOps())
1004
1005 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1006 if (Subtarget->isThumb1Only()) {
1010 }
1011
1012 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1014
1015 // ARM does not have ROTL.
1020 }
1022 // TODO: These two should be set to LibCall, but this currently breaks
1023 // the Linux kernel build. See #101786.
1026 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1029 }
1030
1031 // @llvm.readcyclecounter requires the Performance Monitors extension.
1032 // Default to the 0 expansion on unsupported platforms.
1033 // FIXME: Technically there are older ARM CPUs that have
1034 // implementation-specific ways of obtaining this information.
1035 if (Subtarget->hasPerfMon())
1037
1038 // Only ARMv6 has BSWAP.
1039 if (!Subtarget->hasV6Ops())
1041
1042 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1043 : Subtarget->hasDivideInARMMode();
1044 if (!hasDivide) {
1045 // These are expanded into libcalls if the cpu doesn't have HW divider.
1048 }
1049
1050 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1053
1056 }
1057
1060
1061 // Register based DivRem for AEABI (RTABI 4.2)
1062 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1063 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1066 HasStandaloneRem = false;
1067
1072 } else {
1075 }
1076
1081
1082 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1084
1085 // Use the default implementation.
1087 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1089 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1092
1093 if (TT.isOSWindows())
1095 else
1097
1098 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1099 // the default expansion.
1100 InsertFencesForAtomic = false;
1101 if (Subtarget->hasAnyDataBarrier() &&
1102 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1103 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1104 // to ldrex/strex loops already.
1106 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1108
1109 // On v8, we have particularly efficient implementations of atomic fences
1110 // if they can be combined with nearby atomic loads and stores.
1111 if (!Subtarget->hasAcquireRelease() ||
1112 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1113 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1114 InsertFencesForAtomic = true;
1115 }
1116 } else {
1117 // If there's anything we can use as a barrier, go through custom lowering
1118 // for ATOMIC_FENCE.
1119 // If target has DMB in thumb, Fences can be inserted.
1120 if (Subtarget->hasDataBarrier())
1121 InsertFencesForAtomic = true;
1122
1124 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1125
1126 // Set them all for libcall, which will force libcalls.
1139 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1140 // Unordered/Monotonic case.
1141 if (!InsertFencesForAtomic) {
1144 }
1145 }
1146
1147 // Compute supported atomic widths.
1148 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1149 // For targets where __sync_* routines are reliably available, we use them
1150 // if necessary.
1151 //
1152 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1153 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1154 //
1155 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1156 // such targets should provide __sync_* routines, which use the ARM mode
1157 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1158 // encoding; see ARMISD::MEMBARRIER_MCR.)
1160 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1161 Subtarget->hasForced32BitAtomics()) {
1162 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1164 } else {
1165 // We can't assume anything about other targets; just use libatomic
1166 // routines.
1168 }
1169
1171
1173
1174 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1175 if (!Subtarget->hasV6Ops()) {
1178 }
1180
1181 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1182 !Subtarget->isThumb1Only()) {
1183 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1184 // iff target supports vfp2.
1194 }
1195
1196 // We want to custom lower some of our intrinsics.
1201
1211 if (Subtarget->hasFullFP16()) {
1215 }
1216
1218
1221 if (Subtarget->hasFullFP16())
1225 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1226
1227 // We don't support sin/cos/fmod/copysign/pow
1236 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1237 !Subtarget->isThumb1Only()) {
1240 }
1243
1244 if (!Subtarget->hasVFP4Base()) {
1247 }
1248
1249 // Various VFP goodness
1250 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1251 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1252 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1257 }
1258
1259 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1260 if (!Subtarget->hasFP16()) {
1265 }
1266
1267 // Strict floating-point comparisons need custom lowering.
1274 }
1275
1276 // FP-ARMv8 implements a lot of rounding-like FP operations.
1277 if (Subtarget->hasFPARMv8Base()) {
1278 for (auto Op :
1285 setOperationAction(Op, MVT::f32, Legal);
1286
1287 if (Subtarget->hasFP64())
1288 setOperationAction(Op, MVT::f64, Legal);
1289 }
1290
1291 if (Subtarget->hasNEON()) {
1296 }
1297 }
1298
1299 // FP16 often need to be promoted to call lib functions
1300 // clang-format off
1301 if (Subtarget->hasFullFP16()) {
1305
1306 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1320 setOperationAction(Op, MVT::f16, Promote);
1321 }
1322
1323 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
1324 // because the result type is integer.
1326 setOperationAction(Op, MVT::f16, Custom);
1327
1333 setOperationAction(Op, MVT::f16, Legal);
1334 }
1335 // clang-format on
1336 }
1337
1338 if (Subtarget->hasNEON()) {
1339 // vmin and vmax aren't available in a scalar form, so we can use
1340 // a NEON instruction with an undef lane instead.
1349
1350 if (Subtarget->hasV8Ops()) {
1355 setOperationAction(Op, MVT::v2f32, Legal);
1356 setOperationAction(Op, MVT::v4f32, Legal);
1357 }
1358 }
1359
1360 if (Subtarget->hasFullFP16()) {
1365
1370
1375 setOperationAction(Op, MVT::v4f16, Legal);
1376 setOperationAction(Op, MVT::v8f16, Legal);
1377 }
1378 }
1379 }
1380
1381 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1382 // it, but it's just a wrapper around ldexp.
1383 if (TT.isOSWindows()) {
1385 if (isOperationExpand(Op, MVT::f32))
1386 setOperationAction(Op, MVT::f32, Promote);
1387 }
1388
1389 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1390 // isn't legal.
1392 if (isOperationExpand(Op, MVT::f16))
1393 setOperationAction(Op, MVT::f16, Promote);
1394
1395 // We have target-specific dag combine patterns for the following nodes:
1396 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1399
1400 if (Subtarget->hasMVEIntegerOps())
1402
1403 if (Subtarget->hasV6Ops())
1405 if (Subtarget->isThumb1Only())
1407 // Attempt to lower smin/smax to ssat/usat
1408 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1409 Subtarget->isThumb2()) {
1411 }
1412
1414
1415 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1416 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1418 else
1420
1421 //// temporary - rewrite interface to use type
1424 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1426 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1428
1429 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1430 // are at least 4 bytes aligned.
1432
1433 // Prefer likely predicted branches to selects on out-of-order cores.
1434 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1435
1436 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1438 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1439
1440 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1441
1442 IsStrictFPEnabled = true;
1443}
1444
1446 return Subtarget->useSoftFloat();
1447}
1448
1450 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1451}
1452
1453// FIXME: It might make sense to define the representative register class as the
1454// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1455// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1456// SPR's representative would be DPR_VFP2. This should work well if register
1457// pressure tracking were modified such that a register use would increment the
1458// pressure of the register class's representative and all of its super
1459// classes' representatives transitively. We have not implemented this because
1460// of the difficulty prior to coalescing of modeling operand register classes
1461// due to the common occurrence of cross class copies and subregister insertions
1462// and extractions.
1463std::pair<const TargetRegisterClass *, uint8_t>
1465 MVT VT) const {
1466 const TargetRegisterClass *RRC = nullptr;
1467 uint8_t Cost = 1;
1468 switch (VT.SimpleTy) {
1469 default:
1471 // Use DPR as representative register class for all floating point
1472 // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1473 // the cost is 1 for both f32 and f64.
1474 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1475 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1476 RRC = &ARM::DPRRegClass;
1477 // When NEON is used for SP, only half of the register file is available
1478 // because operations that define both SP and DP results will be constrained
1479 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1480 // coalescing by double-counting the SP regs. See the FIXME above.
1481 if (Subtarget->useNEONForSinglePrecisionFP())
1482 Cost = 2;
1483 break;
1484 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1485 case MVT::v4f32: case MVT::v2f64:
1486 RRC = &ARM::DPRRegClass;
1487 Cost = 2;
1488 break;
1489 case MVT::v4i64:
1490 RRC = &ARM::DPRRegClass;
1491 Cost = 4;
1492 break;
1493 case MVT::v8i64:
1494 RRC = &ARM::DPRRegClass;
1495 Cost = 8;
1496 break;
1497 }
1498 return std::make_pair(RRC, Cost);
1499}
1500
1502 EVT VT) const {
1503 if (!VT.isVector())
1504 return getPointerTy(DL);
1505
1506 // MVE has a predicate register.
1507 if (Subtarget->hasMVEIntegerOps())
1508 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1509
1511}
1512
1513/// getRegClassFor - Return the register class that should be used for the
1514/// specified value type.
1515const TargetRegisterClass *
1516ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1517 (void)isDivergent;
1518 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1519 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1520 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1521 // MVE Q registers.
1522 if (Subtarget->hasNEON()) {
1523 if (VT == MVT::v4i64)
1524 return &ARM::QQPRRegClass;
1525 if (VT == MVT::v8i64)
1526 return &ARM::QQQQPRRegClass;
1527 }
1528 if (Subtarget->hasMVEIntegerOps()) {
1529 if (VT == MVT::v4i64)
1530 return &ARM::MQQPRRegClass;
1531 if (VT == MVT::v8i64)
1532 return &ARM::MQQQQPRRegClass;
1533 }
1535}
1536
1537// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1538// source/dest is aligned and the copy size is large enough. We therefore want
1539// to align such objects passed to memory intrinsics.
1541 Align &PrefAlign) const {
1542 if (!isa<MemIntrinsic>(CI))
1543 return false;
1544 MinSize = 8;
1545 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1546 // cycle faster than 4-byte aligned LDM.
1547 PrefAlign =
1548 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1549 return true;
1550}
1551
1552// Create a fast isel object.
1554 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
1555 const LibcallLoweringInfo *libcallLowering) const {
1556 return ARM::createFastISel(funcInfo, libInfo, libcallLowering);
1557}
1558
1560 unsigned NumVals = N->getNumValues();
1561 if (!NumVals)
1562 return Sched::RegPressure;
1563
1564 for (unsigned i = 0; i != NumVals; ++i) {
1565 EVT VT = N->getValueType(i);
1566 if (VT == MVT::Glue || VT == MVT::Other)
1567 continue;
1568 if (VT.isFloatingPoint() || VT.isVector())
1569 return Sched::ILP;
1570 }
1571
1572 if (!N->isMachineOpcode())
1573 return Sched::RegPressure;
1574
1575 // Loads are scheduled for latency even if their instruction itinerary
1576 // is not available.
1577 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1578 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1579
1580 if (MCID.getNumDefs() == 0)
1581 return Sched::RegPressure;
1582 if (!Itins->isEmpty() &&
1583 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1584 return Sched::ILP;
1585
1586 return Sched::RegPressure;
1587}
1588
1589//===----------------------------------------------------------------------===//
1590// Lowering Code
1591//===----------------------------------------------------------------------===//
1592
1593static bool isSRL16(const SDValue &Op) {
1594 if (Op.getOpcode() != ISD::SRL)
1595 return false;
1596 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1597 return Const->getZExtValue() == 16;
1598 return false;
1599}
1600
1601static bool isSRA16(const SDValue &Op) {
1602 if (Op.getOpcode() != ISD::SRA)
1603 return false;
1604 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1605 return Const->getZExtValue() == 16;
1606 return false;
1607}
1608
1609static bool isSHL16(const SDValue &Op) {
1610 if (Op.getOpcode() != ISD::SHL)
1611 return false;
1612 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1613 return Const->getZExtValue() == 16;
1614 return false;
1615}
1616
1617// Check for a signed 16-bit value. We special case SRA because it makes it
1618// more simple when also looking for SRAs that aren't sign extending a
1619// smaller value. Without the check, we'd need to take extra care with
1620// checking order for some operations.
1621static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1622 if (isSRA16(Op))
1623 return isSHL16(Op.getOperand(0));
1624 return DAG.ComputeNumSignBits(Op) == 17;
1625}
1626
1627/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
/// Signed comparisons map to GT/GE/LT/LE and unsigned comparisons map to
/// HI/HS/LO/LS; any other condition code reaches llvm_unreachable.
1629 switch (CC) {
1630 default: llvm_unreachable("Unknown condition code!");
1631 case ISD::SETNE: return ARMCC::NE;
1632 case ISD::SETEQ: return ARMCC::EQ;
1633 case ISD::SETGT: return ARMCC::GT;
1634 case ISD::SETGE: return ARMCC::GE;
1635 case ISD::SETLT: return ARMCC::LT;
1636 case ISD::SETLE: return ARMCC::LE;
1637 case ISD::SETUGT: return ARMCC::HI;
1638 case ISD::SETUGE: return ARMCC::HS;
1639 case ISD::SETULT: return ARMCC::LO;
1640 case ISD::SETULE: return ARMCC::LS;
1641 }
1642}
1643
1644/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
/// The primary result is written to CondCode. CondCode2 is reset to ARMCC::AL
/// and only overwritten for SETONE (MI then GT) and SETUEQ (EQ then VS),
/// which are expressed with two ARM condition codes. Any condition code not
/// listed in the switch reaches llvm_unreachable.
1646 ARMCC::CondCodes &CondCode2) {
1647 CondCode2 = ARMCC::AL;
1648 switch (CC) {
1649 default: llvm_unreachable("Unknown FP condition!");
1650 case ISD::SETEQ:
1651 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1652 case ISD::SETGT:
1653 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1654 case ISD::SETGE:
1655 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1656 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1657 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1658 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1659 case ISD::SETO: CondCode = ARMCC::VC; break;
1660 case ISD::SETUO: CondCode = ARMCC::VS; break;
1661 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1662 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1663 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1664 case ISD::SETLT:
1665 case ISD::SETULT: CondCode = ARMCC::LT; break;
1666 case ISD::SETLE:
1667 case ISD::SETULE: CondCode = ARMCC::LE; break;
1668 case ISD::SETNE:
1669 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1670 }
1671}
1672
1673//===----------------------------------------------------------------------===//
1674// Calling Convention Implementation
1675//===----------------------------------------------------------------------===//
1676
1677/// getEffectiveCallingConv - Get the effective calling convention, taking into
1678/// account presence of floating point hardware and calling convention
1679/// limitations, such as support for variadic functions.
/// A calling convention that is not handled by the switch is reported through
/// report_fatal_error.
1681ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1682 bool isVarArg) const {
1683 switch (CC) {
1684 default:
1685 report_fatal_error("Unsupported calling convention");
1688 case CallingConv::GHC:
1690 return CC;
1696 case CallingConv::Swift:
1699 case CallingConv::C:
1700 case CallingConv::Tail:
// Targets that are not using the AAPCS ABI fall back to ARM_APCS.
1701 if (!getTM().isAAPCS_ABI())
1702 return CallingConv::ARM_APCS;
// Otherwise the choice depends on FP registers being available (and not
// Thumb1-only), the hard-float ABI being selected, and the call not being
// variadic.
1703 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1704 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1705 !isVarArg)
1707 else
1709 case CallingConv::Fast:
1711 if (!getTM().isAAPCS_ABI()) {
// Without AAPCS, Fast is kept only when FP registers are usable and the
// call is not variadic; otherwise ARM_APCS is used.
1712 if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() && !isVarArg)
1713 return CallingConv::Fast;
1714 return CallingConv::ARM_APCS;
1715 } else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1716 !isVarArg)
1718 else
1720 }
1721}
1722
1724 bool isVarArg) const {
// Delegates to CCAssignFnForNode with Return = false, i.e. the assignment
// function for outgoing call arguments.
1725 return CCAssignFnForNode(CC, false, isVarArg);
1726}
1727
1729 bool isVarArg) const {
// Delegates to CCAssignFnForNode with Return = true, i.e. the assignment
// function for return values.
1730 return CCAssignFnForNode(CC, true, isVarArg);
1731}
1732
1733/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1734/// CallingConvention.
/// The convention is first mapped through getEffectiveCallingConv. \p Return
/// then selects between the return-value assignment function (RetCC_* /
/// RetFastCC_*) and the argument assignment function (CC_* / FastCC_*).
/// An effective convention without a case here is reported through
/// report_fatal_error.
1735CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1736 bool Return,
1737 bool isVarArg) const {
1738 switch (getEffectiveCallingConv(CC, isVarArg)) {
1739 default:
1740 report_fatal_error("Unsupported calling convention");
1742 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1744 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1746 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1747 case CallingConv::Fast:
1748 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1749 case CallingConv::GHC:
1750 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1752 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1754 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1756 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
1757 }
1758}
1759
1760SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
1761 MVT LocVT, MVT ValVT, SDValue Val) const {
1762 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
1763 Val);
1764 if (Subtarget->hasFullFP16()) {
1765 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
1766 } else {
1767 Val = DAG.getNode(ISD::TRUNCATE, dl,
1768 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1769 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
1770 }
1771 return Val;
1772}
1773
1774SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
1775 MVT LocVT, MVT ValVT,
1776 SDValue Val) const {
1777 if (Subtarget->hasFullFP16()) {
1778 Val = DAG.getNode(ARMISD::VMOVrh, dl,
1779 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1780 } else {
1781 Val = DAG.getNode(ISD::BITCAST, dl,
1782 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1783 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
1784 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1785 }
1786 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
1787}
1788
1789/// LowerCallResult - Lower the result values of a call into the
1790/// appropriate copies out of appropriate physical registers.
/// The lowered values are appended to \p InVals and the updated chain is
/// returned. When \p isThisReturn is set, \p ThisVal is used as the first
/// result instead of a register copy. When \p isCmseNSCall is set, scalar
/// integer results narrower than 32 bits are passed through handleCMSEValue.
1791SDValue ARMTargetLowering::LowerCallResult(
1792 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1793 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1794 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1795 SDValue ThisVal, bool isCmseNSCall) const {
1796 // Assign locations to each value returned by this call.
1798 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1799 *DAG.getContext());
1800 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1801
1802 // Copy all of the result registers out of their specified physreg.
// Note that i is also advanced inside the loop body when one value occupies
// several locations (f64 and v2f64).
1803 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1804 CCValAssign VA = RVLocs[i];
1805
1806 // Pass 'this' value directly from the argument to return value, to avoid
1807 // reg unit interference
1808 if (i == 0 && isThisReturn) {
1809 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1810 "unexpected return calling convention register assignment");
1811 InVals.push_back(ThisVal);
1812 continue;
1813 }
1814
1815 SDValue Val;
1816 if (VA.needsCustom() &&
1817 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
1818 // Handle f64 or half of a v2f64.
// Each f64 is read as two i32 register copies threaded through Chain/InGlue.
1819 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1820 InGlue);
1821 Chain = Lo.getValue(1);
1822 InGlue = Lo.getValue(2);
1823 VA = RVLocs[++i]; // skip ahead to next loc
1824 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1825 InGlue);
1826 Chain = Hi.getValue(1);
1827 InGlue = Hi.getValue(2);
// On big-endian subtargets the two halves are swapped before being combined.
1828 if (!Subtarget->isLittle())
1829 std::swap (Lo, Hi);
1830 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1831
1832 if (VA.getLocVT() == MVT::v2f64) {
// The f64 built above becomes element 0; a second f64 is assembled the same
// way from the next two locations and inserted as element 1.
1833 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1834 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1835 DAG.getConstant(0, dl, MVT::i32));
1836
1837 VA = RVLocs[++i]; // skip ahead to next loc
1838 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1839 Chain = Lo.getValue(1);
1840 InGlue = Lo.getValue(2);
1841 VA = RVLocs[++i]; // skip ahead to next loc
1842 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1843 Chain = Hi.getValue(1);
1844 InGlue = Hi.getValue(2);
1845 if (!Subtarget->isLittle())
1846 std::swap (Lo, Hi);
1847 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1848 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1849 DAG.getConstant(1, dl, MVT::i32));
1850 }
1851 } else {
1852 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1853 InGlue);
1854 Chain = Val.getValue(1);
1855 InGlue = Val.getValue(2);
1856 }
1857
// Only Full and BCvt location infos are expected for call results.
1858 switch (VA.getLocInfo()) {
1859 default: llvm_unreachable("Unknown loc info!");
1860 case CCValAssign::Full: break;
1861 case CCValAssign::BCvt:
1862 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1863 break;
1864 }
1865
1866 // f16/bf16 results have their size extended to 4 bytes and are returned as
1867 // if they had been copied to the LSBs of a 32-bit register.
1868 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
1869 if (VA.needsCustom() &&
1870 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
1871 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
1872
1873 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
1874 // is less than 32 bits must be sign- or zero-extended after the call for
1875 // security reasons. Although the ABI mandates an extension done by the
1876 // callee, the latter cannot be trusted to follow the rules of the ABI.
1877 const ISD::InputArg &Arg = Ins[VA.getValNo()];
1878 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
1879 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
1880 Val = handleCMSEValue(Val, Arg, DAG, dl);
1881
1882 InVals.push_back(Val);
1883 }
1884
1885 return Chain;
1886}
1887
/// Compute the destination address and pointer info for the outgoing call
/// argument described by \p VA. For a tail call the destination is a fixed
/// frame object created at the argument's memory offset adjusted by
/// \p SPDiff; otherwise it is \p StackPtr plus the argument's memory offset.
1888std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
1889 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
1890 bool IsTailCall, int SPDiff) const {
1891 SDValue DstAddr;
1892 MachinePointerInfo DstInfo;
1893 int32_t Offset = VA.getLocMemOffset();
1894 MachineFunction &MF = DAG.getMachineFunction();
1895
1896 if (IsTailCall) {
1897 Offset += SPDiff;
1898 auto PtrVT = getPointerTy(DAG.getDataLayout());
// The fixed object is sized from the location type, in bytes.
1899 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
1900 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
1901 DstAddr = DAG.getFrameIndex(FI, PtrVT);
1902 DstInfo =
1904 } else {
1905 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
1906 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1907 StackPtr, PtrOff);
1908 DstInfo =
1910 }
1911
1912 return std::make_pair(DstAddr, DstInfo);
1913}
1914
1915// Returns the type of copying which is required to set up a byval argument to
1916// a tail-called function. This isn't needed for non-tail calls, because they
1917// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
1918// avoid clobbering another argument (CopyViaTemp), and sometimes can be
1919// optimised to zero copies when forwarding an argument from the caller's
1920// caller (NoCopy).
// Src is the address the byval data is read from, Dst the address of its
// slot in the outgoing argument area, and Flags supplies the byval size.
1921ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
1922 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
1923 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1924 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
1925
1926 // Globals are always safe to copy from.
1928 return CopyOnce;
1929
1930 // Can only analyse frame index nodes, conservatively assume we need a
1931 // temporary.
1932 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
1933 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
1934 if (!SrcFrameIdxNode || !DstFrameIdxNode)
1935 return CopyViaTemp;
1936
1937 int SrcFI = SrcFrameIdxNode->getIndex();
1938 int DstFI = DstFrameIdxNode->getIndex();
1939 assert(MFI.isFixedObjectIndex(DstFI) &&
1940 "byval passed in non-fixed stack slot");
1941
1942 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
1943 int64_t DstOffset = MFI.getObjectOffset(DstFI);
1944
1945 // If the source is in the local frame, then the copy to the argument memory
1946 // is always valid.
// A fixed source object whose offset lies below -getArgRegsSaveSize() is
// treated the same way.
1947 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
1948 if (!FixedSrc ||
1949 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
1950 return CopyOnce;
1951
1952 // In the case of byval arguments split between registers and the stack,
1953 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
1954 // stack portion, but the Src SDValue will refer to the full value, including
1955 // the local stack memory that the register portion gets stored into. We only
1956 // need to compare them for equality, so normalise on the full value version.
1957 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
1958 DstOffset -= RegSize;
1959
1960 // If the value is already in the correct location, then no copying is
1961 // needed. If not, then we need to copy via a temporary.
1962 if (SrcOffset == DstOffset)
1963 return NoCopy;
1964 else
1965 return CopyViaTemp;
1966}
1967
/// Split the f64 \p Arg into two i32 halves with ARMISD::VMOVRRD and pass
/// them on. The half selected by endianness is queued in \p RegsToPass for
/// VA's register. The other half is queued for NextVA's register, or, when
/// NextVA is a memory location, stored to the address computed by
/// computeAddrForCallArg, with the store appended to \p MemOpChains.
1968void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1969 SDValue Chain, SDValue &Arg,
1970 RegsToPassVector &RegsToPass,
1971 CCValAssign &VA, CCValAssign &NextVA,
1972 SDValue &StackPtr,
1973 SmallVectorImpl<SDValue> &MemOpChains,
1974 bool IsTailCall,
1975 int SPDiff) const {
1976 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1977 DAG.getVTList(MVT::i32, MVT::i32), Arg);
// Result 0 goes to VA's register on little-endian subtargets, result 1 on
// big-endian ones.
1978 unsigned id = Subtarget->isLittle() ? 0 : 1;
1979 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1980
1981 if (NextVA.isRegLoc())
1982 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1983 else {
1984 assert(NextVA.isMemLoc());
// StackPtr is only materialised when the caller has not set it yet; it is
// taken by reference, so the caller sees the new value.
1985 if (!StackPtr.getNode())
1986 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1988
1989 SDValue DstAddr;
1990 MachinePointerInfo DstInfo;
1991 std::tie(DstAddr, DstInfo) =
1992 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
1993 MemOpChains.push_back(
1994 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
1995 }
1996}
1997
// Returns whether tail-call optimisation can be guaranteed for calling
// convention CC. CallingConv::Fast qualifies only when GuaranteeTailCalls is
// set.
1998static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
1999 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2001}
2002
2003/// LowerCall - Lowering a call into a callseq_start <-
2004/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2005/// nodes.
2006SDValue
2007ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2008 SmallVectorImpl<SDValue> &InVals) const {
2009 SelectionDAG &DAG = CLI.DAG;
2010 SDLoc &dl = CLI.DL;
2011 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2012 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2013 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2014 SDValue Chain = CLI.Chain;
2015 SDValue Callee = CLI.Callee;
2016 bool &isTailCall = CLI.IsTailCall;
2017 CallingConv::ID CallConv = CLI.CallConv;
2018 bool doesNotRet = CLI.DoesNotReturn;
2019 bool isVarArg = CLI.IsVarArg;
2020 const CallBase *CB = CLI.CB;
2021
2022 MachineFunction &MF = DAG.getMachineFunction();
2023 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2024 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2025 MachineFunction::CallSiteInfo CSInfo;
2026 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2027 bool isThisReturn = false;
2028 bool isCmseNSCall = false;
2029 bool isSibCall = false;
2030 bool PreferIndirect = false;
2031 bool GuardWithBTI = false;
2032
2033 // Analyze operands of the call, assigning locations to each operand.
2035 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2036 *DAG.getContext());
2037 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2038
2039 // Lower 'returns_twice' calls to a pseudo-instruction.
2040 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2041 !Subtarget->noBTIAtReturnTwice())
2042 GuardWithBTI = AFI->branchTargetEnforcement();
2043
2044 // Set type id for call site info.
2045 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
2046
2047 // Determine whether this is a non-secure function call.
2048 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2049 isCmseNSCall = true;
2050
2051 // Disable tail calls if they're not supported.
2052 if (!Subtarget->supportsTailCall())
2053 isTailCall = false;
2054
2055 // For both the non-secure calls and the returns from a CMSE entry function,
2056 // the function needs to do some extra work after the call, or before the
2057 // return, respectively, thus it cannot end with a tail call
2058 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2059 isTailCall = false;
2060
2061 if (isa<GlobalAddressSDNode>(Callee)) {
2062 // If we're optimizing for minimum size and the function is called three or
2063 // more times in this block, we can improve codesize by calling indirectly
2064 // as BLXr has a 16-bit encoding.
2065 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2066 if (CLI.CB) {
2067 auto *BB = CLI.CB->getParent();
2068 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2069 count_if(GV->users(), [&BB](const User *U) {
2070 return isa<Instruction>(U) &&
2071 cast<Instruction>(U)->getParent() == BB;
2072 }) > 2;
2073 }
2074 }
2075 if (isTailCall) {
2076 // Check if it's really possible to do a tail call.
2077 isTailCall =
2078 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2079
2080 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2081 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2082 isSibCall = true;
2083
2084 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2085 // detected sibcalls.
2086 if (isTailCall)
2087 ++NumTailCalls;
2088 }
2089
2090 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2091 report_fatal_error("failed to perform tail call elimination on a call "
2092 "site marked musttail");
2093
2094 // Get a count of how many bytes are to be pushed on the stack.
2095 unsigned NumBytes = CCInfo.getStackSize();
2096
2097 // SPDiff is the byte offset of the call's argument area from the callee's.
2098 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2099 // by this amount for a tail call. In a sibling call it must be 0 because the
2100 // caller will deallocate the entire stack and the callee still expects its
2101 // arguments to begin at SP+0. Completely unused for non-tail calls.
2102 int SPDiff = 0;
2103
2104 if (isTailCall && !isSibCall) {
2105 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2106 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2107
2108 // Since callee will pop argument stack as a tail call, we must keep the
2109 // popped size aligned to the stack alignment given by the data layout.
2110 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2111 assert(StackAlign && "data layout string is missing stack alignment");
2112 NumBytes = alignTo(NumBytes, *StackAlign);
2113
2114 // SPDiff will be negative if this tail call requires more space than we
2115 // would automatically have in our incoming argument space. Positive if we
2116 // can actually shrink the stack.
2117 SPDiff = NumReusableBytes - NumBytes;
2118
2119 // If this call requires more stack than we have available from
2120 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2121 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2122 AFI->setArgRegsSaveSize(-SPDiff);
2123 }
2124
2125 if (isSibCall) {
2126 // For sibling tail calls, memory operands are available in our caller's stack.
2127 NumBytes = 0;
2128 } else {
2129 // Adjust the stack pointer for the new arguments...
2130 // These operations are automatically eliminated by the prolog/epilog pass
2131 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2132 }
2133
2135 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2136
2137 RegsToPassVector RegsToPass;
2138 SmallVector<SDValue, 8> MemOpChains;
2139
2140 // If we are doing a tail-call, any byval arguments will be written to stack
2141 // space which was used for incoming arguments. If any of the values being used
2142 // are incoming byval arguments to this function, then they might be
2143 // overwritten by the stores of the outgoing arguments. To avoid this, we
2144 // need to make a temporary copy of them in local stack space, then copy back
2145 // to the argument area.
2146 DenseMap<unsigned, SDValue> ByValTemporaries;
2147 SDValue ByValTempChain;
2148 if (isTailCall) {
2149 SmallVector<SDValue, 8> ByValCopyChains;
2150 for (const CCValAssign &VA : ArgLocs) {
2151 unsigned ArgIdx = VA.getValNo();
2152 SDValue Src = OutVals[ArgIdx];
2153 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2154
2155 if (!Flags.isByVal())
2156 continue;
2157
2158 SDValue Dst;
2159 MachinePointerInfo DstInfo;
2160 std::tie(Dst, DstInfo) =
2161 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2162 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2163
2164 if (Copy == NoCopy) {
2165 // If the argument is already at the correct offset on the stack
2166 // (because we are forwarding a byval argument from our caller), we
2167 // don't need any copying.
2168 continue;
2169 } else if (Copy == CopyOnce) {
2170 // If the argument is in our local stack frame, no other argument
2171 // preparation can clobber it, so we can copy it to the final location
2172 // later.
2173 ByValTemporaries[ArgIdx] = Src;
2174 } else {
2175 assert(Copy == CopyViaTemp && "unexpected enum value");
2176 // If we might be copying this argument from the outgoing argument
2177 // stack area, we need to copy via a temporary in the local stack
2178 // frame.
2179 int TempFrameIdx = MFI.CreateStackObject(
2180 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2181 SDValue Temp =
2182 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2183
2184 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2185 SDValue AlignNode =
2186 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2187
2188 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2189 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2190 ByValCopyChains.push_back(
2191 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2192 ByValTemporaries[ArgIdx] = Temp;
2193 }
2194 }
2195 if (!ByValCopyChains.empty())
2196 ByValTempChain =
2197 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2198 }
2199
2200 // During a tail call, stores to the argument area must happen after all of
2201 // the function's incoming arguments have been loaded because they may alias.
2202 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2203 // there's no point in doing so repeatedly so this tracks whether that's
2204 // happened yet.
2205 bool AfterFormalArgLoads = false;
2206
2207 // Walk the register/memloc assignments, inserting copies/loads. In the case
2208 // of tail call optimization, arguments are handled later.
2209 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2210 i != e;
2211 ++i, ++realArgIdx) {
2212 CCValAssign &VA = ArgLocs[i];
2213 SDValue Arg = OutVals[realArgIdx];
2214 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2215 bool isByVal = Flags.isByVal();
2216
2217 // Promote the value if needed.
2218 switch (VA.getLocInfo()) {
2219 default: llvm_unreachable("Unknown loc info!");
2220 case CCValAssign::Full: break;
2221 case CCValAssign::SExt:
2222 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2223 break;
2224 case CCValAssign::ZExt:
2225 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2226 break;
2227 case CCValAssign::AExt:
2228 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2229 break;
2230 case CCValAssign::BCvt:
2231 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2232 break;
2233 }
2234
2235 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2236 Chain = DAG.getStackArgumentTokenFactor(Chain);
2237 if (ByValTempChain) {
2238 // In case of large byval copies, re-using the stackframe for tail-calls
2239 // can lead to overwriting incoming arguments on the stack. Force
2240 // loading these stack arguments before the copy to avoid that.
2241 SmallVector<SDValue, 8> IncomingLoad;
2242 for (unsigned I = 0; I < OutVals.size(); ++I) {
2243 if (Outs[I].Flags.isByVal())
2244 continue;
2245
2246 SDValue OutVal = OutVals[I];
2247 LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
2248 if (!OutLN)
2249 continue;
2250
2251 FrameIndexSDNode *FIN =
2253 if (!FIN)
2254 continue;
2255
2256 if (!MFI.isFixedObjectIndex(FIN->getIndex()))
2257 continue;
2258
2259 for (const CCValAssign &VA : ArgLocs) {
2260 if (VA.isMemLoc())
2261 IncomingLoad.push_back(OutVal.getValue(1));
2262 }
2263 }
2264
2265 // Update the chain to force loads for potentially clobbered argument
2266 // loads to happen before the byval copy.
2267 if (!IncomingLoad.empty()) {
2268 IncomingLoad.push_back(Chain);
2269 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
2270 }
2271
2272 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2273 ByValTempChain);
2274 }
2275 AfterFormalArgLoads = true;
2276 }
2277
2278 // f16 arguments have their size extended to 4 bytes and passed as if they
2279 // had been copied to the LSBs of a 32-bit register.
2280 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2281 if (VA.needsCustom() &&
2282 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2283 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2284 } else {
2285 // f16 arguments could have been extended prior to argument lowering.
2286 // Mask these arguments if this is a CMSE nonsecure call.
2287 auto ArgVT = Outs[realArgIdx].ArgVT;
2288 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2289 auto LocBits = VA.getLocVT().getSizeInBits();
2290 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2291 SDValue Mask =
2292 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2293 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2294 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2295 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2296 }
2297 }
2298
2299 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2300 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2301 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2302 DAG.getConstant(0, dl, MVT::i32));
2303 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2304 DAG.getConstant(1, dl, MVT::i32));
2305
2306 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2307 StackPtr, MemOpChains, isTailCall, SPDiff);
2308
2309 VA = ArgLocs[++i]; // skip ahead to next loc
2310 if (VA.isRegLoc()) {
2311 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2312 StackPtr, MemOpChains, isTailCall, SPDiff);
2313 } else {
2314 assert(VA.isMemLoc());
2315 SDValue DstAddr;
2316 MachinePointerInfo DstInfo;
2317 std::tie(DstAddr, DstInfo) =
2318 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2319 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2320 }
2321 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2322 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2323 StackPtr, MemOpChains, isTailCall, SPDiff);
2324 } else if (VA.isRegLoc()) {
2325 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2326 Outs[0].VT == MVT::i32) {
2327 assert(VA.getLocVT() == MVT::i32 &&
2328 "unexpected calling convention register assignment");
2329 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2330 "unexpected use of 'returned'");
2331 isThisReturn = true;
2332 }
2333 const TargetOptions &Options = DAG.getTarget().Options;
2334 if (Options.EmitCallSiteInfo)
2335 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2336 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2337 } else if (isByVal) {
2338 assert(VA.isMemLoc());
2339 unsigned offset = 0;
2340
2341 // True if this byval aggregate will be split between registers
2342 // and memory.
2343 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2344 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2345
2346 SDValue ByValSrc;
2347 bool NeedsStackCopy;
2348 if (auto It = ByValTemporaries.find(realArgIdx);
2349 It != ByValTemporaries.end()) {
2350 ByValSrc = It->second;
2351 NeedsStackCopy = true;
2352 } else {
2353 ByValSrc = Arg;
2354 NeedsStackCopy = !isTailCall;
2355 }
2356
2357 // If part of the argument is in registers, load them.
2358 if (CurByValIdx < ByValArgsCount) {
2359 unsigned RegBegin, RegEnd;
2360 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2361
2362 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2363 unsigned int i, j;
2364 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2365 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2366 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2367 SDValue Load =
2368 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2369 DAG.InferPtrAlign(AddArg));
2370 MemOpChains.push_back(Load.getValue(1));
2371 RegsToPass.push_back(std::make_pair(j, Load));
2372 }
2373
2374 // If the parameter is larger than the register area, the "offset" value
2375 // lets us compute the stack slot for the remaining part correctly.
2376 offset = RegEnd - RegBegin;
2377
2378 CCInfo.nextInRegsParam();
2379 }
2380
2381 // If the memory part of the argument isn't already in the correct place
2382 // (which can happen with tail calls), copy it into the argument area.
2383 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2384 auto PtrVT = getPointerTy(DAG.getDataLayout());
2385 SDValue Dst;
2386 MachinePointerInfo DstInfo;
2387 std::tie(Dst, DstInfo) =
2388 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2389 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2390 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2391 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2392 MVT::i32);
2393 SDValue AlignNode =
2394 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2395
2396 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2397 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2398 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2399 Ops));
2400 }
2401 } else {
2402 assert(VA.isMemLoc());
2403 SDValue DstAddr;
2404 MachinePointerInfo DstInfo;
2405 std::tie(DstAddr, DstInfo) =
2406 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2407
2408 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2409 MemOpChains.push_back(Store);
2410 }
2411 }
2412
2413 if (!MemOpChains.empty())
2414 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2415
2416 // Build a sequence of copy-to-reg nodes chained together with token chain
2417 // and flag operands which copy the outgoing args into the appropriate regs.
2418 SDValue InGlue;
2419 for (const auto &[Reg, N] : RegsToPass) {
2420 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2421 InGlue = Chain.getValue(1);
2422 }
2423
2424 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2425 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2426 // node so that legalize doesn't hack it.
2427 bool isDirect = false;
2428
2429 const TargetMachine &TM = getTargetMachine();
2430 const Triple &TT = TM.getTargetTriple();
2431 const GlobalValue *GVal = nullptr;
2432 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2433 GVal = G->getGlobal();
2434 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && TT.isOSBinFormatMachO();
2435
2436 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2437 bool isLocalARMFunc = false;
2438 auto PtrVt = getPointerTy(DAG.getDataLayout());
2439
2440 if (Subtarget->genLongCalls()) {
2441 assert((!isPositionIndependent() || TT.isOSWindows()) &&
2442 "long-calls codegen is not position independent!");
2443 // Handle a global address or an external symbol. If it's not one of
2444 // those, the target's already in a register, so we don't need to do
2445 // anything extra.
2446 if (isa<GlobalAddressSDNode>(Callee)) {
2447 if (Subtarget->genExecuteOnly()) {
2448 if (Subtarget->useMovt())
2449 ++NumMovwMovt;
2450 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2451 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2452 } else {
2453 // Create a constant pool entry for the callee address
2454 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2455 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2456 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2457
2458 // Get the address of the callee into a register
2459 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2460 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2461 Callee = DAG.getLoad(
2462 PtrVt, dl, DAG.getEntryNode(), Addr,
2464 }
2465 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2466 const char *Sym = S->getSymbol();
2467
2468 if (Subtarget->genExecuteOnly()) {
2469 if (Subtarget->useMovt())
2470 ++NumMovwMovt;
2471 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2472 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2473 } else {
2474 // Create a constant pool entry for the callee address
2475 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2476 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2477 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2478
2479 // Get the address of the callee into a register
2480 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2481 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2482 Callee = DAG.getLoad(
2483 PtrVt, dl, DAG.getEntryNode(), Addr,
2485 }
2486 }
2487 } else if (isa<GlobalAddressSDNode>(Callee)) {
2488 if (!PreferIndirect) {
2489 isDirect = true;
2490 bool isDef = GVal->isStrongDefinitionForLinker();
2491
2492 // ARM call to a local ARM function is predicable.
2493 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2494 // tBX takes a register source operand.
2495 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2496 assert(TT.isOSBinFormatMachO() && "WrapperPIC use on non-MachO?");
2497 Callee = DAG.getNode(
2498 ARMISD::WrapperPIC, dl, PtrVt,
2499 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2500 Callee = DAG.getLoad(
2501 PtrVt, dl, DAG.getEntryNode(), Callee,
2505 } else if (Subtarget->isTargetCOFF()) {
2506 assert(Subtarget->isTargetWindows() &&
2507 "Windows is the only supported COFF target");
2508 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2509 if (GVal->hasDLLImportStorageClass())
2510 TargetFlags = ARMII::MO_DLLIMPORT;
2511 else if (!TM.shouldAssumeDSOLocal(GVal))
2512 TargetFlags = ARMII::MO_COFFSTUB;
2513 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2514 TargetFlags);
2515 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2516 Callee =
2517 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2518 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2520 } else {
2521 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2522 }
2523 }
2524 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2525 isDirect = true;
2526 // tBX takes a register source operand.
2527 const char *Sym = S->getSymbol();
2528 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2529 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2530 ARMConstantPoolValue *CPV =
2532 ARMPCLabelIndex, 4);
2533 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2534 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2535 Callee = DAG.getLoad(
2536 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2538 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2539 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2540 } else {
2541 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2542 }
2543 }
2544
2545 if (isCmseNSCall) {
2546 assert(!isARMFunc && !isDirect &&
2547 "Cannot handle call to ARM function or direct call");
2548 if (NumBytes > 0) {
2549 DAG.getContext()->diagnose(
2550 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2551 "call to non-secure function would require "
2552 "passing arguments on stack",
2553 dl.getDebugLoc()));
2554 }
2555 if (isStructRet) {
2556 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2558 "call to non-secure function would return value through pointer",
2559 dl.getDebugLoc()));
2560 }
2561 }
2562
2563 // FIXME: handle tail calls differently.
2564 unsigned CallOpc;
2565 if (Subtarget->isThumb()) {
2566 if (GuardWithBTI)
2567 CallOpc = ARMISD::t2CALL_BTI;
2568 else if (isCmseNSCall)
2569 CallOpc = ARMISD::tSECALL;
2570 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2571 CallOpc = ARMISD::CALL_NOLINK;
2572 else
2573 CallOpc = ARMISD::CALL;
2574 } else {
2575 if (!isDirect && !Subtarget->hasV5TOps())
2576 CallOpc = ARMISD::CALL_NOLINK;
2577 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2578 // Emit regular call when code size is the priority
2579 !Subtarget->hasMinSize())
2580 // "mov lr, pc; b _foo" to avoid confusing the RSP
2581 CallOpc = ARMISD::CALL_NOLINK;
2582 else
2583 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2584 }
2585
2586 // We don't usually want to end the call-sequence here because we would tidy
2587 // the frame up *after* the call, however in the ABI-changing tail-call case
2588 // we've carefully laid out the parameters so that when sp is reset they'll be
2589 // in the correct location.
2590 if (isTailCall && !isSibCall) {
2591 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2592 InGlue = Chain.getValue(1);
2593 }
2594
2595 std::vector<SDValue> Ops;
2596 Ops.push_back(Chain);
2597 Ops.push_back(Callee);
2598
2599 if (isTailCall) {
2600 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2601 }
2602
2603 // Add argument registers to the end of the list so that they are known live
2604 // into the call.
2605 for (const auto &[Reg, N] : RegsToPass)
2606 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2607
2608 // Add a register mask operand representing the call-preserved registers.
2609 const uint32_t *Mask;
2610 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2611 if (isThisReturn) {
2612 // For 'this' returns, use the R0-preserving mask if applicable
2613 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2614 if (!Mask) {
2615 // Set isThisReturn to false if the calling convention is not one that
2616 // allows 'returned' to be modeled in this way, so LowerCallResult does
2617 // not try to pass 'this' straight through
2618 isThisReturn = false;
2619 Mask = ARI->getCallPreservedMask(MF, CallConv);
2620 }
2621 } else
2622 Mask = ARI->getCallPreservedMask(MF, CallConv);
2623
2624 assert(Mask && "Missing call preserved mask for calling convention");
2625 Ops.push_back(DAG.getRegisterMask(Mask));
2626
2627 if (InGlue.getNode())
2628 Ops.push_back(InGlue);
2629
2630 if (isTailCall) {
2632 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2633 if (CLI.CFIType)
2634 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2635 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2636 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2637 return Ret;
2638 }
2639
2640 // Returns a chain and a flag for retval copy to use.
2641 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2642 if (CLI.CFIType)
2643 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2644 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2645 InGlue = Chain.getValue(1);
2646 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2647
2648 // If we're guaranteeing tail-calls will be honoured, the callee must
2649 // pop its own argument stack on return. But this call is *not* a tail call so
2650 // we need to undo that after it returns to restore the status-quo.
2651 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2652 uint64_t CalleePopBytes =
2653 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2654
2655 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2656 if (!Ins.empty())
2657 InGlue = Chain.getValue(1);
2658
2659 // Handle result values, copying them out of physregs into vregs that we
2660 // return.
2661 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2662 InVals, isThisReturn,
2663 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2664}
2665
2666/// HandleByVal - Every parameter *after* a byval parameter is passed
2667/// on the stack. Remember the next parameter register to allocate,
2668/// and then confiscate the rest of the parameter registers to insure
2669/// this.
2670void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2671 Align Alignment) const {
2672 // Byval (as with any stack) slots are always at least 4 byte aligned.
2673 Alignment = std::max(Alignment, Align(4));
2674
2675 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2676 if (!Reg)
2677 return;
2678
2679 unsigned AlignInRegs = Alignment.value() / 4;
2680 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2681 for (unsigned i = 0; i < Waste; ++i)
2682 Reg = State->AllocateReg(GPRArgRegs);
2683
2684 if (!Reg)
2685 return;
2686
2687 unsigned Excess = 4 * (ARM::R4 - Reg);
2688
2689 // Special case when NSAA != SP and parameter size greater than size of
2690 // all remained GPR regs. In that case we can't split parameter, we must
2691 // send it to stack. We also must set NCRN to R4, so waste all
2692 // remained registers.
2693 const unsigned NSAAOffset = State->getStackSize();
2694 if (NSAAOffset != 0 && Size > Excess) {
2695 while (State->AllocateReg(GPRArgRegs))
2696 ;
2697 return;
2698 }
2699
2700 // First register for byval parameter is the first register that wasn't
2701 // allocated before this method call, so it would be "reg".
2702 // If parameter is small enough to be saved in range [reg, r4), then
2703 // the end (first after last) register would be reg + param-size-in-regs,
2704 // else parameter would be splitted between registers and stack,
2705 // end register would be r4 in this case.
2706 unsigned ByValRegBegin = Reg;
2707 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2708 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2709 // Note, first register is allocated in the beginning of function already,
2710 // allocate remained amount of registers we need.
2711 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2712 State->AllocateReg(GPRArgRegs);
2713 // A byval parameter that is split between registers and memory needs its
2714 // size truncated here.
2715 // In the case where the entire structure fits in registers, we set the
2716 // size in memory to zero.
2717 Size = std::max<int>(Size - Excess, 0);
2718}
2719
2720 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2721 /// for tail call optimization. Targets which want to do tail call
2722 /// optimization should implement this function. Note that this function also
2723 /// processes musttail calls, so when this function returns false on a valid
2724 /// musttail call, a fatal backend error occurs.
///
/// The checks run in the order written below; the first failing one returns
/// false:
///  - a callee that is not a GlobalAddress node (or any callee when
///    \p isIndirect is set) needs a register out of r0-r3, plus r12 when
///    permitted, that no register argument location occupies;
///  - the caller must not carry the "interrupt" function attribute;
///  - for a calling convention accepted by canGuaranteeTCO the answer is
///    simply whether caller and callee conventions are equal;
///  - caller and callee must agree on struct-return usage;
///  - an external-weak callee is rejected for the triples tested below;
///  - the results must be compatible and, when the conventions differ, the
///    callee must preserve every register the caller's convention preserves;
///  - a variadic call is rejected when the caller has an argument register
///    save area;
///  - parametersInCSRMatch must hold for the outgoing arguments;
///  - the stack size recorded in CCInfo must not exceed the caller's own
///    argument stack size.
///
/// \param ArgLocs    locations assigned to the outgoing arguments.
/// \param isIndirect forces the address-register check even for a
///                   GlobalAddress callee.
/// \returns true if the call may be lowered as a tail call.
///
/// NOTE(review): this listing jumps from line 2725 to 2727 and from 2812 to
/// 2814. The declarations of the CLI and CCInfo parameters used below and the
/// opening of the result-compatibility condition are therefore not visible
/// here; confirm against the original file.
2725 bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2727 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2728 CallingConv::ID CalleeCC = CLI.CallConv;
2729 SDValue Callee = CLI.Callee;
2730 bool isVarArg = CLI.IsVarArg;
2731 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2732 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2733 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2734 const SelectionDAG &DAG = CLI.DAG;
2735 MachineFunction &MF = DAG.getMachineFunction();
2736 const Function &CallerF = MF.getFunction();
2737 CallingConv::ID CallerCC = CallerF.getCallingConv();
2738
// This function must only be reached when the subtarget supports tail calls.
2739 assert(Subtarget->supportsTailCall());
2740
2741 // Indirect tail-calls require a register to hold the target address. That
2742 // register must be:
2743 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2744 // * Not callee-saved, so must be one of r0-r3 or r12.
2745 // * Not used to hold an argument to the tail-called function, which might be
2746 // in r0-r3.
2747 // * Not used to hold the return address authentication code, which is in r12
2748 // if enabled.
2749 // Sometimes, no register matches all of these conditions, so we can't do a
2750 // tail-call.
2751 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2752 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2753 ARM::R3};
// r12 is added as a candidate only when the target is not Thumb1-only and
// shouldSignReturnAddress(true) is false.
2754 if (!(Subtarget->isThumb1Only() ||
2755 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2756 AddressRegisters.insert(ARM::R12);
// Remove every candidate that already carries a register argument.
2757 for (const CCValAssign &AL : ArgLocs)
2758 if (AL.isRegLoc())
2759 AddressRegisters.erase(AL.getLocReg());
2760 if (AddressRegisters.empty()) {
2761 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2762 return false;
2763 }
2764 }
2765
2766 // Look for obvious safe cases to perform tail call optimization that do not
2767 // require ABI changes. This is what gcc calls sibcall.
2768
2769 // Exception-handling functions need a special set of instructions to indicate
2770 // a return to the hardware. Tail-calling another function would probably
2771 // break this.
2772 if (CallerF.hasFnAttribute("interrupt")) {
2773 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2774 return false;
2775 }
2776
// With a guaranteed-TCO calling convention the decision depends only on the
// caller and callee conventions being equal; none of the later checks run.
2777 if (canGuaranteeTCO(CalleeCC,
2778 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2779 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2780 << " (guaranteed tail-call CC)\n");
2781 return CalleeCC == CallerCC;
2782 }
2783
2784 // Also avoid sibcall optimization if either caller or callee uses struct
2785 // return semantics.
// Only a mismatch is rejected: the call stays eligible when both sides use
// struct return or neither does.
2786 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2787 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2788 if (isCalleeStructRet != isCallerStructRet) {
2789 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
2790 return false;
2791 }
2792
2793 // Externally-defined functions with weak linkage should not be
2794 // tail-called on ARM when the OS does not support dynamic
2795 // pre-emption of symbols, as the AAELF spec requires normal calls
2796 // to undefined weak functions to be replaced with a NOP or jump to the
2797 // next instruction. The behaviour of branch instructions in this
2798 // situation (as used for tail calls) is implementation-defined, so we
2799 // cannot rely on the linker replacing the tail call with a return.
// As written, an external-weak callee is only accepted when the triple is
// Windows with an object format that is neither ELF nor MachO.
2800 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2801 const GlobalValue *GV = G->getGlobal();
2802 const Triple &TT = getTargetMachine().getTargetTriple();
2803 if (GV->hasExternalWeakLinkage() &&
2804 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
2805 TT.isOSBinFormatMachO())) {
2806 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
2807 return false;
2808 }
2809 }
2810
2811 // Check that the call results are passed in the same way.
2812 LLVMContext &C = *DAG.getContext();
// NOTE(review): listing line 2813, which opens the condition whose arguments
// follow, is missing here.
2814 getEffectiveCallingConv(CalleeCC, isVarArg),
2815 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2816 CCAssignFnForReturn(CalleeCC, isVarArg),
2817 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
2818 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
2819 return false;
2820 }
2821 // The callee has to preserve all registers the caller needs to preserve.
2822 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2823 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
// The masks are only compared when the two conventions differ.
2824 if (CalleeCC != CallerCC) {
2825 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2826 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
2827 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
2828 return false;
2829 }
2830 }
2831
2832 // If Caller's vararg argument has been split between registers and stack, do
2833 // not perform tail call, since part of the argument is in caller's local
2834 // frame.
2835 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2836 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
2837 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
2838 return false;
2839 }
2840
2841 // If the callee takes no arguments then go on to check the results of the
2842 // call.
// NOTE(review): the comment above does not describe the statement below. The
// code rejects the tail call when parametersInCSRMatch() fails for
// ArgLocs/OutVals against the caller's preserved-register mask.
2843 const MachineRegisterInfo &MRI = MF.getRegInfo();
2844 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
2845 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
2846 return false;
2847 }
2848
2849 // If the stack arguments for this call do not fit into our own save area then
2850 // the call cannot be made tail.
// NOTE(review): unlike the other rejections, this one prints no LLVM_DEBUG
// message.
2851 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
2852 return false;
2853
2854 LLVM_DEBUG(dbgs() << "true\n");
2855 return true;
2856 }
2857
// Report whether the return values described by Outs can be lowered for the
// given calling convention: builds a CCState over RVLocs and returns the
// result of CCState::CheckReturn with the return assignment function for
// (CallConv, isVarArg). RetTy is not referenced in the visible lines.
//
// NOTE(review): this listing jumps from line 2860 to 2862 and from 2862 to
// 2864, so the declarations of Outs and RVLocs used below are not visible
// here; confirm against the original file.
2858 bool
2859 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2860 MachineFunction &MF, bool isVarArg,
2862 LLVMContext &Context, const Type *RetTy) const {
2864 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2865 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2866 }
2867
// Build the return node for a function carrying the "interrupt" attribute:
// picks an LR offset from the attribute's string value, inserts it as a
// constant at index 1 of RetOps, and returns an ARMISD::INTRET_GLUE node built
// from RetOps.
//
// NOTE(review): the first line of the signature (listing line 2868) is missing
// here. The function name and the RetOps parameter are inferred from the call
// in LowerReturn and from the uses below; confirm against the original file.
2869 const SDLoc &DL, SelectionDAG &DAG) {
2870 const MachineFunction &MF = DAG.getMachineFunction();
2871 const Function &F = MF.getFunction();
2872
// The value of the "interrupt" attribute selects the exception kind.
2873 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2874
2875 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2876 // version of the "preferred return address". These offsets affect the return
2877 // instruction if this is a return from PL1 without hypervisor extensions.
2878 // IRQ/FIQ: +4 "subs pc, lr, #4"
2879 // SWI: 0 "subs pc, lr, #0"
2880 // ABORT: +4 "subs pc, lr, #4"
2881 // UNDEF: +4/+2 "subs pc, lr, #0"
2882 // UNDEF varies depending on where the exception came from ARM or Thumb
2883 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2884
// An empty attribute value gets the same offset as IRQ, FIQ and ABORT. Any
// value not listed here is a fatal error.
2885 int64_t LROffset;
2886 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2887 IntKind == "ABORT")
2888 LROffset = 4;
2889 else if (IntKind == "SWI" || IntKind == "UNDEF")
2890 LROffset = 0;
2891 else
2892 report_fatal_error("Unsupported interrupt attribute. If present, value "
2893 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2894
// The offset becomes operand #1, directly after the first entry of RetOps.
2895 RetOps.insert(RetOps.begin() + 1,
2896 DAG.getConstant(LROffset, DL, MVT::i32, false));
2897
2898 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
2899 }
2900
// Lower a function return: analyze the return values with the return calling
// convention, copy each value into its assigned register (chained and glued
// together), and build the return node from the collected operands.
//
// The node returned is the result of LowerInterruptReturn for functions with
// the "interrupt" attribute on non-M-class subtargets, ARMISD::SERET_GLUE for
// CMSE non-secure entry functions, and ARMISD::RET_GLUE otherwise.
//
// NOTE(review): this listing skips lines 2904, 2908, 2918, 2931 and 3048. The
// declarations of Outs, RVLocs and RetOps, one argument of the diagnostic
// constructor, and the body of the DPR branch near the end are therefore not
// visible here; confirm against the original file.
2901 SDValue
2902 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2903 bool isVarArg,
2905 const SmallVectorImpl<SDValue> &OutVals,
2906 const SDLoc &dl, SelectionDAG &DAG) const {
2907 // CCValAssign - represent the assignment of the return value to a location.
2909
2910 // CCState - Info about the registers and stack slots.
2911 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2912 *DAG.getContext());
2913
2914 // Analyze outgoing return values.
2915 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2916
2917 SDValue Glue;
2919 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2920 bool isLittleEndian = Subtarget->isLittle();
2921
2922 MachineFunction &MF = DAG.getMachineFunction();
2923 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Record the number of return locations in the function info.
2924 AFI->setReturnRegsCount(RVLocs.size());
2925
2926 // Report error if cmse entry function returns structure through first ptr arg.
2927 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
2928 // Note: using an empty SDLoc(), as the first line of the function is a
2929 // better place to report than the last line.
2930 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2932 "secure entry function would return value through pointer",
2933 SDLoc().getDebugLoc()));
2934 }
2935
2936 // Copy the result values into the output registers.
// i indexes RVLocs while realRVLocIdx indexes OutVals/Outs. A value split
// over several locations (f64, v2f64) advances i extra times inside the loop
// body, but realRVLocIdx advances only once per iteration.
2937 for (unsigned i = 0, realRVLocIdx = 0;
2938 i != RVLocs.size();
2939 ++i, ++realRVLocIdx) {
2940 CCValAssign &VA = RVLocs[i];
2941 assert(VA.isRegLoc() && "Can only return in registers!");
2942
2943 SDValue Arg = OutVals[realRVLocIdx];
2944 bool ReturnF16 = false;
2945
2946 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
2947 // Half-precision return values can be returned like this:
2948 //
2949 // t11 f16 = fadd ...
2950 // t12: i16 = bitcast t11
2951 // t13: i32 = zero_extend t12
2952 // t14: f32 = bitcast t13 <~~~~~~~ Arg
2953 //
2954 // to avoid code generation for bitcasts, we simply set Arg to the node
2955 // that produces the f16 value, t11 in this case.
2956 //
2957 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
2958 SDValue ZE = Arg.getOperand(0);
2959 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
2960 SDValue BC = ZE.getOperand(0);
2961 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
2962 Arg = BC.getOperand(0);
2963 ReturnF16 = true;
2964 }
2965 }
2966 }
2967 }
2968
// Only Full and BCvt location kinds are handled. The bitcast is skipped when
// the f16 shortcut above already replaced Arg.
2969 switch (VA.getLocInfo()) {
2970 default: llvm_unreachable("Unknown loc info!");
2971 case CCValAssign::Full: break;
2972 case CCValAssign::BCvt:
2973 if (!ReturnF16)
2974 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2975 break;
2976 }
2977
2978 // Mask f16 arguments if this is a CMSE nonsecure entry.
// Without a custom f16 location, the value is bitcast to an integer of the
// location width, ANDed with a mask of the low RetVT-size bits, and bitcast
// back to the location type.
2979 auto RetVT = Outs[realRVLocIdx].ArgVT;
2980 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
2981 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
2982 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2983 } else {
2984 auto LocBits = VA.getLocVT().getSizeInBits();
2985 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
2986 SDValue Mask =
2987 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2988 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2989 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2990 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2991 }
2992 }
2993
2994 if (VA.needsCustom() &&
2995 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
2996 if (VA.getLocVT() == MVT::v2f64) {
2997 // Extract the first half and return it in two registers.
2998 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2999 DAG.getConstant(0, dl, MVT::i32));
3000 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3001 DAG.getVTList(MVT::i32, MVT::i32), Half);
3002
// Endianness selects which VMOVRRD result goes into the first register.
3003 Chain =
3004 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3005 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3006 Glue = Chain.getValue(1);
3007 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3008 VA = RVLocs[++i]; // skip ahead to next loc
3009 Chain =
3010 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3011 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3012 Glue = Chain.getValue(1);
3013 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3014 VA = RVLocs[++i]; // skip ahead to next loc
3015
3016 // Extract the 2nd half and fall through to handle it as an f64 value.
3017 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3018 DAG.getConstant(1, dl, MVT::i32));
3019 }
3020 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3021 // available.
3022 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3023 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3024 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3025 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3026 Glue = Chain.getValue(1);
3027 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3028 VA = RVLocs[++i]; // skip ahead to next loc
3029 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3030 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3031 } else
3032 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3033
3034 // Guarantee that all emitted copies are
3035 // stuck together, avoiding something bad.
// The register operand uses Arg's own type when the f16 shortcut was taken,
// and the location type otherwise.
3036 Glue = Chain.getValue(1);
3037 RetOps.push_back(DAG.getRegister(
3038 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3039 }
// Append the registers reported by getCalleeSavedRegsViaCopy, if any, as
// further return operands. The list is walked until a zero entry.
3040 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3041 const MCPhysReg *I =
3042 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3043 if (I) {
3044 for (; *I; ++I) {
3045 if (ARM::GPRRegClass.contains(*I))
3046 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3047 else if (ARM::DPRRegClass.contains(*I))
3049 else
3050 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3051 }
3052 }
3053
3054 // Update chain and glue.
3055 RetOps[0] = Chain;
3056 if (Glue.getNode())
3057 RetOps.push_back(Glue);
3058
3059 // CPUs which aren't M-class use a special sequence to return from
3060 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3061 // though we use "subs pc, lr, #N").
3062 //
3063 // M-class CPUs actually use a normal return sequence with a special
3064 // (hardware-provided) value in LR, so the normal code path works.
3065 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3066 !Subtarget->isMClass()) {
3067 if (Subtarget->isThumb1Only())
3068 report_fatal_error("interrupt attribute is not supported in Thumb1");
3069 return LowerInterruptReturn(RetOps, dl, DAG);
3070 }
3071
// CMSE non-secure entry functions use a dedicated return node.
3072 unsigned RetNode =
3073 AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
3074 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3075 }
3076
3077bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3078 if (N->getNumValues() != 1)
3079 return false;
3080 if (!N->hasNUsesOfValue(1, 0))
3081 return false;
3082
3083 SDValue TCChain = Chain;
3084 SDNode *Copy = *N->user_begin();
3085 if (Copy->getOpcode() == ISD::CopyToReg) {
3086 // If the copy has a glue operand, we conservatively assume it isn't safe to
3087 // perform a tail call.
3088 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3089 return false;
3090 TCChain = Copy->getOperand(0);
3091 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3092 SDNode *VMov = Copy;
3093 // f64 returned in a pair of GPRs.
3094 SmallPtrSet<SDNode*, 2> Copies;
3095 for (SDNode *U : VMov->users()) {
3096 if (U->getOpcode() != ISD::CopyToReg)
3097 return false;
3098 Copies.insert(U);
3099 }
3100 if (Copies.size() > 2)
3101 return false;
3102
3103 for (SDNode *U : VMov->users()) {
3104 SDValue UseChain = U->getOperand(0);
3105 if (Copies.count(UseChain.getNode()))
3106 // Second CopyToReg
3107 Copy = U;
3108 else {
3109 // We are at the top of this chain.
3110 // If the copy has a glue operand, we conservatively assume it
3111 // isn't safe to perform a tail call.
3112 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3113 return false;
3114 // First CopyToReg
3115 TCChain = UseChain;
3116 }
3117 }
3118 } else if (Copy->getOpcode() == ISD::BITCAST) {
3119 // f32 returned in a single GPR.
3120 if (!Copy->hasOneUse())
3121 return false;
3122 Copy = *Copy->user_begin();
3123 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3124 return false;
3125 // If the copy has a glue operand, we conservatively assume it isn't safe to
3126 // perform a tail call.
3127 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3128 return false;
3129 TCChain = Copy->getOperand(0);
3130 } else {
3131 return false;
3132 }
3133
3134 bool HasRet = false;
3135 for (const SDNode *U : Copy->users()) {
3136 if (U->getOpcode() != ARMISD::RET_GLUE &&
3137 U->getOpcode() != ARMISD::INTRET_GLUE)
3138 return false;
3139 HasRet = true;
3140 }
3141
3142 if (!HasRet)
3143 return false;
3144
3145 Chain = TCChain;
3146 return true;
3147}
3148
3149bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3150 if (!Subtarget->supportsTailCall())
3151 return false;
3152
3153 if (!CI->isTailCall())
3154 return false;
3155
3156 return true;
3157}
3158
3159 // Trying to write a 64 bit value so need to split into two 32 bit values first,
3160 // and pass the lower and high parts through.
// Operand 2 of Op is the i64 value being written. Operands 0 and 1 are
// forwarded unchanged, followed by the two i32 halves, into a new
// ISD::WRITE_REGISTER node with result type MVT::Other.
// NOTE(review): the signature (listing line 3161) is missing here; the
// function name is taken from the assertion message below.
3162 SDLoc DL(Op);
3163 SDValue WriteValue = Op->getOperand(2);
3164
3165 // This function is only supposed to be called for i64 type argument.
3166 assert(WriteValue.getValueType() == MVT::i64
3167 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3168
3169 SDValue Lo, Hi;
3170 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3171 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3172 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3173 }
3174
3175 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3176 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3177 // one of the above mentioned nodes. It has to be wrapped because otherwise
3178 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3179 // be used to form addressing mode. These wrapped nodes will be selected
3180 // into MOVi.
//
// LowerConstantPool has two paths. With execute-only code generation the
// constant becomes a new internal, constant GlobalVariable in the function's
// module and is lowered through LowerGlobalAddress. Otherwise a target
// constant-pool node is created (with at least 4-byte alignment on
// Thumb1-only subtargets) and wrapped in ARMISD::Wrapper.
3181 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3182 SelectionDAG &DAG) const {
3183 EVT PtrVT = Op.getValueType();
3184 // FIXME there is no actual debug info here
3185 SDLoc dl(Op);
3186 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3187 SDValue Res;
3188
3189 // When generating execute-only code Constant Pools must be promoted to the
3190 // global data section. It's a bit ugly that we can't share them across basic
3191 // blocks, but this way we guarantee that execute-only behaves correct with
3192 // position-independent addressing modes.
3193 if (Subtarget->genExecuteOnly()) {
3194 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3195 auto *T = CP->getType();
3196 auto C = const_cast<Constant*>(CP->getConstVal());
3197 auto M = DAG.getMachineFunction().getFunction().getParent();
// The variable name is built from the internal symbol prefix, "CP", the
// function number, "_" and a fresh PIC label id.
3198 auto GV = new GlobalVariable(
3199 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3200 Twine(DAG.getDataLayout().getInternalSymbolPrefix()) + "CP" +
3201 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3202 Twine(AFI->createPICLabelUId()));
3203 SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3204 return LowerGlobalAddress(GA, DAG);
3205 }
3206
3207 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3208 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3209 Align CPAlign = CP->getAlign();
3210 if (Subtarget->isThumb1Only())
3211 CPAlign = std::max(CPAlign, Align(4));
// NOTE(review): listing line 3212, the condition paired with the `else`
// below, is missing here. The first branch uses getMachineCPVal() and the
// second getConstVal(); confirm the condition against the original file.
3213 Res =
3214 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3215 else
3216 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3217 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3218 }
3219
// NOTE(review): only a fragment of this function is visible. The signature
// (listing line 3220) and the statements on lines 3225-3226 are missing, so
// the function name and both results are not shown here. Judging by the
// comment below, this presumably selects how a jump table is lowered; confirm
// against the original file.
3221 // If we don't have a 32-bit pc-relative branch instruction then the jump
3222 // table consists of block addresses. Usually this is inline, but for
3223 // execute-only it must be placed out-of-line.
// The special case applies to execute-only code on subtargets without
// v8-M Baseline operations.
3224 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3227 }
3228
// Lower a BlockAddress node to a load from a constant-pool entry.
//
// When the code is neither position independent nor ROPI, the block address
// itself is placed in the constant pool and the loaded value is returned.
// Otherwise the entry is an ARMConstantPoolConstant carrying a fresh PIC label
// and a PC adjustment, and the loaded value is combined with that label
// through ARMISD::PIC_ADD.
//
// NOTE(review): this listing skips lines 3231, 3232, 3244 and 3252, so the
// declarations of AFI and CPV and the final argument of the getLoad call are
// not visible here; confirm against the original file.
3229 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3230 SelectionDAG &DAG) const {
3233 unsigned ARMPCLabelIndex = 0;
3234 SDLoc DL(Op);
3235 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3236 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3237 SDValue CPAddr;
3238 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3239 if (!IsPositionIndependent) {
3240 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3241 } else {
// The PC adjustment is 4 in Thumb mode and 8 in ARM mode.
3242 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3243 ARMPCLabelIndex = AFI->createPICLabelUId();
3245 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3246 ARMCP::CPBlockAddress, PCAdj);
3247 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3248 }
3249 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3250 SDValue Result = DAG.getLoad(
3251 PtrVT, DL, DAG.getEntryNode(), CPAddr,
// Without position independence the loaded value is already the final result.
3253 if (!IsPositionIndependent)
3254 return Result;
3255 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3256 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3257 }
3258
3259/// Convert a TLS address reference into the correct sequence of loads
3260/// and calls to compute the variable's address for Darwin, and return an
3261/// SDValue containing the final node.
3262
3263/// Darwin only has one TLS scheme which must be capable of dealing with the
3264/// fully general situation, in the worst case. This means:
3265/// + "extern __thread" declaration.
3266/// + Defined in a possibly unknown dynamic library.
3267///
3268/// The general system is that each __thread variable has a [3 x i32] descriptor
3269/// which contains information used by the runtime to calculate the address. The
3270/// only part of this the compiler needs to know about is the first word, which
3271/// contains a function pointer that must be called with the address of the
3272/// entire descriptor in "r0".
3273///
3274/// Since this descriptor may be in a different unit, in general access must
3275/// proceed along the usual ARM rules. A common sequence to produce is:
3276///
3277/// movw rT1, :lower16:_var$non_lazy_ptr
3278/// movt rT1, :upper16:_var$non_lazy_ptr
3279/// ldr r0, [rT1]
3280/// ldr rT2, [r0]
3281/// blx rT2
3282/// [...address now in r0...]
3283SDValue
3284ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3285 SelectionDAG &DAG) const {
3286 assert(getTargetMachine().getTargetTriple().isOSDarwin() &&
3287 "This function expects a Darwin target");
3288 SDLoc DL(Op);
3289
3290 // First step is to get the address of the actual global symbol. This is where
3291 // the TLS descriptor lives.
3292 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3293
3294 // The first entry in the descriptor is a function pointer that we must call
3295 // to obtain the address of the variable.
     // NOTE(review): original lines 3299-3301 (the remaining getLoad arguments)
     // are not present in this listing.
3296 SDValue Chain = DAG.getEntryNode();
3297 SDValue FuncTLVGet = DAG.getLoad(
3298 MVT::i32, DL, Chain, DescAddr,
3302 Chain = FuncTLVGet.getValue(1);
3303
     // Record on the frame info that this function adjusts the stack;
     // presumably needed because the call below is built by hand rather than
     // through the generic call lowering — confirm.
3304 MachineFunction &F = DAG.getMachineFunction();
3305 MachineFrameInfo &MFI = F.getFrameInfo();
3306 MFI.setAdjustsStack(true);
3307
3308 // TLS calls preserve all registers except those that absolutely must be
3309 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3310 // silly).
3311 auto TRI =
3313 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3314 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3315
3316 // Finally, we can make the call. This is just a degenerate version of a
3317 // normal ARM call node: r0 takes the address of the descriptor, and
3318 // returns the address of the variable in this thread.
3319 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3320 Chain =
3321 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3322 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3323 DAG.getRegisterMask(Mask), Chain.getValue(1));
3324 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3325}
3326
// Lower a TLS global address for Windows. The emitted sequence is:
//   1. read the TEB through the arm_mrc intrinsic (operands 15, 0, 13, 0, 2),
//   2. load the TLS array pointer stored at TEB + 0x2c,
//   3. load _tls_index and scale it by 4 to index that array,
//   4. load the SECREL offset of the variable from the constant pool,
//   5. return (TLS data pointer + offset).
// NOTE(review): original line 3379 (the final getLoad argument) is missing
// from this listing.
3327SDValue
3328ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3329 SelectionDAG &DAG) const {
3330 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
3331 "Windows specific TLS lowering");
3332
3333 SDValue Chain = DAG.getEntryNode();
3334 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3335 SDLoc DL(Op);
3336
3337 // Load the current TEB (thread environment block)
3338 SDValue Ops[] = {Chain,
3339 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3340 DAG.getTargetConstant(15, DL, MVT::i32),
3341 DAG.getTargetConstant(0, DL, MVT::i32),
3342 DAG.getTargetConstant(13, DL, MVT::i32),
3343 DAG.getTargetConstant(0, DL, MVT::i32),
3344 DAG.getTargetConstant(2, DL, MVT::i32)};
3345 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3346 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3347
     // Result 0 is the TEB value, result 1 the updated chain.
3348 SDValue TEB = CurrentTEB.getValue(0);
3349 Chain = CurrentTEB.getValue(1);
3350
3351 // Load the ThreadLocalStoragePointer from the TEB
3352 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3353 SDValue TLSArray =
3354 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3355 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3356
3357 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3358 // offset into the TLSArray.
3359
3360 // Load the TLS index from the C runtime
3361 SDValue TLSIndex =
3362 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3363 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3364 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3365
     // Slot = TLSIndex << 2, i.e. the index scaled by 4.
3366 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3367 DAG.getConstant(2, DL, MVT::i32));
3368 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3369 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3370 MachinePointerInfo());
3371
3372 // Get the offset of the start of the .tls section (section base)
3373 const auto *GA = cast<GlobalAddressSDNode>(Op);
3374 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3375 SDValue Offset = DAG.getLoad(
3376 PtrVT, DL, Chain,
3377 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3378 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3380
3381 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3382}
3383
3384// Lower ISD::GlobalTLSAddress using the "general dynamic" model
// The argument for __tls_get_addr is built from a TLSGD constant-pool entry
// for the global: the entry is loaded, adjusted with PIC_ADD against a fresh
// PIC label, and passed to the call. The call's first result is returned.
// NOTE(review): original lines 3401, 3408 and 3414 are missing from this
// listing (the tail of the getLoad call, presumably the declaration of Args,
// and the first setLibCallee arguments) — confirm against the full source.
3385SDValue
3386ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3387 SelectionDAG &DAG) const {
3388 SDLoc dl(GA);
3389 EVT PtrVT = getPointerTy(DAG.getDataLayout());
     // PC adjustment is 4 in Thumb mode and 8 otherwise.
3390 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3391 MachineFunction &MF = DAG.getMachineFunction();
3392 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3393 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3394 ARMConstantPoolValue *CPV =
3395 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3396 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3397 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3398 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3399 Argument = DAG.getLoad(
3400 PtrVT, dl, DAG.getEntryNode(), Argument,
     // Continue the chain from the load so the call is ordered after it.
3402 SDValue Chain = Argument.getValue(1);
3403
3404 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3405 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3406
3407 // call __tls_get_addr.
3409 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3410
3411 // FIXME: is there useful debug info available here?
3412 TargetLowering::CallLoweringInfo CLI(DAG);
3413 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3415 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3416
     // Only the first element of the lowered call's result pair is returned.
3417 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3418 return CallResult.first;
3419}
3420
3421// Lower ISD::GlobalTLSAddress using the "initial exec" or
3422// "local exec" model.
// In both models the result is ThreadPointer + Offset; the models differ only
// in how Offset is obtained (see the two branches below).
// NOTE(review): original lines 3429, 3443, 3449, 3457, 3462 and 3467 are
// missing from this listing (presumably the declaration of Offset, parts of
// the constant-pool Create calls and the tails of the getLoad calls).
3423SDValue
3424ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3425 SelectionDAG &DAG,
3426 TLSModel::Model model) const {
3427 const GlobalValue *GV = GA->getGlobal();
3428 SDLoc dl(GA);
3430 SDValue Chain = DAG.getEntryNode();
3431 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3432 // Get the Thread Pointer
3433 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3434
3435 if (model == TLSModel::InitialExec) {
3436 MachineFunction &MF = DAG.getMachineFunction();
3437 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3438 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3439 // Initial exec model.
     // PC adjustment is 4 in Thumb mode and 8 otherwise.
3440 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3441 ARMConstantPoolValue *CPV =
3442 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3444 true);
3445 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3446 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3447 Offset = DAG.getLoad(
3448 PtrVT, dl, Chain, Offset,
3450 Chain = Offset.getValue(1);
3451
     // Apply the PIC fix-up to the value loaded from the constant pool.
3452 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3453 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3454
     // Second load: the fixed-up value is itself an address from which the
     // actual offset is read.
3455 Offset = DAG.getLoad(
3456 PtrVT, dl, Chain, Offset,
3458 } else {
3459 // local exec model
3460 assert(model == TLSModel::LocalExec);
3461 ARMConstantPoolValue *CPV =
3463 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3464 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3465 Offset = DAG.getLoad(
3466 PtrVT, dl, Chain, Offset,
3468 }
3469
3470 // The address of the thread local variable is the add of the thread
3471 // pointer with the offset of the variable.
3472 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3473}
3474
// Entry point for lowering a TLS global address. Dispatch order: emulated
// TLS, then Darwin, then Windows; everything else must be ELF and is lowered
// according to the TLS model.
// NOTE(review): original lines 3490 and 3493-3497 are missing from this
// listing (presumably the computation of `model` and the case labels of the
// switch) — confirm against the full source.
3475SDValue
3476ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3477 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3478 if (DAG.getTarget().useEmulatedTLS())
3479 return LowerToTLSEmulatedModel(GA, DAG);
3480
3481 const Triple &TT = getTargetMachine().getTargetTriple();
3482 if (TT.isOSDarwin())
3483 return LowerGlobalTLSAddressDarwin(Op, DAG);
3484
3485 if (TT.isOSWindows())
3486 return LowerGlobalTLSAddressWindows(Op, DAG);
3487
3488 // TODO: implement the "local dynamic" model
3489 assert(TT.isOSBinFormatELF() && "Only ELF implemented here");
3491
3492 switch (model) {
3495 return LowerToTLSGeneralDynamicModel(GA, DAG);
3498 return LowerToTLSExecModels(GA, DAG, model);
3499 }
3500 llvm_unreachable("bogus TLS model");
3501}
3502
3503/// Return true if all users of V are within function F, looking through
3504/// ConstantExprs.
3505static bool allUsersAreInFunction(const Value *V, const Function *F) {
3506 SmallVector<const User*,4> Worklist(V->users());
3507 while (!Worklist.empty()) {
3508 auto *U = Worklist.pop_back_val();
3509 if (isa<ConstantExpr>(U)) {
3510 append_range(Worklist, U->users());
3511 continue;
3512 }
3513
3514 auto *I = dyn_cast<Instruction>(U);
3515 if (!I || I->getParent()->getParent() != F)
3516 return false;
3517 }
3518 return true;
3519}
3520
3522 const GlobalValue *GV, SelectionDAG &DAG,
3523 EVT PtrVT, const SDLoc &dl) {
     // Returns an ARMISD::Wrapper around a constant-pool entry holding the
     // global's initializer when promotion is possible, or an empty SDValue()
     // when any of the checks below rejects it.
     // NOTE(review): the first line of this definition and original lines
     // 3539-3540, 3573, 3582, 3599, 3603 and 3609-3610 are missing from this
     // listing — confirm details against the full source.
3524 // If we're creating a pool entry for a constant global with unnamed address,
3525 // and the global is small enough, we can emit it inline into the constant pool
3526 // to save ourselves an indirection.
3527 //
3528 // This is a win if the constant is only used in one function (so it doesn't
3529 // need to be duplicated) or duplicating the constant wouldn't increase code
3530 // size (implying the constant is no larger than 4 bytes).
3531 const Function &F = DAG.getMachineFunction().getFunction();
3532
3533 // We rely on this decision to inline being idempotent and unrelated to the
3534 // use-site. We know that if we inline a variable at one use site, we'll
3535 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3536 // doesn't know about this optimization, so bail out if it's enabled else
3537 // we could decide to inline here (and thus never emit the GV) but require
3538 // the GV from fast-isel generated code.
3541 return SDValue();
3542
     // Only constant, initialized, unnamed_addr variables with local linkage
     // are candidates.
3543 auto *GVar = dyn_cast<GlobalVariable>(GV);
3544 if (!GVar || !GVar->hasInitializer() ||
3545 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3546 !GVar->hasLocalLinkage())
3547 return SDValue();
3548
3549 // If we inline a value that contains relocations, we move the relocations
3550 // from .data to .text. This is not allowed in position-independent code.
3551 auto *Init = GVar->getInitializer();
3552 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3553 Init->needsDynamicRelocation())
3554 return SDValue();
3555
3556 // The constant islands pass can only really deal with alignment requests
3557 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3558 // any type wanting greater alignment requirements than 4 bytes. We also
3559 // can only promote constants that are multiples of 4 bytes in size or
3560 // are paddable to a multiple of 4. Currently we only try and pad constants
3561 // that are strings for simplicity.
3562 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3563 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3564 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
     // RequiredPadding is in [1, 4]; the value 4 means Size is already a
     // multiple of 4 and no padding is needed.
3565 unsigned RequiredPadding = 4 - (Size % 4);
3566 bool PaddingPossible =
3567 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3568 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3569 Size == 0)
3570 return SDValue();
3571
3572 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3574 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3575
3576 // We can't bloat the constant pool too much, else the ConstantIslands pass
3577 // may fail to converge. If we haven't promoted this global yet (it may have
3578 // multiple uses), and promoting it would increase the constant pool size (Size
3579 // > 4), ensure we have space to do so up to MaxTotal.
3580 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3581 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3583 return SDValue();
3584
3585 // This is only valid if all users are in a single function; we can't clone
3586 // the constant in general. The LLVM IR unnamed_addr allows merging
3587 // constants, but not cloning them.
3588 //
3589 // We could potentially allow cloning if we could prove all uses of the
3590 // constant in the current function don't care about the address, like
3591 // printf format strings. But that isn't implemented for now.
3592 if (!allUsersAreInFunction(GVar, &F))
3593 return SDValue();
3594
3595 // We're going to inline this global. Pad it out if needed.
     // CDAInit is non-null here: RequiredPadding != 4 only passes the
     // PaddingPossible check above when the initializer is a string.
3596 if (RequiredPadding != 4) {
3597 StringRef S = CDAInit->getAsString();
3598
3600 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3601 while (RequiredPadding--)
3602 V.push_back(0);
3604 }
3605
3606 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3607 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
     // First promotion of this global: the (missing) lines in this branch
     // presumably record it and the resulting pool growth — confirm.
3608 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3611 PaddedSize - 4);
3612 }
3613 ++NumConstpoolPromoted;
3614 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3615}
3616
     // NOTE(review): the signature line of this function is missing from this
     // listing; the name isReadOnly is taken from its call site.
     // Look through an alias to the object it refers to; an alias whose
     // aliasee object cannot be resolved is reported as not read-only.
3618 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3619 if (!(GV = GA->getAliaseeObject()))
3620 return false;
     // A global variable is read-only iff it is constant; otherwise only
     // functions count as read-only.
3621 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3622 return V->isConstant();
3623 return isa<Function>(GV);
3624}
3625
3626SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3627 SelectionDAG &DAG) const {
3628 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3629 default: llvm_unreachable("unknown object format");
3630 case Triple::COFF:
3631 return LowerGlobalAddressWindows(Op, DAG);
3632 case Triple::ELF:
3633 return LowerGlobalAddressELF(Op, DAG);
3634 case Triple::MachO:
3635 return LowerGlobalAddressDarwin(Op, DAG);
3636 }
3637}
3638
// Lower a global address for ELF targets. Strategies are tried in this order:
//   1. promote a DSO-local global into the constant pool (not for
//      execute-only code),
//   2. position-independent code: WrapperPIC, plus a load when the GOT is
//      used,
//   3. ROPI with a read-only global: PC-relative WrapperPIC,
//   4. RWPI with a writable global: address relative to R9,
//   5. otherwise: a movw/movt-style Wrapper, or a constant-pool load.
// NOTE(review): original lines 3663, 3679, 3684 and 3707 are missing from
// this listing (tails of getLoad calls and the CPV initializer).
3639SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3640 SelectionDAG &DAG) const {
3641 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3642 SDLoc dl(Op);
3643 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3644 bool IsRO = isReadOnly(GV);
3645
3646 // promoteToConstantPool only if not generating XO text section
3647 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3648 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3649 return V;
3650
3651 if (isPositionIndependent()) {
3652 // Weak symbols need GOT indirection even when hidden/DSO-local.
3653 // The assembler eagerly resolves PC-relative expressions when the
3654 // symbol and reference are in the same section, which prevents the
3655 // linker from overriding a weak definition with a non-weak one.
3656 bool UseGOT = !GV->isDSOLocal() || GV->isWeakForLinker();
3657 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
3658 UseGOT ? ARMII::MO_GOT : 0);
3659 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
     // With the GOT, the wrapped value is loaded through once more.
3660 if (UseGOT)
3661 Result =
3662 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3664 return Result;
3665 } else if (Subtarget->isROPI() && IsRO) {
3666 // PC-relative.
3667 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3668 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3669 return Result;
3670 } else if (Subtarget->isRWPI() && !IsRO) {
3671 // SB-relative.
3672 SDValue RelAddr;
3673 if (Subtarget->useMovt()) {
3674 ++NumMovwMovt;
3675 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3676 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3677 } else { // use literal pool for address constant
3678 ARMConstantPoolValue *CPV =
3680 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3681 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3682 RelAddr = DAG.getLoad(
3683 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3685 }
     // The static base is read from R9 and added to the relative address.
3686 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3687 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3688 return Result;
3689 }
3690
3691 // If we have T2 ops, we can materialize the address directly via movt/movw
3692 // pair. This is always cheaper. If need to generate Execute Only code, and we
3693 // only have Thumb1 available, we can't use a constant pool and are forced to
3694 // use immediate relocations.
3695 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
     // The statistic is only bumped when movw/movt is actually in use.
3696 if (Subtarget->useMovt())
3697 ++NumMovwMovt;
3698 // FIXME: Once remat is capable of dealing with instructions with register
3699 // operands, expand this into two nodes.
3700 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3701 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3702 } else {
     // Fallback: load the address from a constant-pool entry.
3703 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3704 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3705 return DAG.getLoad(
3706 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3708 }
3709}
3710
// Lower a global address for Darwin (MachO). The global is wrapped with
// MO_NONLAZY using WrapperPIC or Wrapper depending on position independence,
// and an extra load is emitted when the subtarget reports the global as an
// indirect symbol. ROPI/RWPI are asserted to be off.
// NOTE(review): original line 3732 (the tail of the getLoad call) is missing
// from this listing.
3711SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3712 SelectionDAG &DAG) const {
3713 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3714 "ROPI/RWPI not currently supported for Darwin");
3715 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3716 SDLoc dl(Op);
3717 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3718
3719 if (Subtarget->useMovt())
3720 ++NumMovwMovt;
3721
3722 // FIXME: Once remat is capable of dealing with instructions with register
3723 // operands, expand this into multiple nodes
3724 unsigned Wrapper =
3725 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3726
3727 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3728 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3729
     // Indirect symbols need one more load to reach the real address.
3730 if (Subtarget->isGVIndirectSymbol(GV))
3731 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3733 return Result;
3734}
3735
// Lower a global address for Windows (COFF). The address is always wrapped as
// a movw/movt-style ARMISD::Wrapper; dllimport globals and globals that
// cannot be assumed DSO-local get a target flag and an extra load.
// NOTE(review): original lines 3753 and 3765 are missing from this listing
// (presumably the declaration of Result and the tail of the getLoad call).
3736SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3737 SelectionDAG &DAG) const {
3738 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
3739 "non-Windows COFF is not supported");
3740 assert(Subtarget->useMovt() &&
3741 "Windows on ARM expects to use movw/movt");
3742 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3743 "ROPI/RWPI not currently supported for Windows");
3744
3745 const TargetMachine &TM = getTargetMachine();
3746 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
     // dllimport takes precedence over the COFF stub flag.
3747 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3748 if (GV->hasDLLImportStorageClass())
3749 TargetFlags = ARMII::MO_DLLIMPORT;
3750 else if (!TM.shouldAssumeDSOLocal(GV))
3751 TargetFlags = ARMII::MO_COFFSTUB;
3752 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3754 SDLoc DL(Op);
3755
3756 ++NumMovwMovt;
3757
3758 // FIXME: Once remat is capable of dealing with instructions with register
3759 // operands, expand this into two nodes.
3760 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3761 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3762 TargetFlags));
     // For either flag the wrapped value is loaded through once more.
3763 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3764 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3766 return Result;
3767}
3768
3769SDValue
3770ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3771 SDLoc dl(Op);
3772 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3773 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3774 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3775 Op.getOperand(1), Val);
3776}
3777
3778SDValue
3779ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3780 SDLoc dl(Op);
3781 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3782 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3783}
3784
3785SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3786 SelectionDAG &DAG) const {
3787 SDLoc dl(Op);
3788 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3789 Op.getOperand(0));
3790}
3791
// Custom-lower void intrinsics. Only arm_gnu_eabi_mcount is handled here; it
// becomes a BL_PUSHLR / tBL_PUSHLR machine node calling "\01__gnu_mcount_nc".
// Every other intrinsic returns an empty SDValue().
// NOTE(review): original lines 3807 and 3816 are missing from this listing
// (presumably the initializer of Mask and the declaration of RegisterMask).
3792SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3793 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
     // The intrinsic ID is operand 1 when operand 0 has type Other, and
     // operand 0 otherwise.
3794 unsigned IntNo =
3795 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
3796 switch (IntNo) {
3797 default:
3798 return SDValue(); // Don't custom lower most intrinsics.
3799 case Intrinsic::arm_gnu_eabi_mcount: {
3800 MachineFunction &MF = DAG.getMachineFunction();
3801 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3802 SDLoc dl(Op);
3803 SDValue Chain = Op.getOperand(0);
3804 // call "\01__gnu_mcount_nc"
3805 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3806 const uint32_t *Mask =
3808 assert(Mask && "Missing call preserved mask for calling convention");
3809 // Mark LR an implicit live-in.
3810 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3811 SDValue ReturnAddress =
3812 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3813 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3814 SDValue Callee =
3815 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
     // The Thumb form takes two extra operands (ARMCC::AL and register 0)
     // ahead of the callee.
3817 if (Subtarget->isThumb())
3818 return SDValue(
3819 DAG.getMachineNode(
3820 ARM::tBL_PUSHLR, dl, ResultTys,
3821 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3822 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3823 0);
3824 return SDValue(
3825 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3826 {ReturnAddress, Callee, RegisterMask, Chain}),
3827 0);
3828 }
3829 }
3830}
3831
// Custom-lower chainless intrinsics. The intrinsic ID is operand 0; each case
// below rewrites the intrinsic into a generic ISD node, an ARMISD node, or a
// short node sequence. Intrinsics without a case return an empty SDValue().
// NOTE(review): original lines 3851 and 3902 are missing from this listing
// (presumably the error call guarded by `if (!Fn)` and the tail of the
// getLoad call in the eh_sjlj_lsda case).
3832SDValue
3833ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3834 const ARMSubtarget *Subtarget) const {
3835 unsigned IntNo = Op.getConstantOperandVal(0);
3836 SDLoc dl(Op);
3837 switch (IntNo) {
3838 default: return SDValue(); // Don't custom lower most intrinsics.
3839 case Intrinsic::localaddress: {
     // Read the register the register info reports as the local address
     // register for this function.
3840 const MachineFunction &MF = DAG.getMachineFunction();
3841 const auto *RegInfo = Subtarget->getRegisterInfo();
3842 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
3843 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
3844 Op.getSimpleValueType());
3845 }
3846 case Intrinsic::eh_recoverfp: {
     // Operand 1 must be a global address that refers to a Function.
3847 SDValue FnOp = Op.getOperand(1);
3848 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
3849 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
3850 if (!Fn)
3852 "llvm.eh.recoverfp must take a function as the first argument");
     // Make the base register live into the first block and read it.
3853 const auto *RegInfo = Subtarget->getRegisterInfo();
3854 Register BaseReg = RegInfo->getBaseRegister();
3855 MachineFunction &MF = DAG.getMachineFunction();
3856 MachineBasicBlock &MBB = *MF.begin();
3857 if (!MBB.isLiveIn(BaseReg))
3858 MBB.addLiveIn(BaseReg);
3859 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3860 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, BaseReg, PtrVT);
3861 }
3862 case Intrinsic::thread_pointer: {
3863 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3864 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3865 }
3866 case Intrinsic::arm_cls: {
3867 // Note: arm_cls and arm_cls64 intrinsics are expanded directly here
3868 // in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS
3869 // instruction.
3870 const SDValue &Operand = Op.getOperand(1);
3871 const EVT VTy = Op.getValueType();
3872 return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
3873 }
3874 case Intrinsic::arm_cls64: {
3875 // arm_cls64 returns i32 but takes i64 input.
3876 // Use ISD::CTLS for i64 and truncate the result.
3877 SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Op.getOperand(1));
3878 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
3879 }
3880 case Intrinsic::arm_neon_vcls:
3881 case Intrinsic::arm_mve_vcls: {
3882 // Lower vector CLS intrinsics to ISD::CTLS.
3883 // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
3884 const EVT VTy = Op.getValueType();
3885 return DAG.getNode(ISD::CTLS, dl, VTy, Op.getOperand(1));
3886 }
3887 case Intrinsic::eh_sjlj_lsda: {
     // Load the LSDA reference from a constant-pool entry; in PIC mode the
     // entry carries a PC adjustment (4 for Thumb, 8 otherwise) and the
     // loaded value is fixed up with PIC_ADD.
3888 MachineFunction &MF = DAG.getMachineFunction();
3889 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3890 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3891 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3892 SDValue CPAddr;
3893 bool IsPositionIndependent = isPositionIndependent();
3894 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3895 ARMConstantPoolValue *CPV =
3896 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3897 ARMCP::CPLSDA, PCAdj);
3898 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3899 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3900 SDValue Result = DAG.getLoad(
3901 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3903
3904 if (IsPositionIndependent) {
3905 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3906 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3907 }
3908 return Result;
3909 }
     // The NEON cases below map directly onto generic ISD nodes where one
     // exists.
3910 case Intrinsic::arm_neon_vabs:
3911 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3912 Op.getOperand(1));
3913 case Intrinsic::arm_neon_vabds:
     // Only the integer form is rewritten; other types are left alone.
3914 if (Op.getValueType().isInteger())
3915 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
3916 Op.getOperand(1), Op.getOperand(2));
3917 return SDValue();
3918 case Intrinsic::arm_neon_vabdu:
3919 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
3920 Op.getOperand(1), Op.getOperand(2));
3921 case Intrinsic::arm_neon_vmulls:
3922 case Intrinsic::arm_neon_vmullu: {
3923 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3924 ? ARMISD::VMULLs : ARMISD::VMULLu;
3925 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3926 Op.getOperand(1), Op.getOperand(2));
3927 }
3928 case Intrinsic::arm_neon_vminnm:
3929 case Intrinsic::arm_neon_vmaxnm: {
3930 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3931 ? ISD::FMINNUM : ISD::FMAXNUM;
3932 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3933 Op.getOperand(1), Op.getOperand(2));
3934 }
3935 case Intrinsic::arm_neon_vminu:
3936 case Intrinsic::arm_neon_vmaxu: {
     // Floating-point result types are not rewritten here.
3937 if (Op.getValueType().isFloatingPoint())
3938 return SDValue();
3939 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3940 ? ISD::UMIN : ISD::UMAX;
3941 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3942 Op.getOperand(1), Op.getOperand(2));
3943 }
3944 case Intrinsic::arm_neon_vmins:
3945 case Intrinsic::arm_neon_vmaxs: {
3946 // v{min,max}s is overloaded between signed integers and floats.
3947 if (!Op.getValueType().isFloatingPoint()) {
3948 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3949 ? ISD::SMIN : ISD::SMAX;
3950 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3951 Op.getOperand(1), Op.getOperand(2));
3952 }
3953 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3954 ? ISD::FMINIMUM : ISD::FMAXIMUM;
3955 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3956 Op.getOperand(1), Op.getOperand(2));
3957 }
3958 case Intrinsic::arm_neon_vtbl1:
3959 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
3960 Op.getOperand(1), Op.getOperand(2));
3961 case Intrinsic::arm_neon_vtbl2:
3962 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
3963 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
     // The MVE cases below map onto ARMISD nodes, forwarding the operands
     // that follow the intrinsic ID.
3964 case Intrinsic::arm_mve_pred_i2v:
3965 case Intrinsic::arm_mve_pred_v2i:
3966 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
3967 Op.getOperand(1));
3968 case Intrinsic::arm_mve_vreinterpretq:
3969 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
3970 Op.getOperand(1));
3971 case Intrinsic::arm_mve_lsll:
3972 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
3973 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3974 case Intrinsic::arm_mve_asrl:
3975 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
3976 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3977 case Intrinsic::arm_mve_vsli:
3978 return DAG.getNode(ARMISD::VSLIIMM, SDLoc(Op), Op->getVTList(),
3979 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3980 case Intrinsic::arm_mve_vsri:
3981 return DAG.getNode(ARMISD::VSRIIMM, SDLoc(Op), Op->getVTList(),
3982 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3983 }
3984}
3985
3987 const ARMSubtarget *Subtarget) {
     // Lowers an atomic fence: single-thread fences are returned unchanged,
     // targets without a data barrier get ARMISD::MEMBARRIER_MCR, and all
     // others get an arm_dmb intrinsic with a barrier domain operand.
     // NOTE(review): the first signature line and original lines 4005, 4008
     // and 4014 are missing from this listing (presumably the declaration of
     // Domain and its two reassignments) — confirm against the full source.
3988 SDLoc dl(Op);
     // Operand 2 carries the synchronization scope.
3989 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
3990 if (SSID == SyncScope::SingleThread)
3991 return Op;
3992
3993 if (!Subtarget->hasDataBarrier()) {
3994 // Some ARMv6 cpus can support data barriers with an mcr instruction.
3995 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3996 // here.
3997 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3998 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3999 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4000 DAG.getConstant(0, dl, MVT::i32));
4001 }
4002
     // Operand 1 carries the atomic ordering.
4003 AtomicOrdering Ord =
4004 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4006 if (Subtarget->isMClass()) {
4007 // Only a full system barrier exists in the M-class architectures.
4009 } else if (Subtarget->preferISHSTBarriers() &&
4010 Ord == AtomicOrdering::Release) {
4011 // Swift happens to implement ISHST barriers in a way that's compatible with
4012 // Release semantics but weaker than ISH so we'd be fools not to use
4013 // it. Beware: other processors probably don't!
4015 }
4016
4017 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4018 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4019 DAG.getConstant(Domain, dl, MVT::i32));
4020}
4021
4023 const ARMSubtarget *Subtarget) {
     // Lowers a prefetch to ARMISD::PRELOAD, or drops it (returning only the
     // chain, operand 0) when the subtarget cannot express it.
     // NOTE(review): the first signature line of this function is missing
     // from this listing.
4024 // ARM pre v5TE and Thumb1 do not have preload instructions.
4025 if (!(Subtarget->isThumb2() ||
4026 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4027 // Just preserve the chain.
4028 return Op.getOperand(0);
4029
4030 SDLoc dl(Op);
     // isRead is the inverse of bit 0 of operand 2.
4031 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4032 if (!isRead &&
4033 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4034 // ARMv7 with MP extension has PLDW.
4035 return Op.getOperand(0);
4036
4037 unsigned isData = Op.getConstantOperandVal(4);
4038 if (Subtarget->isThumb()) {
4039 // Invert the bits.
4040 isRead = ~isRead & 1;
4041 isData = ~isData & 1;
4042 }
4043
4044 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4045 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4046 DAG.getConstant(isData, dl, MVT::i32));
4047}
4048
     // NOTE(review): the signature lines and original line 4056 are missing
     // from this listing (presumably the declarations of MF and PtrVT).
4051 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4052
4053 // vastart just stores the address of the VarArgsFrameIndex slot into the
4054 // memory location argument.
4055 SDLoc dl(Op);
4057 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
     // Operand 0 is used as the chain, operand 1 as the store address, and
     // operand 2 supplies the source Value for the memory pointer info.
4058 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4059 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4060 MachinePointerInfo(SV));
4061}
4062
/// Builds an f64 formal argument out of two i32 halves.
///
/// The first half is copied from the register assigned by \p VA. The second
/// half is either loaded from a fixed 4-byte stack object (when \p NextVA is a
/// memory location) or copied from the register assigned by \p NextVA. On
/// subtargets that are not little-endian the two halves are swapped before
/// they are combined with ARMISD::VMOVDRR.
///
/// \param Root chain used for the register copies and the stack load.
/// \returns the assembled MVT::f64 value.
4063SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4064 CCValAssign &NextVA,
4065 SDValue &Root,
4066 SelectionDAG &DAG,
4067 const SDLoc &dl) const {
4068 MachineFunction &MF = DAG.getMachineFunction();
4069 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4070
// Thumb1-only functions use the tGPR register class; everything else uses GPR.
4071 const TargetRegisterClass *RC;
4072 if (AFI->isThumb1OnlyFunction())
4073 RC = &ARM::tGPRRegClass;
4074 else
4075 RC = &ARM::GPRRegClass;
4076
4077 // Transform the arguments stored in physical registers into virtual ones.
4078 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4079 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4080
4081 SDValue ArgValue2;
4082 if (NextVA.isMemLoc()) {
4083 MachineFrameInfo &MFI = MF.getFrameInfo();
4084 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4085
4086 // Create load node to retrieve arguments from the stack.
4087 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
// NOTE(review): the remaining operand(s) of this getLoad call are not visible
// in this listing.
4088 ArgValue2 = DAG.getLoad(
4089 MVT::i32, dl, Root, FIN,
4091 } else {
// The second half also arrived in a register.
4092 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4093 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4094 }
// Exchange the halves when the subtarget is not little-endian.
4095 if (!Subtarget->isLittle())
4096 std::swap (ArgValue, ArgValue2);
4097 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4098}
4099
4100// The remaining GPRs hold either the beginning of variable-argument
4101// data, or the beginning of an aggregate passed by value (usually
4102// byval). Either way, we allocate stack slots adjacent to the data
4103// provided by our caller, and store the unallocated registers there.
4104// If this is a variadic function, the va_list pointer will begin with
4105// these values; otherwise, this reassembles a (byval) structure that
4106// was split between registers and memory.
4107// Return: The frame index registers were stored into.
//
// Parameters:
//   Chain                - in/out; replaced by a TokenFactor of the emitted
//                          stores when at least one register is stored.
//   OrigArg              - IR value used for the stores' MachinePointerInfo.
//   InRegsParamRecordIdx - index of the in-regs parameter record to use; an
//                          index that is not below the record count selects
//                          the "remaining unallocated GPRs" range instead.
//   ArgOffset, ArgSize   - offset and size of the fixed stack object; the
//                          offset is overridden when registers are stored.
4108int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4109 const SDLoc &dl, SDValue &Chain,
4110 const Value *OrigArg,
4111 unsigned InRegsParamRecordIdx,
4112 int ArgOffset, unsigned ArgSize) const {
4113 // Currently, two use-cases are possible:
4114 // Case #1. Non-var-args function, and we meet first byval parameter.
4115 // Setup first unallocated register as first byval register;
4116 // eat all remaining registers
4117 // (these two actions are performed by HandleByVal method).
4118 // Then, here, we initialize stack frame with
4119 // "store-reg" instructions.
4120 // Case #2. Var-args function, that doesn't contain byval parameters.
4121 // The same: eat all remaining unallocated registers,
4122 // initialize stack frame.
4123
4124 MachineFunction &MF = DAG.getMachineFunction();
4125 MachineFrameInfo &MFI = MF.getFrameInfo();
4126 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
// Determine the half-open register range [RBegin, REnd) to spill.
4127 unsigned RBegin, REnd;
4128 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4129 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4130 } else {
// No record: use every GPR argument register that is still unallocated. An
// index of 4 means none is left, which yields the empty range [R4, R4).
4131 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4132 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4133 REnd = ARM::R4;
4134 }
4135
// When there is something to store, place the object 4 bytes per register in
// [RBegin, R4) below offset 0, replacing the ArgOffset that was passed in.
4136 if (REnd != RBegin)
4137 ArgOffset = -4 * (ARM::R4 - RBegin);
4138
4139 auto PtrVT = getPointerTy(DAG.getDataLayout());
4140 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4141 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4142
// NOTE(review): the declaration of MemOps (used below) is not visible in this
// listing.
4144 const TargetRegisterClass *RC =
4145 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4146
// Copy each register in the range and store it to consecutive 4-byte slots.
4147 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4148 Register VReg = MF.addLiveIn(Reg, RC);
4149 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4150 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4151 MachinePointerInfo(OrigArg, 4 * i));
4152 MemOps.push_back(Store);
4153 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4154 }
4155
// Join all stores into a single chain; Chain is left untouched if none exist.
4156 if (!MemOps.empty())
4157 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4158 return FrameIndex;
4159}
4160
4161// Setup stack frame, the va_list pointer will start from.
4162void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4163 const SDLoc &dl, SDValue &Chain,
4164 unsigned ArgOffset,
4165 unsigned TotalArgRegsSaveSize,
4166 bool ForceMutable) const {
4167 MachineFunction &MF = DAG.getMachineFunction();
4168 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4169
4170 // Try to store any remaining integer argument regs
4171 // to their spots on the stack so that they may be loaded by dereferencing
4172 // the result of va_next.
4173 // If there is no regs to be stored, just point address after last
4174 // argument passed via stack.
4175 int FrameIndex = StoreByValRegs(
4176 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4177 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4178 AFI->setVarArgsFrameIndex(FrameIndex);
4179}
4180
4181bool ARMTargetLowering::splitValueIntoRegisterParts(
4182 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4183 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4184 EVT ValueVT = Val.getValueType();
4185 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4186 unsigned ValueBits = ValueVT.getSizeInBits();
4187 unsigned PartBits = PartVT.getSizeInBits();
4188 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4189 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4190 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4191 Parts[0] = Val;
4192 return true;
4193 }
4194 return false;
4195}
4196
4197SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4198 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4199 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4200 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4201 unsigned ValueBits = ValueVT.getSizeInBits();
4202 unsigned PartBits = PartVT.getSizeInBits();
4203 SDValue Val = Parts[0];
4204
4205 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4206 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4207 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4208 return Val;
4209 }
4210 return SDValue();
4211}
4212
/// Lowers the incoming (formal) arguments of the current function.
///
/// Locations are assigned to the entries of \p Ins with the calling-convention
/// function selected for \p CallConv / \p isVarArg, and the lowered values are
/// appended to \p InVals:
///  - register locations become CopyFromReg nodes (custom f64 / v2f64
///    locations are reassembled through GetF64FormalArgument),
///  - stack locations become loads from fixed frame objects,
///  - byval arguments become frame indices produced by StoreByValRegs.
/// The function also records the argument-register save size, the argument
/// stack size and, for variadic functions that use va_start, the varargs frame
/// index in ARMFunctionInfo, and emits diagnostics for unsupported CMSE entry
/// function signatures.
///
/// \returns the chain (StoreByValRegs may have updated it).
4213SDValue ARMTargetLowering::LowerFormalArguments(
4214 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4215 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4216 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4217 MachineFunction &MF = DAG.getMachineFunction();
4218 MachineFrameInfo &MFI = MF.getFrameInfo();
4219
4220 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4221
4222 // Assign locations to all of the incoming arguments.
// NOTE(review): the declaration of ArgLocs is not visible in this listing.
4224 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4225 *DAG.getContext());
4226 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4227
// NOTE(review): the declaration of CurOrigArg (advanced in the main loop
// below) is not visible in this listing.
4229 unsigned CurArgIdx = 0;
4230
4231 // Initially ArgRegsSaveSize is zero.
4232 // Then we increase this value each time we meet byval parameter.
4233 // We also increase this value in case of varargs function.
4234 AFI->setArgRegsSaveSize(0);
4235
4236 // Calculate the amount of stack space that we need to allocate to store
4237 // byval and variadic arguments that are passed in registers.
4238 // We need to know this before we allocate the first byval or variadic
4239 // argument, as they will be allocated a stack slot below the CFA (Canonical
4240 // Frame Address, the stack pointer at entry to the function).
4241 unsigned ArgRegBegin = ARM::R4;
4242 for (const CCValAssign &VA : ArgLocs) {
4243 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4244 break;
4245
4246 unsigned Index = VA.getValNo();
4247 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4248 if (!Flags.isByVal())
4249 continue;
4250
4251 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4252 unsigned RBegin, REnd;
4253 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4254 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4255
4256 CCInfo.nextInRegsParam();
4257 }
// The scan above advanced the in-regs parameter cursor; reset it so the main
// loop below starts from the first record again.
4258 CCInfo.rewindByValRegsInfo();
4259
4260 int lastInsIndex = -1;
4261 if (isVarArg && MFI.hasVAStart()) {
4262 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4263 if (RegIdx != std::size(GPRArgRegs))
4264 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4265 }
4266
// 4 bytes for every register in [ArgRegBegin, R4).
4267 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4268 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4269 auto PtrVT = getPointerTy(DAG.getDataLayout());
4270
4271 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4272 CCValAssign &VA = ArgLocs[i];
4273 if (Ins[VA.getValNo()].isOrigArg()) {
4274 std::advance(CurOrigArg,
4275 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4276 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4277 }
4278 // Arguments stored in registers.
4279 if (VA.isRegLoc()) {
4280 EVT RegVT = VA.getLocVT();
4281 SDValue ArgValue;
4282
4283 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4284 // f64 and vector types are split up into multiple registers or
4285 // combinations of registers and stack slots.
4286 SDValue ArgValue1 =
4287 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4288 VA = ArgLocs[++i]; // skip ahead to next loc
4289 SDValue ArgValue2;
4290 if (VA.isMemLoc()) {
4291 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4292 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// NOTE(review): the remaining operand(s) of this getLoad call are not visible
// in this listing.
4293 ArgValue2 = DAG.getLoad(
4294 MVT::f64, dl, Chain, FIN,
4296 } else {
4297 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4298 }
// Assemble the v2f64 by inserting the two f64 halves into an undef vector.
4299 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4300 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4301 ArgValue1, DAG.getIntPtrConstant(0, dl));
4302 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4303 ArgValue2, DAG.getIntPtrConstant(1, dl));
4304 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4305 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4306 } else {
// Pick the register class that matches the location type.
4307 const TargetRegisterClass *RC;
4308
4309 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4310 RC = &ARM::HPRRegClass;
4311 else if (RegVT == MVT::f32)
4312 RC = &ARM::SPRRegClass;
4313 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4314 RegVT == MVT::v4bf16)
4315 RC = &ARM::DPRRegClass;
4316 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4317 RegVT == MVT::v8bf16)
4318 RC = &ARM::QPRRegClass;
4319 else if (RegVT == MVT::i32)
4320 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4321 : &ARM::GPRRegClass;
4322 else
4323 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4324
4325 // Transform the arguments in physical registers into virtual ones.
4326 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4327 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4328
4329 // If this value is passed in r0 and has the returned attribute (e.g.
4330 // C++ 'structors), record this fact for later use.
4331 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4332 AFI->setPreservesR0();
4333 }
4334 }
4335
4336 // If this is an 8 or 16-bit value, it is really passed promoted
4337 // to 32 bits. Insert an assert[sz]ext to capture this, then
4338 // truncate to the right size.
4339 switch (VA.getLocInfo()) {
4340 default: llvm_unreachable("Unknown loc info!");
4341 case CCValAssign::Full: break;
4342 case CCValAssign::BCvt:
4343 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4344 break;
4345 }
4346
4347 // f16 arguments have their size extended to 4 bytes and passed as if they
4348 // had been copied to the LSBs of a 32-bit register.
4349 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4350 if (VA.needsCustom() &&
4351 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4352 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4353
4354 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4355 // less than 32 bits must be sign- or zero-extended in the callee for
4356 // security reasons. Although the ABI mandates an extension done by the
4357 // caller, the latter cannot be trusted to follow the rules of the ABI.
4358 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4359 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4360 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4361 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4362
4363 InVals.push_back(ArgValue);
4364 } else { // VA.isRegLoc()
4365 // Only arguments passed on the stack should make it here.
4366 assert(VA.isMemLoc());
4367 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4368
4369 int index = VA.getValNo();
4370
4371 // Some Ins[] entries become multiple ArgLoc[] entries.
4372 // Process them only once.
4373 if (index != lastInsIndex)
4374 {
4375 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4376 // FIXME: For now, all byval parameter objects are marked mutable.
4377 // This can be changed with more analysis.
4378 // In case of tail call optimization mark all arguments mutable.
4379 // Since they could be overwritten by lowering of arguments in case of
4380 // a tail call.
4381 if (Flags.isByVal()) {
4382 assert(Ins[index].isOrigArg() &&
4383 "Byval arguments cannot be implicit");
4384 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4385
4386 int FrameIndex = StoreByValRegs(
4387 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4388 VA.getLocMemOffset(), Flags.getByValSize());
4389 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4390 CCInfo.nextInRegsParam();
4391 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4392 VA.getValVT() == MVT::bf16)) {
4393 // f16 and bf16 values are passed in the least-significant half of
4394 // a 4 byte stack slot. This is done as-if the extension was done
4395 // in a 32-bit register, so the actual bytes used for the value
4396 // differ between little and big endian.
4397 assert(VA.getLocVT().getSizeInBits() == 32);
4398 unsigned FIOffset = VA.getLocMemOffset();
4399 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4400 FIOffset, true);
4401
4402 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4403 if (DAG.getDataLayout().isBigEndian())
4404 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4405
// NOTE(review): part of this getLoad call's argument list is not visible in
// this listing.
4406 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4408 DAG.getMachineFunction(), FI)));
4409
4410 } else {
4411 unsigned FIOffset = VA.getLocMemOffset();
4412 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4413 FIOffset, true);
4414
4415 // Create load nodes to retrieve arguments from the stack.
4416 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// NOTE(review): part of this getLoad call's argument list is not visible in
// this listing.
4417 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4419 DAG.getMachineFunction(), FI)));
4420 }
4421 lastInsIndex = index;
4422 }
4423 }
4424 }
4425
4426 // varargs
4427 if (isVarArg && MFI.hasVAStart()) {
4428 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4429 TotalArgRegsSaveSize);
4430 if (AFI->isCmseNSEntryFunction()) {
// NOTE(review): the first argument(s) of this diagnostic are not visible in
// this listing.
4431 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4433 "secure entry function must not be variadic", dl.getDebugLoc()));
4434 }
4435 }
4436
4437 unsigned StackArgSize = CCInfo.getStackSize();
4438 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4439 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4440 // The only way to guarantee a tail call is if the callee restores its
4441 // argument area, but it must also keep the stack aligned when doing so.
4442 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4443 assert(StackAlign && "data layout string is missing stack alignment");
4444 StackArgSize = alignTo(StackArgSize, *StackAlign);
4445
4446 AFI->setArgumentStackToRestore(StackArgSize);
4447 }
4448 AFI->setArgumentStackSize(StackArgSize);
4449
4450 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
// NOTE(review): the first argument(s) of this diagnostic are not visible in
// this listing.
4451 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4453 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4454 }
4455
4456 return Chain;
4457}
4458
4459/// isFloatingPointZero - Return true if this is +0.0.
4462 return CFP->getValueAPF().isPosZero();
4463 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4464 // Maybe this has already been legalized into the constant pool?
4465 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4466 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4468 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4469 return CFP->getValueAPF().isPosZero();
4470 }
4471 } else if (Op->getOpcode() == ISD::BITCAST &&
4472 Op->getValueType(0) == MVT::f64) {
4473 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4474 // created by LowerConstantFP().
4475 SDValue BitcastOp = Op->getOperand(0);
4476 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4477 isNullConstant(BitcastOp->getOperand(0)))
4478 return true;
4479 }
4480 return false;
4481}
4482
4484 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
4485 if (Op->getFlags().hasNoSignedWrap())
4486 return true;
4487
4488 // We can still figure out if the second operand is safe to use
4489 // in a CMN instruction by checking if it is known to be not the minimum
4490 // signed value. If it is not, then we can safely use CMN.
4491 // Note: We can eventually remove this check and simply rely on
4492 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
4493 // consistently sets them appropriately when making said nodes.
4494
4495 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
4496 return !KnownSrc.getSignedMinValue().isMinSignedValue();
4497}
4498
4500 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
4501 (isIntEqualitySetCC(CC) ||
4502 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
4503 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
4504}
4505
4506/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4507/// the given operands.
///
/// The returned node is an ARMISD::CMP, ARMISD::CMPZ or ARMISD::CMN of the
/// (possibly adjusted) operands, or, for one Thumb1-only pattern, the flags
/// result of an ARMISD::LSLS. \p ARMcc is set to the i32 constant holding the
/// ARM condition code to test against those flags.
4508SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4509 SDValue &ARMcc, SelectionDAG &DAG,
4510 const SDLoc &dl) const {
4511 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4512 unsigned C = RHSC->getZExtValue();
4513 if (!isLegalICmpImmediate((int32_t)C)) {
4514 // Constant does not fit, try adjusting it by one.
// Each case moves the constant by one and switches between the strict and
// non-strict form of the comparison; the guard on C rules out the value at
// which C-1 / C+1 would wrap.
4515 switch (CC) {
4516 default: break;
4517 case ISD::SETLT:
4518 case ISD::SETGE:
4519 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4520 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4521 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4522 }
4523 break;
4524 case ISD::SETULT:
4525 case ISD::SETUGE:
4526 if (C != 0 && isLegalICmpImmediate(C-1)) {
4527 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4528 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4529 }
4530 break;
4531 case ISD::SETLE:
4532 case ISD::SETGT:
4533 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4534 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4535 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4536 }
4537 break;
4538 case ISD::SETULE:
4539 case ISD::SETUGT:
4540 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4541 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4542 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4543 }
4544 break;
4545 }
4546 }
// NOTE(review): the rest of this condition, and one statement before the swap
// below, are not visible in this listing.
4547 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4549 // In ARM and Thumb-2, the compare instructions can shift their second
4550 // operand.
4552 std::swap(LHS, RHS);
4553 }
4554
4555 // Thumb1 has very limited immediate modes, so turning an "and" into a
4556 // shift can save multiple instructions.
4557 //
4558 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4559 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4560 // own. If it's the operand to an unsigned comparison with an immediate,
4561 // we can eliminate one of the shifts: we transform
4562 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4563 //
4564 // We avoid transforming cases which aren't profitable due to encoding
4565 // details:
4566 //
4567 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4568 // would not; in that case, we're essentially trading one immediate load for
4569 // another.
4570 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4571 // 3. C2 is zero; we have other code for this special case.
4572 //
4573 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4574 // instruction, since the AND is always one instruction anyway, but we could
4575 // use narrow instructions in some cases.
4576 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4577 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4578 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4579 !isSignedIntSetCC(CC)) {
4580 unsigned Mask = LHS.getConstantOperandVal(1);
4581 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4582 uint64_t RHSV = RHSC->getZExtValue();
4583 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4584 unsigned ShiftBits = llvm::countl_zero(Mask);
4585 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4586 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4587 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4588 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4589 }
4590 }
4591 }
4592
4593 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4594 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4595 // way a cmp would.
4596 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4597 // some tweaks to the heuristics for the previous and->shift transform.
4598 // FIXME: Optimize cases where the LHS isn't a shift.
4599 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4600 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4601 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4602 LHS.getConstantOperandVal(1) < 31) {
4603 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4604 SDValue Shift =
4605 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4606 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4607 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
// Result 1 of the LSLS node is its flags output.
4608 return Shift.getValue(1);
4609 }
4610
// NOTE(review): CondCode is used below, but the line that defines it is not
// visible in this listing.
4612
4613 // If the RHS is a constant zero then the V (overflow) flag will never be
4614 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4615 // simpler for other passes (like the peephole optimiser) to deal with.
4616 if (isNullConstant(RHS)) {
4617 switch (CondCode) {
4618 default: break;
// NOTE(review): the statement of each case below is not visible in this
// listing.
4619 case ARMCC::GE:
4621 break;
4622 case ARMCC::LT:
4624 break;
4625 }
4626 }
4627
4628 unsigned CompareType;
4629 switch (CondCode) {
4630 default:
4631 CompareType = ARMISD::CMP;
4632 break;
4633 case ARMCC::EQ:
4634 case ARMCC::NE:
4635 // Uses only Z Flag
4636 CompareType = ARMISD::CMPZ;
4637 break;
4638 }
4639
4640 // TODO: Remove CMPZ check once we generalize and remove the CMPZ enum from
4641 // the codebase.
4642
4643 // TODO: When we have a solution to the vselect predicate not allowing pl/mi
4644 // all the time, allow those cases to be cmn too no matter what.
// When an operand qualifies for CMN, compare against operand 1 of that
// operand's node instead.
4645 if (CompareType != ARMISD::CMPZ && isCMN(RHS, CC, DAG)) {
4646 CompareType = ARMISD::CMN;
4647 RHS = RHS.getOperand(1);
4648 } else if (CompareType != ARMISD::CMPZ && isCMN(LHS, CC, DAG)) {
4649 CompareType = ARMISD::CMN;
4650 LHS = LHS.getOperand(1);
// NOTE(review): one statement of this branch is not visible in this listing.
4652 }
4653
4654 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4655 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4656}
4657
4658/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
///
/// Emits either a two-operand compare of \p LHS and \p RHS (ARMISD::CMPFP, or
/// ARMISD::CMPFPE when \p Signaling is set) or a one-operand compare of
/// \p LHS alone (ARMISD::CMPFPw0 / ARMISD::CMPFPEw0), then wraps the flags in
/// an ARMISD::FMSTAT node, which is returned.
4659SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4660 SelectionDAG &DAG, const SDLoc &dl,
4661 bool Signaling) const {
// An f64 right-hand side is only accepted when the subtarget has FP64.
4662 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4663 SDValue Flags;
// NOTE(review): the condition that selects between the two forms below is not
// visible in this listing.
4665 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4666 LHS, RHS);
4667 else
4668 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4669 FlagsVT, LHS);
4670 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4671}
4672
4673// This function returns three things: the arithmetic computation itself
4674// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4675// comparison and the condition code define the case in which the arithmetic
4676// computation *does not* overflow.
4677std::pair<SDValue, SDValue>
4678ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4679 SDValue &ARMcc) const {
4680 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4681
4682 SDValue Value, OverflowCmp;
4683 SDValue LHS = Op.getOperand(0);
4684 SDValue RHS = Op.getOperand(1);
4685 SDLoc dl(Op);
4686
4687 // FIXME: We are currently always generating CMPs because we don't support
4688 // generating CMN through the backend. This is not as good as the natural
4689 // CMP case because it causes a register dependency and cannot be folded
4690 // later.
4691
4692 switch (Op.getOpcode()) {
4693 default:
4694 llvm_unreachable("Unknown overflow instruction!");
4695 case ISD::SADDO:
4696 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4697 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4698 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4699 break;
4700 case ISD::UADDO:
4701 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4702 // We use ADDC here to correspond to its use in LowerALUO.
4703 // We do not use it in the USUBO case as Value may not be used.
4704 Value = DAG.getNode(ARMISD::ADDC, dl,
4705 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4706 .getValue(0);
4707 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4708 break;
4709 case ISD::SSUBO:
4710 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4711 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4712 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4713 break;
4714 case ISD::USUBO:
4715 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4716 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4717 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4718 break;
4719 case ISD::UMULO:
4720 // We generate a UMUL_LOHI and then check if the high word is 0.
4721 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4722 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4723 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4724 LHS, RHS);
4725 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4726 DAG.getConstant(0, dl, MVT::i32));
4727 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4728 break;
4729 case ISD::SMULO:
4730 // We generate a SMUL_LOHI and then check if all the bits of the high word
4731 // are the same as the sign bit of the low word.
4732 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4733 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4734 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4735 LHS, RHS);
4736 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4737 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4738 Value.getValue(0),
4739 DAG.getConstant(31, dl, MVT::i32)));
4740 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4741 break;
4742 } // switch (...)
4743
4744 return std::make_pair(Value, OverflowCmp);
4745}
4746
4748 SDLoc DL(Value);
4749 EVT VT = Value.getValueType();
4750
4751 if (Invert)
4752 Value = DAG.getNode(ISD::SUB, DL, MVT::i32,
4753 DAG.getConstant(1, DL, MVT::i32), Value);
4754
4755 SDValue Cmp = DAG.getNode(ARMISD::SUBC, DL, DAG.getVTList(VT, MVT::i32),
4756 Value, DAG.getConstant(1, DL, VT));
4757 return Cmp.getValue(1);
4758}
4759
4761 bool Invert) {
4762 SDLoc DL(Flags);
4763
4764 if (Invert) {
4765 // Convert flags to boolean with ADDE 0,0,Carry then compute 1 - bool.
4766 SDValue BoolCarry = DAG.getNode(
4767 ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4768 DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, VT), Flags);
4769 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(1, DL, VT), BoolCarry);
4770 }
4771
4772 // Now convert the carry flag into a boolean carry. We do this
4773 // using ARMISD::ADDE 0, 0, Carry
4774 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4775 DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, VT),
4776 Flags);
4777}
4778
4779// Value is 1 if 'V' bit is 1, else 0
4781 SDLoc DL(Flags);
4782 SDValue Zero = DAG.getConstant(0, DL, VT);
4783 SDValue One = DAG.getConstant(1, DL, VT);
4784 SDValue ARMcc = DAG.getConstant(ARMCC::VS, DL, MVT::i32);
4785 return DAG.getNode(ARMISD::CMOV, DL, VT, Zero, One, ARMcc, Flags);
4786}
4787
4788SDValue ARMTargetLowering::LowerALUO(SDValue Op, SelectionDAG &DAG) const {
4789 // Let legalize expand this if it isn't a legal type yet.
4790 if (!isTypeLegal(Op.getValueType()))
4791 return SDValue();
4792
4793 SDValue LHS = Op.getOperand(0);
4794 SDValue RHS = Op.getOperand(1);
4795 SDLoc dl(Op);
4796
4797 EVT VT = Op.getValueType();
4798 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4799 SDValue Value;
4800 SDValue Overflow;
4801 switch (Op.getOpcode()) {
4802 case ISD::UADDO:
4803 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4804 // Convert the carry flag into a boolean value.
4805 Overflow = carryFlagToValue(Value.getValue(1), VT, DAG, false);
4806 break;
4807 case ISD::USUBO:
4808 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4809 // Convert the carry flag into a boolean value.
4810 Overflow = carryFlagToValue(Value.getValue(1), VT, DAG, true);
4811 break;
4812 default: {
4813 // Handle other operations with getARMXALUOOp
4814 SDValue OverflowCmp, ARMcc;
4815 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4816 // We use 0 and 1 as false and true values.
4817 // ARMcc represents the "no overflow" condition (e.g., VC for signed ops).
4818 // CMOV operand order is (FalseVal, TrueVal), so we put 1 in FalseVal
4819 // position to get Overflow=1 when the "no overflow" condition is false.
4820 Overflow =
4821 DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
4822 DAG.getConstant(1, dl, MVT::i32), // FalseVal: overflow
4823 DAG.getConstant(0, dl, MVT::i32), // TrueVal: no overflow
4824 ARMcc, OverflowCmp);
4825 break;
4826 }
4827 }
4828
4829 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4830}
4831
4833 const ARMSubtarget *Subtarget) {
4834 EVT VT = Op.getValueType();
4835 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4836 return SDValue();
4837 if (!VT.isSimple())
4838 return SDValue();
4839
4840 unsigned NewOpcode;
4841 switch (VT.getSimpleVT().SimpleTy) {
4842 default:
4843 return SDValue();
4844 case MVT::i8:
4845 switch (Op->getOpcode()) {
4846 case ISD::UADDSAT:
4847 NewOpcode = ARMISD::UQADD8b;
4848 break;
4849 case ISD::SADDSAT:
4850 NewOpcode = ARMISD::QADD8b;
4851 break;
4852 case ISD::USUBSAT:
4853 NewOpcode = ARMISD::UQSUB8b;
4854 break;
4855 case ISD::SSUBSAT:
4856 NewOpcode = ARMISD::QSUB8b;
4857 break;
4858 }
4859 break;
4860 case MVT::i16:
4861 switch (Op->getOpcode()) {
4862 case ISD::UADDSAT:
4863 NewOpcode = ARMISD::UQADD16b;
4864 break;
4865 case ISD::SADDSAT:
4866 NewOpcode = ARMISD::QADD16b;
4867 break;
4868 case ISD::USUBSAT:
4869 NewOpcode = ARMISD::UQSUB16b;
4870 break;
4871 case ISD::SSUBSAT:
4872 NewOpcode = ARMISD::QSUB16b;
4873 break;
4874 }
4875 break;
4876 }
4877
4878 SDLoc dl(Op);
4879 SDValue Add =
4880 DAG.getNode(NewOpcode, dl, MVT::i32,
4881 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4882 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4883 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4884}
4885
4886SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4887 SDValue Cond = Op.getOperand(0);
4888 SDValue SelectTrue = Op.getOperand(1);
4889 SDValue SelectFalse = Op.getOperand(2);
4890 SDLoc dl(Op);
4891 unsigned Opc = Cond.getOpcode();
4892
4893 if (Cond.getResNo() == 1 &&
4894 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4895 Opc == ISD::USUBO)) {
4896 if (!isTypeLegal(Cond->getValueType(0)))
4897 return SDValue();
4898
4899 SDValue Value, OverflowCmp;
4900 SDValue ARMcc;
4901 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4902 EVT VT = Op.getValueType();
4903
4904 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
4905 }
4906
4907 // Convert:
4908 //
4909 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4910 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4911 //
4912 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4913 const ConstantSDNode *CMOVTrue =
4914 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4915 const ConstantSDNode *CMOVFalse =
4916 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4917
4918 if (CMOVTrue && CMOVFalse) {
4919 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4920 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4921
4922 SDValue True;
4923 SDValue False;
4924 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4925 True = SelectTrue;
4926 False = SelectFalse;
4927 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4928 True = SelectFalse;
4929 False = SelectTrue;
4930 }
4931
4932 if (True.getNode() && False.getNode())
4933 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
4934 Cond.getOperand(3), DAG);
4935 }
4936 }
4937
4938 return DAG.getSelectCC(dl, Cond,
4939 DAG.getConstant(0, dl, Cond.getValueType()),
4940 SelectTrue, SelectFalse, ISD::SETNE);
4941}
4942
4944 bool &swpCmpOps, bool &swpVselOps) {
4945 // Start by selecting the GE condition code for opcodes that return true for
4946 // 'equality'
4947 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4948 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
4949 CondCode = ARMCC::GE;
4950
4951 // and GT for opcodes that return false for 'equality'.
4952 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4953 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
4954 CondCode = ARMCC::GT;
4955
4956 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4957 // to swap the compare operands.
4958 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4959 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
4960 swpCmpOps = true;
4961
4962 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4963 // If we have an unordered opcode, we need to swap the operands to the VSEL
4964 // instruction (effectively negating the condition).
4965 //
4966 // This also has the effect of swapping which one of 'less' or 'greater'
4967 // returns true, so we also swap the compare operands. It also switches
4968 // whether we return true for 'equality', so we compensate by picking the
4969 // opposite condition code to our original choice.
4970 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4971 CC == ISD::SETUGT) {
4972 swpCmpOps = !swpCmpOps;
4973 swpVselOps = !swpVselOps;
4974 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4975 }
4976
4977 // 'ordered' is 'anything but unordered', so use the VS condition code and
4978 // swap the VSEL operands.
4979 if (CC == ISD::SETO) {
4980 CondCode = ARMCC::VS;
4981 swpVselOps = true;
4982 }
4983
4984 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4985 // code and swap the VSEL operands. Also do this if we don't care about the
4986 // unordered case.
4987 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4988 CondCode = ARMCC::EQ;
4989 swpVselOps = true;
4990 }
4991}
4992
4993SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4994 SDValue TrueVal, SDValue ARMcc,
4995 SDValue Flags, SelectionDAG &DAG) const {
4996 if (!Subtarget->hasFP64() && VT == MVT::f64) {
4997 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4998 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4999 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5000 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5001
5002 SDValue TrueLow = TrueVal.getValue(0);
5003 SDValue TrueHigh = TrueVal.getValue(1);
5004 SDValue FalseLow = FalseVal.getValue(0);
5005 SDValue FalseHigh = FalseVal.getValue(1);
5006
5007 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5008 ARMcc, Flags);
5009 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5010 ARMcc, Flags);
5011
5012 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5013 }
5014 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5015}
5016
5017static bool isGTorGE(ISD::CondCode CC) {
5018 return CC == ISD::SETGT || CC == ISD::SETGE;
5019}
5020
5021static bool isLTorLE(ISD::CondCode CC) {
5022 return CC == ISD::SETLT || CC == ISD::SETLE;
5023}
5024
5025// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5026// All of these conditions (and their <= and >= counterparts) will do:
5027// x < k ? k : x
5028// x > k ? x : k
5029// k < x ? x : k
5030// k > x ? k : x
5031static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5032 const SDValue TrueVal, const SDValue FalseVal,
5033 const ISD::CondCode CC, const SDValue K) {
5034 return (isGTorGE(CC) &&
5035 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5036 (isLTorLE(CC) &&
5037 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5038}
5039
5040// Check if two chained conditionals could be converted into SSAT or USAT.
5041//
5042// SSAT can replace a set of two conditional selectors that bound a number to an
5043// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5044//
5045// x < -k ? -k : (x > k ? k : x)
5046// x < -k ? -k : (x < k ? x : k)
5047// x > -k ? (x > k ? k : x) : -k
5048// x < k ? (x < -k ? -k : x) : k
5049// etc.
5050//
5051// LLVM canonicalizes these to either a min(max()) or a max(min())
5052// pattern. This function tries to match one of these and will return a SSAT
5053// node if successful.
5054//
5055// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
5056// is a power of 2.
5058 EVT VT = Op.getValueType();
5059 SDValue V1 = Op.getOperand(0);
5060 SDValue K1 = Op.getOperand(1);
5061 SDValue TrueVal1 = Op.getOperand(2);
5062 SDValue FalseVal1 = Op.getOperand(3);
5063 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5064
5065 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5066 if (Op2.getOpcode() != ISD::SELECT_CC)
5067 return SDValue();
5068
5069 SDValue V2 = Op2.getOperand(0);
5070 SDValue K2 = Op2.getOperand(1);
5071 SDValue TrueVal2 = Op2.getOperand(2);
5072 SDValue FalseVal2 = Op2.getOperand(3);
5073 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5074
5075 SDValue V1Tmp = V1;
5076 SDValue V2Tmp = V2;
5077
5078 // Check that the registers and the constants match a max(min()) or min(max())
5079 // pattern
5080 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5081 K2 != FalseVal2 ||
5082 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5083 return SDValue();
5084
5085 // Check that the constant in the lower-bound check is
5086 // the opposite of the constant in the upper-bound check
5087 // in 1's complement.
5089 return SDValue();
5090
5091 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5092 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5093 int64_t PosVal = std::max(Val1, Val2);
5094 int64_t NegVal = std::min(Val1, Val2);
5095
5096 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5097 !isPowerOf2_64(PosVal + 1))
5098 return SDValue();
5099
5100 // Handle the difference between USAT (unsigned) and SSAT (signed)
5101 // saturation
5102 // At this point, PosVal is guaranteed to be positive
5103 uint64_t K = PosVal;
5104 SDLoc dl(Op);
5105 if (Val1 == ~Val2)
5106 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5107 DAG.getConstant(llvm::countr_one(K), dl, VT));
5108 if (NegVal == 0)
5109 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5110 DAG.getConstant(llvm::countr_one(K), dl, VT));
5111
5112 return SDValue();
5113}
5114
5115// Check if a condition of the type x < k ? k : x can be converted into a
5116// bit operation instead of conditional moves.
5117// Currently this is allowed given:
5118// - The conditions and values match up
5119// - k is 0 or -1 (all ones)
// This function does not check the last condition; that is up to the caller.
5121// It returns true if the transformation can be made, and in such case
5122// returns x in V, and k in SatK.
5124 SDValue &SatK)
5125{
5126 SDValue LHS = Op.getOperand(0);
5127 SDValue RHS = Op.getOperand(1);
5128 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5129 SDValue TrueVal = Op.getOperand(2);
5130 SDValue FalseVal = Op.getOperand(3);
5131
5133 ? &RHS
5134 : nullptr;
5135
5136 // No constant operation in comparison, early out
5137 if (!K)
5138 return false;
5139
5140 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5141 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5142 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5143
5144 // If the constant on left and right side, or variable on left and right,
5145 // does not match, early out
5146 if (*K != KTmp || V != VTmp)
5147 return false;
5148
5149 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5150 SatK = *K;
5151 return true;
5152 }
5153
5154 return false;
5155}
5156
5157bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5158 if (VT == MVT::f32)
5159 return !Subtarget->hasVFP2Base();
5160 if (VT == MVT::f64)
5161 return !Subtarget->hasFP64();
5162 if (VT == MVT::f16)
5163 return !Subtarget->hasFullFP16();
5164 return false;
5165}
5166
5167static SDValue matchCSET(unsigned &Opcode, bool &InvertCond, SDValue TrueVal,
5168 SDValue FalseVal, const ARMSubtarget *Subtarget) {
5169 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5170 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5171 if (!CFVal || !CTVal || !Subtarget->hasV8_1MMainlineOps())
5172 return SDValue();
5173
5174 unsigned TVal = CTVal->getZExtValue();
5175 unsigned FVal = CFVal->getZExtValue();
5176
5177 Opcode = 0;
5178 InvertCond = false;
5179 if (TVal == ~FVal) {
5180 Opcode = ARMISD::CSINV;
5181 } else if (TVal == ~FVal + 1) {
5182 Opcode = ARMISD::CSNEG;
5183 } else if (TVal + 1 == FVal) {
5184 Opcode = ARMISD::CSINC;
5185 } else if (TVal == FVal + 1) {
5186 Opcode = ARMISD::CSINC;
5187 std::swap(TrueVal, FalseVal);
5188 std::swap(TVal, FVal);
5189 InvertCond = !InvertCond;
5190 } else {
5191 return SDValue();
5192 }
5193
5194 // If one of the constants is cheaper than another, materialise the
5195 // cheaper one and let the csel generate the other.
5196 if (Opcode != ARMISD::CSINC &&
5197 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5198 std::swap(TrueVal, FalseVal);
5199 std::swap(TVal, FVal);
5200 InvertCond = !InvertCond;
5201 }
5202
5203 // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5204 // to get there. CSINC not is invertable like the other two (~(~a) == a,
5205 // -(-a) == a, but (a+1)+1 != a).
5206 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5207 std::swap(TrueVal, FalseVal);
5208 std::swap(TVal, FVal);
5209 InvertCond = !InvertCond;
5210 }
5211
5212 return TrueVal;
5213}
5214
5215SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5216 EVT VT = Op.getValueType();
5217 SDLoc dl(Op);
5218
5219 // Try to convert two saturating conditional selects into a single SSAT
5220 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5221 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5222 return SatValue;
5223
5224 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5225 // into more efficient bit operations, which is possible when k is 0 or -1
5226 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5227 // single instructions. On Thumb the shift and the bit operation will be two
5228 // instructions.
5229 // Only allow this transformation on full-width (32-bit) operations
5230 SDValue LowerSatConstant;
5231 SDValue SatValue;
5232 if (VT == MVT::i32 &&
5233 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5234 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5235 DAG.getConstant(31, dl, VT));
5236 if (isNullConstant(LowerSatConstant)) {
5237 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5238 DAG.getAllOnesConstant(dl, VT));
5239 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5240 } else if (isAllOnesConstant(LowerSatConstant))
5241 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5242 }
5243
5244 SDValue LHS = Op.getOperand(0);
5245 SDValue RHS = Op.getOperand(1);
5246 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5247 SDValue TrueVal = Op.getOperand(2);
5248 SDValue FalseVal = Op.getOperand(3);
5249 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5250 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5251 if (Op.getValueType().isInteger()) {
5252
5253 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5254 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5255 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5256 // Both require less instructions than compare and conditional select.
5257 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5258 RHSC->isZero() && CFVal && CFVal->isZero() &&
5259 LHS.getValueType() == RHS.getValueType()) {
5260 EVT VT = LHS.getValueType();
5261 SDValue Shift =
5262 DAG.getNode(ISD::SRA, dl, VT, LHS,
5263 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5264
5265 if (CC == ISD::SETGT)
5266 Shift = DAG.getNOT(dl, Shift, VT);
5267
5268 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5269 }
5270
5271 // (SELECT_CC setlt, x, 0, 1, 0) -> SRL(x, bw-1)
5272 if (CC == ISD::SETLT && isNullConstant(RHS) && isOneConstant(TrueVal) &&
5273 isNullConstant(FalseVal) && LHS.getValueType() == VT)
5274 return DAG.getNode(ISD::SRL, dl, VT, LHS,
5275 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5276 }
5277
5278 if (LHS.getValueType() == MVT::i32) {
5279 unsigned Opcode;
5280 bool InvertCond;
5281 if (SDValue Op =
5282 matchCSET(Opcode, InvertCond, TrueVal, FalseVal, Subtarget)) {
5283 if (InvertCond)
5284 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5285
5286 SDValue ARMcc;
5287 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5288 EVT VT = Op.getValueType();
5289 return DAG.getNode(Opcode, dl, VT, Op, Op, ARMcc, Cmp);
5290 }
5291 }
5292
5293 if (isUnsupportedFloatingType(LHS.getValueType())) {
5294 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5295
5296 // If softenSetCCOperands only returned one value, we should compare it to
5297 // zero.
5298 if (!RHS.getNode()) {
5299 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5300 CC = ISD::SETNE;
5301 }
5302 }
5303
5304 if (LHS.getValueType() == MVT::i32) {
5305 // Try to generate VSEL on ARMv8.
5306 // The VSEL instruction can't use all the usual ARM condition
5307 // codes: it only has two bits to select the condition code, so it's
5308 // constrained to use only GE, GT, VS and EQ.
5309 //
5310 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5311 // swap the operands of the previous compare instruction (effectively
5312 // inverting the compare condition, swapping 'less' and 'greater') and
5313 // sometimes need to swap the operands to the VSEL (which inverts the
5314 // condition in the sense of firing whenever the previous condition didn't)
5315 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5316 TrueVal.getValueType() == MVT::f32 ||
5317 TrueVal.getValueType() == MVT::f64)) {
5319 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5320 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5321 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5322 std::swap(TrueVal, FalseVal);
5323 }
5324 }
5325
5326 SDValue ARMcc;
5327 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5328 // Choose GE over PL, which vsel does now support
5329 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5330 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5331 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5332 }
5333
5334 ARMCC::CondCodes CondCode, CondCode2;
5335 FPCCToARMCC(CC, CondCode, CondCode2);
5336
5337 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5338 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5339 // must use VSEL (limited condition codes), due to not having conditional f16
5340 // moves.
5341 if (Subtarget->hasFPARMv8Base() &&
5342 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5343 (TrueVal.getValueType() == MVT::f16 ||
5344 TrueVal.getValueType() == MVT::f32 ||
5345 TrueVal.getValueType() == MVT::f64)) {
5346 bool swpCmpOps = false;
5347 bool swpVselOps = false;
5348 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5349
5350 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5351 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5352 if (swpCmpOps)
5353 std::swap(LHS, RHS);
5354 if (swpVselOps)
5355 std::swap(TrueVal, FalseVal);
5356 }
5357 }
5358
5359 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5360 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5361 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5362 if (CondCode2 != ARMCC::AL) {
5363 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5364 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5365 }
5366 return Result;
5367}
5368
5369/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5370/// to morph to an integer compare sequence.
5371static bool canChangeToInt(SDValue Op, bool &SeenZero,
5372 const ARMSubtarget *Subtarget) {
5373 SDNode *N = Op.getNode();
5374 if (!N->hasOneUse())
5375 // Otherwise it requires moving the value from fp to integer registers.
5376 return false;
5377 if (!N->getNumValues())
5378 return false;
5379 EVT VT = Op.getValueType();
5380 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5381 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5382 // vmrs are very slow, e.g. cortex-a8.
5383 return false;
5384
5385 if (isFloatingPointZero(Op)) {
5386 SeenZero = true;
5387 return true;
5388 }
5389 return ISD::isNormalLoad(N);
5390}
5391
5394 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5395
5397 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5398 Ld->getPointerInfo(), Ld->getAlign(),
5399 Ld->getMemOperand()->getFlags());
5400
5401 llvm_unreachable("Unknown VFP cmp argument!");
5402}
5403
5405 SDValue &RetVal1, SDValue &RetVal2) {
5406 SDLoc dl(Op);
5407
5408 if (isFloatingPointZero(Op)) {
5409 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5410 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5411 return;
5412 }
5413
5414 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5415 SDValue Ptr = Ld->getBasePtr();
5416 RetVal1 =
5417 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5418 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5419
5420 EVT PtrType = Ptr.getValueType();
5421 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5422 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5423 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5424 Ld->getPointerInfo().getWithOffset(4),
5425 commonAlignment(Ld->getAlign(), 4),
5426 Ld->getMemOperand()->getFlags());
5427 return;
5428 }
5429
5430 llvm_unreachable("Unknown VFP cmp argument!");
5431}
5432
5433/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
5434/// f32 and even f64 comparisons to integer ones.
5435SDValue
5436ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5437 SDValue Chain = Op.getOperand(0);
5438 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5439 SDValue LHS = Op.getOperand(2);
5440 SDValue RHS = Op.getOperand(3);
5441 SDValue Dest = Op.getOperand(4);
5442 SDLoc dl(Op);
5443
5444 bool LHSSeenZero = false;
5445 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5446 bool RHSSeenZero = false;
5447 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5448 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5449 // If unsafe fp math optimization is enabled and there are no other uses of
5450 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5451 // to an integer comparison.
5452 if (CC == ISD::SETOEQ)
5453 CC = ISD::SETEQ;
5454 else if (CC == ISD::SETUNE)
5455 CC = ISD::SETNE;
5456
5457 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5458 SDValue ARMcc;
5459 if (LHS.getValueType() == MVT::f32) {
5460 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5461 bitcastf32Toi32(LHS, DAG), Mask);
5462 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5463 bitcastf32Toi32(RHS, DAG), Mask);
5464 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5465 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5466 Cmp);
5467 }
5468
5469 SDValue LHS1, LHS2;
5470 SDValue RHS1, RHS2;
5471 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5472 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5473 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5474 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5476 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5477 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5478 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5479 }
5480
5481 return SDValue();
5482}
5483
5484// Generate CMP + CMOV for integer abs.
5485SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5486 SDLoc DL(Op);
5487
5488 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5489
5490 // Generate CMP & CMOV.
5491 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5492 DAG.getConstant(0, DL, MVT::i32));
5493 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5494 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5495}
5496
5498 ARMCC::CondCodes CondCode =
5499 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
5500 CondCode = ARMCC::getOppositeCondition(CondCode);
5501 return DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5502}
5503
5504SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5505 SDValue Chain = Op.getOperand(0);
5506 SDValue Cond = Op.getOperand(1);
5507 SDValue Dest = Op.getOperand(2);
5508 SDLoc dl(Op);
5509
5510 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5511 // instruction.
5512 unsigned Opc = Cond.getOpcode();
5513 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5514 !Subtarget->isThumb1Only();
5515 if (Cond.getResNo() == 1 &&
5516 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5517 Opc == ISD::USUBO || OptimizeMul)) {
5518 // Only lower legal XALUO ops.
5519 if (!isTypeLegal(Cond->getValueType(0)))
5520 return SDValue();
5521
5522 // The actual operation with overflow check.
5523 SDValue Value, OverflowCmp;
5524 SDValue ARMcc;
5525 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5526
5527 // Reverse the condition code.
5528 ARMcc = getInvertedARMCondCode(ARMcc, DAG);
5529
5530 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5531 OverflowCmp);
5532 }
5533
5534 return SDValue();
5535}
5536
5537SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5538 SDValue Chain = Op.getOperand(0);
5539 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5540 SDValue LHS = Op.getOperand(2);
5541 SDValue RHS = Op.getOperand(3);
5542 SDValue Dest = Op.getOperand(4);
5543 SDLoc dl(Op);
5544
5545 if (isUnsupportedFloatingType(LHS.getValueType())) {
5546 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5547
5548 // If softenSetCCOperands only returned one value, we should compare it to
5549 // zero.
5550 if (!RHS.getNode()) {
5551 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5552 CC = ISD::SETNE;
5553 }
5554 }
5555
5556 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5557 // instruction.
5558 unsigned Opc = LHS.getOpcode();
5559 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5560 !Subtarget->isThumb1Only();
5561 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5562 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5563 Opc == ISD::USUBO || OptimizeMul) &&
5564 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5565 // Only lower legal XALUO ops.
5566 if (!isTypeLegal(LHS->getValueType(0)))
5567 return SDValue();
5568
5569 // The actual operation with overflow check.
5570 SDValue Value, OverflowCmp;
5571 SDValue ARMcc;
5572 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5573
5574 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5575 // Reverse the condition code.
5576 ARMcc = getInvertedARMCondCode(ARMcc, DAG);
5577 }
5578
5579 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5580 OverflowCmp);
5581 }
5582
5583 if (LHS.getValueType() == MVT::i32) {
5584 SDValue ARMcc;
5585 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5586 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5587 }
5588
5589 SDNodeFlags Flags = Op->getFlags();
5590 if (Flags.hasNoNaNs() &&
5591 DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5592 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
5593 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5594 CC == ISD::SETUNE)) {
5595 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5596 return Result;
5597 }
5598
5599 ARMCC::CondCodes CondCode, CondCode2;
5600 FPCCToARMCC(CC, CondCode, CondCode2);
5601
5602 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5603 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5604 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5605 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5606 if (CondCode2 != ARMCC::AL) {
5607 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5608 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5609 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5610 }
5611 return Res;
5612}
5613
5614SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5615 SDValue Chain = Op.getOperand(0);
5616 SDValue Table = Op.getOperand(1);
5617 SDValue Index = Op.getOperand(2);
5618 SDLoc dl(Op);
5619
5620 EVT PTy = getPointerTy(DAG.getDataLayout());
5621 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5622 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5623 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5624 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5625 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5626 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5627 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5628 // which does another jump to the destination. This also makes it easier
5629 // to translate it to TBB / TBH later (Thumb2 only).
5630 // FIXME: This might not work if the function is extremely large.
5631 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5632 Addr, Op.getOperand(2), JTI);
5633 }
5634 if (isPositionIndependent() || Subtarget->isROPI()) {
5635 Addr =
5636 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5638 Chain = Addr.getValue(1);
5639 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5640 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5641 } else {
5642 Addr =
5643 DAG.getLoad(PTy, dl, Chain, Addr,
5645 Chain = Addr.getValue(1);
5646 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5647 }
5648}
5649
5651 EVT VT = Op.getValueType();
5652 SDLoc dl(Op);
5653
5654 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5655 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5656 return Op;
5657 return DAG.UnrollVectorOp(Op.getNode());
5658 }
5659
5660 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5661
5662 EVT NewTy;
5663 const EVT OpTy = Op.getOperand(0).getValueType();
5664 if (OpTy == MVT::v4f32)
5665 NewTy = MVT::v4i32;
5666 else if (OpTy == MVT::v4f16 && HasFullFP16)
5667 NewTy = MVT::v4i16;
5668 else if (OpTy == MVT::v8f16 && HasFullFP16)
5669 NewTy = MVT::v8i16;
5670 else
5671 llvm_unreachable("Invalid type for custom lowering!");
5672
5673 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5674 return DAG.UnrollVectorOp(Op.getNode());
5675
5676 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5677 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5678}
5679
5680SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5681 EVT VT = Op.getValueType();
5682 if (VT.isVector())
5683 return LowerVectorFP_TO_INT(Op, DAG);
5684
5685 bool IsStrict = Op->isStrictFPOpcode();
5686 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5687
5688 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5689 RTLIB::Libcall LC;
5690 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5691 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5692 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5693 Op.getValueType());
5694 else
5695 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5696 Op.getValueType());
5697 SDLoc Loc(Op);
5698 MakeLibCallOptions CallOptions;
5699 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5701 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5702 CallOptions, Loc, Chain);
5703 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5704 }
5705
5706 // FIXME: Remove this when we have strict fp instruction selection patterns
5707 if (IsStrict) {
5708 SDLoc Loc(Op);
5709 SDValue Result =
5712 Loc, Op.getValueType(), SrcVal);
5713 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5714 }
5715
5716 return Op;
5717}
5718
5720 const ARMSubtarget *Subtarget) {
5721 EVT VT = Op.getValueType();
5722 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5723 EVT FromVT = Op.getOperand(0).getValueType();
5724
5725 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5726 return Op;
5727 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5728 Subtarget->hasFP64())
5729 return Op;
5730 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5731 Subtarget->hasFullFP16())
5732 return Op;
5733 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5734 Subtarget->hasMVEFloatOps())
5735 return Op;
5736 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5737 Subtarget->hasMVEFloatOps())
5738 return Op;
5739
5740 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5741 return SDValue();
5742
5743 SDLoc DL(Op);
5744 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5745 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5746 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5747 DAG.getValueType(VT.getScalarType()));
5748 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5749 DAG.getConstant((1 << BW) - 1, DL, VT));
5750 if (IsSigned)
5751 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5752 DAG.getSignedConstant(-(1 << BW), DL, VT));
5753 return Max;
5754}
5755
5757 EVT VT = Op.getValueType();
5758 SDLoc dl(Op);
5759
5760 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5761 if (VT.getVectorElementType() == MVT::f32)
5762 return Op;
5763 return DAG.UnrollVectorOp(Op.getNode());
5764 }
5765
5766 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5767 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5768 "Invalid type for custom lowering!");
5769
5770 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5771
5772 EVT DestVecType;
5773 if (VT == MVT::v4f32)
5774 DestVecType = MVT::v4i32;
5775 else if (VT == MVT::v4f16 && HasFullFP16)
5776 DestVecType = MVT::v4i16;
5777 else if (VT == MVT::v8f16 && HasFullFP16)
5778 DestVecType = MVT::v8i16;
5779 else
5780 return DAG.UnrollVectorOp(Op.getNode());
5781
5782 unsigned CastOpc;
5783 unsigned Opc;
5784 switch (Op.getOpcode()) {
5785 default: llvm_unreachable("Invalid opcode!");
5786 case ISD::SINT_TO_FP:
5787 CastOpc = ISD::SIGN_EXTEND;
5789 break;
5790 case ISD::UINT_TO_FP:
5791 CastOpc = ISD::ZERO_EXTEND;
5793 break;
5794 }
5795
5796 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5797 return DAG.getNode(Opc, dl, VT, Op);
5798}
5799
5800SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5801 EVT VT = Op.getValueType();
5802 if (VT.isVector())
5803 return LowerVectorINT_TO_FP(Op, DAG);
5804 if (isUnsupportedFloatingType(VT)) {
5805 RTLIB::Libcall LC;
5806 if (Op.getOpcode() == ISD::SINT_TO_FP)
5807 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5808 Op.getValueType());
5809 else
5810 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5811 Op.getValueType());
5812 MakeLibCallOptions CallOptions;
5813 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5814 CallOptions, SDLoc(Op)).first;
5815 }
5816
5817 return Op;
5818}
5819
5820SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5821 // Implement fcopysign with a fabs and a conditional fneg.
5822 SDValue Tmp0 = Op.getOperand(0);
5823 SDValue Tmp1 = Op.getOperand(1);
5824 SDLoc dl(Op);
5825 EVT VT = Op.getValueType();
5826 EVT SrcVT = Tmp1.getValueType();
5827 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5828 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5829 bool UseNEON = !InGPR && Subtarget->hasNEON();
5830
5831 if (UseNEON) {
5832 // Use VBSL to copy the sign bit.
5833 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5834 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5835 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5836 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5837 if (VT == MVT::f64)
5838 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5839 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5840 DAG.getConstant(32, dl, MVT::i32));
5841 else /*if (VT == MVT::f32)*/
5842 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5843 if (SrcVT == MVT::f32) {
5844 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5845 if (VT == MVT::f64)
5846 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5847 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5848 DAG.getConstant(32, dl, MVT::i32));
5849 } else if (VT == MVT::f32)
5850 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5851 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5852 DAG.getConstant(32, dl, MVT::i32));
5853 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5854 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5855
5857 dl, MVT::i32);
5858 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5859 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5860 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5861
5862 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5863 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5864 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5865 if (VT == MVT::f32) {
5866 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5867 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5868 DAG.getConstant(0, dl, MVT::i32));
5869 } else {
5870 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5871 }
5872
5873 return Res;
5874 }
5875
5876 // Bitcast operand 1 to i32.
5877 if (SrcVT == MVT::f64)
5878 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5879 Tmp1).getValue(1);
5880 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5881
5882 // Or in the signbit with integer operations.
5883 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5884 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5885 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5886 if (VT == MVT::f32) {
5887 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5888 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5889 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5890 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5891 }
5892
5893 // f64: Or the high part with signbit and then combine two parts.
5894 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5895 Tmp0);
5896 SDValue Lo = Tmp0.getValue(0);
5897 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5898 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5899 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5900}
5901
5902SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5903 MachineFunction &MF = DAG.getMachineFunction();
5904 MachineFrameInfo &MFI = MF.getFrameInfo();
5905 MFI.setReturnAddressIsTaken(true);
5906
5907 EVT VT = Op.getValueType();
5908 SDLoc dl(Op);
5909 unsigned Depth = Op.getConstantOperandVal(0);
5910 if (Depth) {
5911 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5912 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5913 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5914 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5915 MachinePointerInfo());
5916 }
5917
5918 // Return LR, which contains the return address. Mark it an implicit live-in.
5919 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5920 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5921}
5922
5923SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5924 const ARMBaseRegisterInfo &ARI =
5925 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5926 MachineFunction &MF = DAG.getMachineFunction();
5927 MachineFrameInfo &MFI = MF.getFrameInfo();
5928 MFI.setFrameAddressIsTaken(true);
5929
5930 EVT VT = Op.getValueType();
5931 SDLoc dl(Op); // FIXME probably not meaningful
5932 unsigned Depth = Op.getConstantOperandVal(0);
5933 Register FrameReg = ARI.getFrameRegister(MF);
5934 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5935 while (Depth--)
5936 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5937 MachinePointerInfo());
5938 return FrameAddr;
5939}
5940
5941// FIXME? Maybe this could be a TableGen attribute on some registers and
5942// this table could be generated automatically from RegInfo.
5943Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5944 const MachineFunction &MF) const {
5945 return StringSwitch<Register>(RegName)
5946 .Case("sp", ARM::SP)
5947 .Default(Register());
5948}
5949
5950// Result is 64 bit value so split into two 32 bit values and return as a
5951// pair of values.
5953 SelectionDAG &DAG) {
5954 SDLoc DL(N);
5955
5956 // This function is only supposed to be called for i64 type destination.
5957 assert(N->getValueType(0) == MVT::i64
5958 && "ExpandREAD_REGISTER called for non-i64 type result.");
5959
5961 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5962 N->getOperand(0),
5963 N->getOperand(1));
5964
5965 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5966 Read.getValue(1)));
5967 Results.push_back(Read.getValue(2)); // Chain
5968}
5969
5970/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5971/// When \p DstVT, the destination type of \p BC, is on the vector
5972/// register bank and the source of bitcast, \p Op, operates on the same bank,
5973/// it might be possible to combine them, such that everything stays on the
5974/// vector register bank.
5975/// \p return The node that would replace \p BT, if the combine
5976/// is possible.
5978 SelectionDAG &DAG) {
5979 SDValue Op = BC->getOperand(0);
5980 EVT DstVT = BC->getValueType(0);
5981
5982 // The only vector instruction that can produce a scalar (remember,
5983 // since the bitcast was about to be turned into VMOVDRR, the source
5984 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5985 // Moreover, we can do this combine only if there is one use.
5986 // Finally, if the destination type is not a vector, there is not
5987 // much point on forcing everything on the vector bank.
5988 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5989 !Op.hasOneUse())
5990 return SDValue();
5991
5992 // If the index is not constant, we will introduce an additional
5993 // multiply that will stick.
5994 // Give up in that case.
5995 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5996 if (!Index)
5997 return SDValue();
5998 unsigned DstNumElt = DstVT.getVectorNumElements();
5999
6000 // Compute the new index.
6001 const APInt &APIntIndex = Index->getAPIntValue();
6002 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6003 NewIndex *= APIntIndex;
6004 // Check if the new constant index fits into i32.
6005 if (NewIndex.getBitWidth() > 32)
6006 return SDValue();
6007
6008 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6009 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6010 SDLoc dl(Op);
6011 SDValue ExtractSrc = Op.getOperand(0);
6012 EVT VecVT = EVT::getVectorVT(
6013 *DAG.getContext(), DstVT.getScalarType(),
6014 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6015 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6016 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6017 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6018}
6019
6020/// ExpandBITCAST - If the target supports VFP, this function is called to
6021/// expand a bit convert where either the source or destination type is i64 to
6022/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6023/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6024/// vectors), since the legalizer won't know what to do with that.
6025SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6026 const ARMSubtarget *Subtarget) const {
6027 SDLoc dl(N);
6028 SDValue Op = N->getOperand(0);
6029
6030 // This function is only supposed to be called for i16 and i64 types, either
6031 // as the source or destination of the bit convert.
6032 EVT SrcVT = Op.getValueType();
6033 EVT DstVT = N->getValueType(0);
6034
6035 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6036 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6037 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6038 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6039
6040 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6041 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6042 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6043 Op = DAG.getBitcast(MVT::f16, Op);
6044 return DAG.getNode(
6045 ISD::TRUNCATE, SDLoc(N), DstVT,
6046 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6047 }
6048
6049 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6050 return SDValue();
6051
6052 // Turn i64->f64 into VMOVDRR.
6053 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6054 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6055 // if we can combine the bitcast with its source.
6057 return Val;
6058 SDValue Lo, Hi;
6059 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6060 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6061 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6062 }
6063
6064 // Turn f64->i64 into VMOVRRD.
6065 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6066 SDValue Cvt;
6067 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6068 SrcVT.getVectorNumElements() > 1)
6069 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6070 DAG.getVTList(MVT::i32, MVT::i32),
6071 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6072 else
6073 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6074 DAG.getVTList(MVT::i32, MVT::i32), Op);
6075 // Merge the pieces into a single i64 value.
6076 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6077 }
6078
6079 return SDValue();
6080}
6081
6082/// getZeroVector - Returns a vector of specified type with all zero elements.
6083/// Zero vectors are used to represent vector negation and in those cases
6084/// will be implemented with the NEON VNEG instruction. However, VNEG does
6085/// not support i64 elements, so sometimes the zero vectors will need to be
6086/// explicitly constructed. Regardless, use a canonical VMOV to create the
6087/// zero vector.
6088static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6089 assert(VT.isVector() && "Expected a vector type");
6090 // The canonical modified immediate encoding of a zero vector is....0!
6091 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6092 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6093 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6094 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6095}
6096
6097/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6098/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6099SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6100 SelectionDAG &DAG) const {
6101 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6102 EVT VT = Op.getValueType();
6103 unsigned VTBits = VT.getSizeInBits();
6104 SDLoc dl(Op);
6105 SDValue ShOpLo = Op.getOperand(0);
6106 SDValue ShOpHi = Op.getOperand(1);
6107 SDValue ShAmt = Op.getOperand(2);
6108 SDValue ARMcc;
6109 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6110
6111 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6112
6113 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6114 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6115 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6116 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6117 DAG.getConstant(VTBits, dl, MVT::i32));
6118 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6119 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6120 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6121 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6122 ISD::SETGE, ARMcc, DAG, dl);
6123 SDValue Lo =
6124 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6125
6126 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6127 SDValue HiBigShift = Opc == ISD::SRA
6128 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6129 DAG.getConstant(VTBits - 1, dl, VT))
6130 : DAG.getConstant(0, dl, VT);
6131 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6132 ISD::SETGE, ARMcc, DAG, dl);
6133 SDValue Hi =
6134 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6135
6136 SDValue Ops[2] = { Lo, Hi };
6137 return DAG.getMergeValues(Ops, dl);
6138}
6139
6140/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6141/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6142SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6143 SelectionDAG &DAG) const {
6144 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6145 EVT VT = Op.getValueType();
6146 unsigned VTBits = VT.getSizeInBits();
6147 SDLoc dl(Op);
6148 SDValue ShOpLo = Op.getOperand(0);
6149 SDValue ShOpHi = Op.getOperand(1);
6150 SDValue ShAmt = Op.getOperand(2);
6151 SDValue ARMcc;
6152
6153 assert(Op.getOpcode() == ISD::SHL_PARTS);
6154 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6155 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6156 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6157 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6158 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6159
6160 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6161 DAG.getConstant(VTBits, dl, MVT::i32));
6162 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6163 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6164 ISD::SETGE, ARMcc, DAG, dl);
6165 SDValue Hi =
6166 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6167
6168 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6169 ISD::SETGE, ARMcc, DAG, dl);
6170 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6171 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6172 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6173
6174 SDValue Ops[2] = { Lo, Hi };
6175 return DAG.getMergeValues(Ops, dl);
6176}
6177
6178SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6179 SelectionDAG &DAG) const {
6180 // The rounding mode is in bits 23:22 of the FPSCR.
6181 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6182 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
6183 // so that the shift + and get folded into a bitfield extract.
6184 SDLoc dl(Op);
6185 SDValue Chain = Op.getOperand(0);
6186 SDValue Ops[] = {Chain,
6187 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6188
6189 SDValue FPSCR =
6190 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6191 Chain = FPSCR.getValue(1);
6192 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6193 DAG.getConstant(1U << 22, dl, MVT::i32));
6194 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6195 DAG.getConstant(22, dl, MVT::i32));
6196 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6197 DAG.getConstant(3, dl, MVT::i32));
6198 return DAG.getMergeValues({And, Chain}, dl);
6199}
6200
6201SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6202 SelectionDAG &DAG) const {
6203 SDLoc DL(Op);
6204 SDValue Chain = Op->getOperand(0);
6205 SDValue RMValue = Op->getOperand(1);
6206
6207 // The rounding mode is in bits 23:22 of the FPSCR.
6208 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6209 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6210 // ((arg - 1) & 3) << 22).
6211 //
6212 // It is expected that the argument of llvm.set.rounding is within the
6213 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6214 // responsibility of the code generated llvm.set.rounding to ensure this
6215 // condition.
6216
6217 // Calculate new value of FPSCR[23:22].
6218 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6219 DAG.getConstant(1, DL, MVT::i32));
6220 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6221 DAG.getConstant(0x3, DL, MVT::i32));
6222 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6223 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6224
6225 // Get current value of FPSCR.
6226 SDValue Ops[] = {Chain,
6227 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6228 SDValue FPSCR =
6229 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6230 Chain = FPSCR.getValue(1);
6231 FPSCR = FPSCR.getValue(0);
6232
6233 // Put new rounding mode into FPSCR[23:22].
6234 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6235 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6236 DAG.getConstant(RMMask, DL, MVT::i32));
6237 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6238 SDValue Ops2[] = {
6239 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6240 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6241}
6242
6243SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6244 SelectionDAG &DAG) const {
6245 SDLoc DL(Op);
6246 SDValue Chain = Op->getOperand(0);
6247 SDValue Mode = Op->getOperand(1);
6248
6249 // Generate nodes to build:
6250 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6251 SDValue Ops[] = {Chain,
6252 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6253 SDValue FPSCR =
6254 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6255 Chain = FPSCR.getValue(1);
6256 FPSCR = FPSCR.getValue(0);
6257
6258 SDValue FPSCRMasked =
6259 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6260 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6261 SDValue InputMasked =
6262 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6263 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6264 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6265
6266 SDValue Ops2[] = {
6267 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6268 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6269}
6270
6271SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6272 SelectionDAG &DAG) const {
6273 SDLoc DL(Op);
6274 SDValue Chain = Op->getOperand(0);
6275
6276 // To get the default FP mode all control bits are cleared:
6277 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6278 SDValue Ops[] = {Chain,
6279 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6280 SDValue FPSCR =
6281 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6282 Chain = FPSCR.getValue(1);
6283 FPSCR = FPSCR.getValue(0);
6284
6285 SDValue FPSCRMasked = DAG.getNode(
6286 ISD::AND, DL, MVT::i32, FPSCR,
6288 SDValue Ops2[] = {Chain,
6289 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6290 FPSCRMasked};
6291 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6292}
6293
6295 const ARMSubtarget *ST) {
6296 SDLoc dl(N);
6297 EVT VT = N->getValueType(0);
6298 if (VT.isVector() && ST->hasNEON()) {
6299
6300 // Compute the least significant set bit: LSB = X & -X
6301 SDValue X = N->getOperand(0);
6302 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6303 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6304
6305 EVT ElemTy = VT.getVectorElementType();
6306
6307 if (ElemTy == MVT::i8) {
6308 // Compute with: cttz(x) = ctpop(lsb - 1)
6309 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6310 DAG.getTargetConstant(1, dl, ElemTy));
6311 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6312 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6313 }
6314
6315 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6316 (N->getOpcode() == ISD::CTTZ_ZERO_POISON)) {
6317 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6318 unsigned NumBits = ElemTy.getSizeInBits();
6319 SDValue WidthMinus1 =
6320 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6321 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6322 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6323 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6324 }
6325
6326 // Compute with: cttz(x) = ctpop(lsb - 1)
6327
6328 // Compute LSB - 1.
6329 SDValue Bits;
6330 if (ElemTy == MVT::i64) {
6331 // Load constant 0xffff'ffff'ffff'ffff to register.
6332 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6333 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6334 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6335 } else {
6336 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6337 DAG.getTargetConstant(1, dl, ElemTy));
6338 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6339 }
6340 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6341 }
6342
6343 if (!ST->hasV6T2Ops())
6344 return SDValue();
6345
6346 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6347 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6348}
6349
6351 const ARMSubtarget *ST) {
6352 EVT VT = N->getValueType(0);
6353 SDLoc DL(N);
6354
6355 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6356 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6357 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6358 "Unexpected type for custom ctpop lowering");
6359
6360 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6361 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6362 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6363 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6364
6365 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6366 unsigned EltSize = 8;
6367 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6368 while (EltSize != VT.getScalarSizeInBits()) {
6370 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6371 TLI.getPointerTy(DAG.getDataLayout())));
6372 Ops.push_back(Res);
6373
6374 EltSize *= 2;
6375 NumElts /= 2;
6376 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6377 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6378 }
6379
6380 return Res;
6381}
6382
6383/// Getvshiftimm - Check if this is a valid build_vector for the immediate
6384/// operand of a vector shift operation, where all the elements of the
6385/// build_vector must have the same constant integer value.
6386static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6387 // Ignore bit_converts.
6388 while (Op.getOpcode() == ISD::BITCAST)
6389 Op = Op.getOperand(0);
6391 APInt SplatBits, SplatUndef;
6392 unsigned SplatBitSize;
6393 bool HasAnyUndefs;
6394 if (!BVN ||
6395 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6396 ElementBits) ||
6397 SplatBitSize > ElementBits)
6398 return false;
6399 Cnt = SplatBits.getSExtValue();
6400 return true;
6401}
6402
6403/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6404/// operand of a vector shift left operation. That value must be in the range:
6405/// 0 <= Value < ElementBits for a left shift; or
6406/// 0 <= Value <= ElementBits for a long left shift.
6407static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6408 assert(VT.isVector() && "vector shift count is not a vector type");
6409 int64_t ElementBits = VT.getScalarSizeInBits();
6410 if (!getVShiftImm(Op, ElementBits, Cnt))
6411 return false;
6412 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6413}
6414
6415/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6416/// operand of a vector shift right operation. For a shift opcode, the value
6417/// is positive, but for an intrinsic the value count must be negative. The
6418/// absolute value must be in the range:
6419/// 1 <= |Value| <= ElementBits for a right shift; or
6420/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6421static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6422 int64_t &Cnt) {
6423 assert(VT.isVector() && "vector shift count is not a vector type");
6424 int64_t ElementBits = VT.getScalarSizeInBits();
6425 if (!getVShiftImm(Op, ElementBits, Cnt))
6426 return false;
6427 if (!isIntrinsic)
6428 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6429 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6430 Cnt = -Cnt;
6431 return true;
6432 }
6433 return false;
6434}
6435
6437 const ARMSubtarget *ST) {
6438 EVT VT = N->getValueType(0);
6439 SDLoc dl(N);
6440 int64_t Cnt;
6441
6442 if (!VT.isVector())
6443 return SDValue();
6444
6445 // We essentially have two forms here. Shift by an immediate and shift by a
6446 // vector register (there are also shift by a gpr, but that is just handled
6447 // with a tablegen pattern). We cannot easily match shift by an immediate in
6448 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6449 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6450 // signed or unsigned, and a negative shift indicates a shift right).
6451 if (N->getOpcode() == ISD::SHL) {
6452 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6453 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6454 DAG.getConstant(Cnt, dl, MVT::i32));
6455 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6456 N->getOperand(1));
6457 }
6458
6459 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6460 "unexpected vector shift opcode");
6461
6462 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6463 unsigned VShiftOpc =
6464 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6465 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6466 DAG.getConstant(Cnt, dl, MVT::i32));
6467 }
6468
6469 // Other right shifts we don't have operations for (we use a shift left by a
6470 // negative number).
6471 EVT ShiftVT = N->getOperand(1).getValueType();
6472 SDValue NegatedCount = DAG.getNode(
6473 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6474 unsigned VShiftOpc =
6475 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6476 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6477}
6478
6480 const ARMSubtarget *ST) {
6481 EVT VT = N->getValueType(0);
6482 SDLoc dl(N);
6483
6484 // We can get here for a node like i32 = ISD::SHL i32, i64
6485 if (VT != MVT::i64)
6486 return SDValue();
6487
6488 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6489 N->getOpcode() == ISD::SHL) &&
6490 "Unknown shift to lower!");
6491
6492 unsigned ShOpc = N->getOpcode();
6493 if (ST->hasMVEIntegerOps()) {
6494 SDValue ShAmt = N->getOperand(1);
6495 unsigned ShPartsOpc = ARMISD::LSLL;
6497
6498 // If the shift amount is greater than 32 or has a greater bitwidth than 64
6499 // then do the default optimisation
6500 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6501 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6502 return SDValue();
6503
6504 // Extract the lower 32 bits of the shift amount if it's not an i32
6505 if (ShAmt->getValueType(0) != MVT::i32)
6506 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6507
6508 if (ShOpc == ISD::SRL) {
6509 if (!Con)
6510 // There is no t2LSRLr instruction so negate and perform an lsll if the
6511 // shift amount is in a register, emulating a right shift.
6512 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6513 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6514 else
6515 // Else generate an lsrl on the immediate shift amount
6516 ShPartsOpc = ARMISD::LSRL;
6517 } else if (ShOpc == ISD::SRA)
6518 ShPartsOpc = ARMISD::ASRL;
6519
6520 // Split Lower/Upper 32 bits of the destination/source
6521 SDValue Lo, Hi;
6522 std::tie(Lo, Hi) =
6523 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6524 // Generate the shift operation as computed above
6525 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6526 ShAmt);
6527 // The upper 32 bits come from the second return value of lsll
6528 Hi = SDValue(Lo.getNode(), 1);
6529 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6530 }
6531
6532 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6533 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6534 return SDValue();
6535
6536 // If we are in thumb mode, we don't have RRX.
6537 if (ST->isThumb1Only())
6538 return SDValue();
6539
6540 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6541 SDValue Lo, Hi;
6542 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6543
6544 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6545 // captures the shifted out bit into a carry flag.
6546 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6547 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6548
6549 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6550 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6551
6552 // Merge the pieces into a single i64 value.
6553 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6554}
6555
// Lowers a vector comparison node: operands 0 and 1 are the values being
// compared and operand 2 is the condition code. The result is built from
// ARMISD::VCMP / ARMISD::VCMPZ (or ARMISD::VTST for the and-with-zero NEON
// pattern). `Swap` records that the operands must be exchanged and `Invert`
// that the final result must be bitwise negated to express the requested
// condition with the available ARM condition codes. An empty SDValue is
// returned for the cases this routine declines to handle.
// NOTE(review): listing line 6556, which carries the function name and its
// leading parameters, is absent from this listing; `Op` and `DAG` are used
// below, so presumably they are parameters -- confirm against the full file.
6557 const ARMSubtarget *ST) {
6558 bool Invert = false;
6559 bool Swap = false;
6560 unsigned Opc = ARMCC::AL;
6561
6562 SDValue Op0 = Op.getOperand(0);
6563 SDValue Op1 = Op.getOperand(1);
6564 SDValue CC = Op.getOperand(2);
6565 EVT VT = Op.getValueType();
6566 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6567 SDLoc dl(Op);
6568
6569 EVT CmpVT;
// NOTE(review): listing line 6571 (the body of the NEON branch, presumably
// the assignment of CmpVT) is missing here -- confirm against the full file.
6570 if (ST->hasNEON())
6572 else {
6573 assert(ST->hasMVEIntegerOps() &&
6574 "No hardware support for integer vector comparison!");
6575
6576 if (Op.getValueType().getVectorElementType() != MVT::i1)
6577 return SDValue();
6578
6579 // Make sure we expand floating point setcc to scalar if we do not have
6580 // mve.fp, so that we can handle them from there.
6581 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6582 return SDValue();
6583
6584 CmpVT = VT;
6585 }
6586
6587 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6588 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6589 // Special-case integer 64-bit equality comparisons. They aren't legal,
6590 // but they can be lowered with a few vector instructions.
// The operands are reinterpreted as vectors of i32 with twice the lane count,
// compared for equality, and each compare result is ANDed with its VREV64'd
// copy before being cast back to CmpVT.
6591 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6592 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6593 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6594 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6595 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6596 DAG.getCondCode(ISD::SETEQ));
6597 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6598 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6599 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6600 if (SetCCOpcode == ISD::SETNE)
6601 Merged = DAG.getNOT(dl, Merged, CmpVT);
6602 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6603 return Merged;
6604 }
6605
6606 if (CmpVT.getVectorElementType() == MVT::i64)
6607 // 64-bit comparisons are not legal in general.
6608 return SDValue();
6609
6610 if (Op1.getValueType().isFloatingPoint()) {
// Floating-point comparisons: map each condition onto EQ/NE/GT/GE, using
// Swap for the "less than" forms and Invert for the unordered forms.
6611 switch (SetCCOpcode) {
6612 default: llvm_unreachable("Illegal FP comparison");
6613 case ISD::SETUNE:
6614 case ISD::SETNE:
6615 if (ST->hasMVEFloatOps()) {
6616 Opc = ARMCC::NE; break;
6617 } else {
6618 Invert = true; [[fallthrough]];
6619 }
6620 case ISD::SETOEQ:
6621 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6622 case ISD::SETOLT:
6623 case ISD::SETLT: Swap = true; [[fallthrough]];
6624 case ISD::SETOGT:
6625 case ISD::SETGT: Opc = ARMCC::GT; break;
6626 case ISD::SETOLE:
6627 case ISD::SETLE: Swap = true; [[fallthrough]];
6628 case ISD::SETOGE:
6629 case ISD::SETGE: Opc = ARMCC::GE; break;
6630 case ISD::SETUGE: Swap = true; [[fallthrough]];
6631 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6632 case ISD::SETUGT: Swap = true; [[fallthrough]];
6633 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6634 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6635 case ISD::SETONE: {
6636 // Expand this to (OLT | OGT).
6637 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6638 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6639 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6640 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6641 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6642 if (Invert)
6643 Result = DAG.getNOT(dl, Result, VT);
6644 return Result;
6645 }
6646 case ISD::SETUO: Invert = true; [[fallthrough]];
6647 case ISD::SETO: {
6648 // Expand this to (OLT | OGE).
6649 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6650 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6651 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6652 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6653 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6654 if (Invert)
6655 Result = DAG.getNOT(dl, Result, VT);
6656 return Result;
6657 }
6658 }
6659 } else {
6660 // Integer comparisons.
6661 switch (SetCCOpcode) {
6662 default: llvm_unreachable("Illegal integer comparison");
6663 case ISD::SETNE:
6664 if (ST->hasMVEIntegerOps()) {
6665 Opc = ARMCC::NE; break;
6666 } else {
6667 Invert = true; [[fallthrough]];
6668 }
6669 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6670 case ISD::SETLT: Swap = true; [[fallthrough]];
6671 case ISD::SETGT: Opc = ARMCC::GT; break;
6672 case ISD::SETLE: Swap = true; [[fallthrough]];
6673 case ISD::SETGE: Opc = ARMCC::GE; break;
6674 case ISD::SETULT: Swap = true; [[fallthrough]];
6675 case ISD::SETUGT: Opc = ARMCC::HI; break;
6676 case ISD::SETULE: Swap = true; [[fallthrough]];
6677 case ISD::SETUGE: Opc = ARMCC::HS; break;
6678 }
6679
6680 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6681 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6682 SDValue AndOp;
// NOTE(review): listing line 6683 (the condition guarding `AndOp = Op0`) is
// missing here; by symmetry with line 6685 it presumably tests whether Op1 is
// an all-zeros build vector -- confirm against the full file.
6684 AndOp = Op0;
6685 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6686 AndOp = Op1;
6687
6688 // Ignore bitconvert.
6689 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6690 AndOp = AndOp.getOperand(0);
6691
6692 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6693 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6694 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6695 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
// VTST is negated here exactly when Invert is *not* set, i.e. for the plain
// equality form; the inverted (not-equal) form uses the VTST result directly.
6696 if (!Invert)
6697 Result = DAG.getNOT(dl, Result, VT);
6698 return Result;
6699 }
6700 }
6701 }
6702
6703 if (Swap)
6704 std::swap(Op0, Op1);
6705
6706 // If one of the operands is a constant vector zero, attempt to fold the
6707 // comparison to a specialized compare-against-zero form.
// NOTE(review): listing line 6708 (the start of this `if` condition) is
// missing here. The body flips GE->LE and GT->LT and swaps the operands,
// which suggests the missing test is "Op0 is all zeros" -- confirm.
6709 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6710 Opc == ARMCC::NE)) {
6711 if (Opc == ARMCC::GE)
6712 Opc = ARMCC::LE;
6713 else if (Opc == ARMCC::GT)
6714 Opc = ARMCC::LT;
6715 std::swap(Op0, Op1);
6716 }
6717
6718 SDValue Result;
// NOTE(review): listing line 6719 (the start of this `if` condition) is
// missing here. The taken branch emits VCMPZ with only Op0, so the missing
// test presumably checks that Op1 is all zeros -- confirm.
6720 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6721 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6722 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6723 DAG.getConstant(Opc, dl, MVT::i32));
6724 else
6725 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6726 DAG.getConstant(Opc, dl, MVT::i32));
6727
// The compare is produced in CmpVT and then sign-extended or truncated to the
// node's result type before the optional final negation.
6728 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6729
6730 if (Invert)
6731 Result = DAG.getNOT(dl, Result, VT);
6732
6733 return Result;
6734}
6735
// Lowers a carry-consuming integer comparison. Operands 0 and 1 are the
// values compared, operand 2 is the incoming carry and operand 3 is the
// condition code. The comparison is performed with an ARMISD::SUBE node and
// the boolean result is materialized by an ARMISD::CMOV choosing between the
// constants 0 and 1.
// NOTE(review): listing line 6736, which carries this function's signature,
// is absent from this listing; `Op` and `DAG` are used below, so presumably
// they are parameters -- confirm against the full file.
6737 SDValue LHS = Op.getOperand(0);
6738 SDValue RHS = Op.getOperand(1);
6739
6740 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6741
6742 SDValue Carry = Op.getOperand(2);
6743 SDValue Cond = Op.getOperand(3);
6744 SDLoc DL(Op);
6745
6746 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6747 // have to invert the carry first.
6748 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
6749
// The SUBE node yields two results: the difference (typed like LHS) and a
// second i32 result; only the second one is consumed below.
6750 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6751 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, InvCarry);
6752
6753 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6754 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
// Translate the generic integer condition code into its ARM condition code.
6755 SDValue ARMcc = DAG.getConstant(
6756 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6757 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6758 Cmp.getValue(1));
6759}
6760
6761/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6762/// valid vector constant for a NEON or MVE instruction with a "modified
6763/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6764static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6765 unsigned SplatBitSize, SelectionDAG &DAG,
6766 const SDLoc &dl, EVT &VT, EVT VectorVT,
6767 VMOVModImmType type) {
6768 unsigned OpCmode, Imm;
6769 bool is128Bits = VectorVT.is128BitVector();
6770
6771 // SplatBitSize is set to the smallest size that splats the vector, so a
6772 // zero vector will always have SplatBitSize == 8. However, NEON modified
6773 // immediate instructions others than VMOV do not support the 8-bit encoding
6774 // of a zero vector, and the default encoding of zero is supposed to be the
6775 // 32-bit version.
6776 if (SplatBits == 0)
6777 SplatBitSize = 32;
6778
6779 switch (SplatBitSize) {
6780 case 8:
6781 if (type != VMOVModImm)
6782 return SDValue();
6783 // Any 1-byte value is OK. Op=0, Cmode=1110.
6784 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6785 OpCmode = 0xe;
6786 Imm = SplatBits;
6787 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6788 break;
6789
6790 case 16:
6791 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6792 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6793 if ((SplatBits & ~0xff) == 0) {
6794 // Value = 0x00nn: Op=x, Cmode=100x.
6795 OpCmode = 0x8;
6796 Imm = SplatBits;
6797 break;
6798 }
6799 if ((SplatBits & ~0xff00) == 0) {
6800 // Value = 0xnn00: Op=x, Cmode=101x.
6801 OpCmode = 0xa;
6802 Imm = SplatBits >> 8;
6803 break;
6804 }
6805 return SDValue();
6806
6807 case 32:
6808 // NEON's 32-bit VMOV supports splat values where:
6809 // * only one byte is nonzero, or
6810 // * the least significant byte is 0xff and the second byte is nonzero, or
6811 // * the least significant 2 bytes are 0xff and the third is nonzero.
6812 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6813 if ((SplatBits & ~0xff) == 0) {
6814 // Value = 0x000000nn: Op=x, Cmode=000x.
6815 OpCmode = 0;
6816 Imm = SplatBits;
6817 break;
6818 }
6819 if ((SplatBits & ~0xff00) == 0) {
6820 // Value = 0x0000nn00: Op=x, Cmode=001x.
6821 OpCmode = 0x2;
6822 Imm = SplatBits >> 8;
6823 break;
6824 }
6825 if ((SplatBits & ~0xff0000) == 0) {
6826 // Value = 0x00nn0000: Op=x, Cmode=010x.
6827 OpCmode = 0x4;
6828 Imm = SplatBits >> 16;
6829 break;
6830 }
6831 if ((SplatBits & ~0xff000000) == 0) {
6832 // Value = 0xnn000000: Op=x, Cmode=011x.
6833 OpCmode = 0x6;
6834 Imm = SplatBits >> 24;
6835 break;
6836 }
6837
6838 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6839 if (type == OtherModImm) return SDValue();
6840
6841 if ((SplatBits & ~0xffff) == 0 &&
6842 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6843 // Value = 0x0000nnff: Op=x, Cmode=1100.
6844 OpCmode = 0xc;
6845 Imm = SplatBits >> 8;
6846 break;
6847 }
6848
6849 // cmode == 0b1101 is not supported for MVE VMVN
6850 if (type == MVEVMVNModImm)
6851 return SDValue();
6852
6853 if ((SplatBits & ~0xffffff) == 0 &&
6854 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6855 // Value = 0x00nnffff: Op=x, Cmode=1101.
6856 OpCmode = 0xd;
6857 Imm = SplatBits >> 16;
6858 break;
6859 }
6860
6861 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6862 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6863 // VMOV.I32. A (very) minor optimization would be to replicate the value
6864 // and fall through here to test for a valid 64-bit splat. But, then the
6865 // caller would also need to check and handle the change in size.
6866 return SDValue();
6867
6868 case 64: {
6869 if (type != VMOVModImm)
6870 return SDValue();
6871 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6872 uint64_t BitMask = 0xff;
6873 unsigned ImmMask = 1;
6874 Imm = 0;
6875 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6876 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6877 Imm |= ImmMask;
6878 } else if ((SplatBits & BitMask) != 0) {
6879 return SDValue();
6880 }
6881 BitMask <<= 8;
6882 ImmMask <<= 1;
6883 }
6884
6885 // Op=1, Cmode=1110.
6886 OpCmode = 0x1e;
6887 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6888 break;
6889 }
6890
6891 default:
6892 llvm_unreachable("unexpected size for isVMOVModifiedImm");
6893 }
6894
6895 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6896 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6897}
6898
6899SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6900 const ARMSubtarget *ST) const {
6901 EVT VT = Op.getValueType();
6902 bool IsDouble = (VT == MVT::f64);
6903 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6904 const APFloat &FPVal = CFP->getValueAPF();
6905
6906 // Prevent floating-point constants from using literal loads
6907 // when execute-only is enabled.
6908 if (ST->genExecuteOnly()) {
6909 // We shouldn't trigger this for v6m execute-only
6910 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
6911 "Unexpected architecture");
6912
6913 // If we can represent the constant as an immediate, don't lower it
6914 if (isFPImmLegal(FPVal, VT))
6915 return Op;
6916 // Otherwise, construct as integer, and move to float register
6917 APInt INTVal = FPVal.bitcastToAPInt();
6918 SDLoc DL(CFP);
6919 switch (VT.getSimpleVT().SimpleTy) {
6920 default:
6921 llvm_unreachable("Unknown floating point type!");
6922 break;
6923 case MVT::f64: {
6924 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6925 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6926 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6927 }
6928 case MVT::f32:
6929 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6930 DAG.getConstant(INTVal, DL, MVT::i32));
6931 }
6932 }
6933
6934 if (!ST->hasVFP3Base())
6935 return SDValue();
6936
6937 // Use the default (constant pool) lowering for double constants when we have
6938 // an SP-only FPU
6939 if (IsDouble && !Subtarget->hasFP64())
6940 return SDValue();
6941
6942 // Try splatting with a VMOV.f32...
6943 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6944
6945 if (ImmVal != -1) {
6946 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6947 // We have code in place to select a valid ConstantFP already, no need to
6948 // do any mangling.
6949 return Op;
6950 }
6951
6952 // It's a float and we are trying to use NEON operations where
6953 // possible. Lower it to a splat followed by an extract.
6954 SDLoc DL(Op);
6955 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6956 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6957 NewVal);
6958 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6959 DAG.getConstant(0, DL, MVT::i32));
6960 }
6961
6962 // The rest of our options are NEON only, make sure that's allowed before
6963 // proceeding..
6964 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6965 return SDValue();
6966
6967 EVT VMovVT;
6968 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6969
6970 // It wouldn't really be worth bothering for doubles except for one very
6971 // important value, which does happen to match: 0.0. So make sure we don't do
6972 // anything stupid.
6973 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6974 return SDValue();
6975
6976 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6977 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6978 VMovVT, VT, VMOVModImm);
6979 if (NewVal != SDValue()) {
6980 SDLoc DL(Op);
6981 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6982 NewVal);
6983 if (IsDouble)
6984 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6985
6986 // It's a float: cast and extract a vector element.
6987 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6988 VecConstant);
6989 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6990 DAG.getConstant(0, DL, MVT::i32));
6991 }
6992
6993 // Finally, try a VMVN.i32
6994 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6995 VT, VMVNModImm);
6996 if (NewVal != SDValue()) {
6997 SDLoc DL(Op);
6998 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6999
7000 if (IsDouble)
7001 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7002
7003 // It's a float: cast and extract a vector element.
7004 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7005 VecConstant);
7006 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7007 DAG.getConstant(0, DL, MVT::i32));
7008 }
7009
7010 return SDValue();
7011}
7012
7013// check if an VEXT instruction can handle the shuffle mask when the
7014// vector sources of the shuffle are the same.
7015static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7016 unsigned NumElts = VT.getVectorNumElements();
7017
7018 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7019 if (M[0] < 0)
7020 return false;
7021
7022 Imm = M[0];
7023
7024 // If this is a VEXT shuffle, the immediate value is the index of the first
7025 // element. The other shuffle indices must be the successive elements after
7026 // the first one.
7027 unsigned ExpectedElt = Imm;
7028 for (unsigned i = 1; i < NumElts; ++i) {
7029 // Increment the expected index. If it wraps around, just follow it
7030 // back to index zero and keep going.
7031 ++ExpectedElt;
7032 if (ExpectedElt == NumElts)
7033 ExpectedElt = 0;
7034
7035 if (M[i] < 0) continue; // ignore UNDEF indices
7036 if (ExpectedElt != static_cast<unsigned>(M[i]))
7037 return false;
7038 }
7039
7040 return true;
7041}
7042
7043static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7044 bool &ReverseVEXT, unsigned &Imm) {
7045 unsigned NumElts = VT.getVectorNumElements();
7046 ReverseVEXT = false;
7047
7048 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7049 if (M[0] < 0)
7050 return false;
7051
7052 Imm = M[0];
7053
7054 // If this is a VEXT shuffle, the immediate value is the index of the first
7055 // element. The other shuffle indices must be the successive elements after
7056 // the first one.
7057 unsigned ExpectedElt = Imm;
7058 for (unsigned i = 1; i < NumElts; ++i) {
7059 // Increment the expected index. If it wraps around, it may still be
7060 // a VEXT but the source vectors must be swapped.
7061 ExpectedElt += 1;
7062 if (ExpectedElt == NumElts * 2) {
7063 ExpectedElt = 0;
7064 ReverseVEXT = true;
7065 }
7066
7067 if (M[i] < 0) continue; // ignore UNDEF indices
7068 if (ExpectedElt != static_cast<unsigned>(M[i]))
7069 return false;
7070 }
7071
7072 // Adjust the index value if the source operands will be swapped.
7073 if (ReverseVEXT)
7074 Imm -= NumElts;
7075
7076 return true;
7077}
7078
7079static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7080 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7081 // range, then 0 is placed into the resulting vector. So pretty much any mask
7082 // of 8 elements can work here.
7083 return VT == MVT::v8i8 && M.size() == 8;
7084}
7085
7086static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7087 unsigned Index) {
7088 if (Mask.size() == Elements * 2)
7089 return Index / Elements;
7090 return Mask[Index] == 0 ? 0 : 1;
7091}
7092
7093// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7094// checking that pairs of elements in the shuffle mask represent the same index
7095// in each vector, incrementing the expected index by 2 at each step.
7096// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7097// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7098// v2={e,f,g,h}
7099// WhichResult gives the offset for each element in the mask based on which
7100// of the two results it belongs to.
7101//
7102// The transpose can be represented either as:
7103// result1 = shufflevector v1, v2, result1_shuffle_mask
7104// result2 = shufflevector v1, v2, result2_shuffle_mask
7105// where v1/v2 and the shuffle masks have the same number of elements
7106// (here WhichResult (see below) indicates which result is being checked)
7107//
7108// or as:
7109// results = shufflevector v1, v2, shuffle_mask
7110// where both results are returned in one vector and the shuffle mask has twice
7111// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7112// want to check the low half and high half of the shuffle mask as if it were
7113// the other case
7114static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7115 unsigned EltSz = VT.getScalarSizeInBits();
7116 if (EltSz == 64)
7117 return false;
7118
7119 unsigned NumElts = VT.getVectorNumElements();
7120 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7121 return false;
7122
7123 // If the mask is twice as long as the input vector then we need to check the
7124 // upper and lower parts of the mask with a matching value for WhichResult
7125 // FIXME: A mask with only even values will be rejected in case the first
7126 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7127 // M[0] is used to determine WhichResult
7128 for (unsigned i = 0; i < M.size(); i += NumElts) {
7129 WhichResult = SelectPairHalf(NumElts, M, i);
7130 for (unsigned j = 0; j < NumElts; j += 2) {
7131 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7132 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7133 return false;
7134 }
7135 }
7136
7137 if (M.size() == NumElts*2)
7138 WhichResult = 0;
7139
7140 return true;
7141}
7142
7143/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7144/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7145/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7146static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7147 unsigned EltSz = VT.getScalarSizeInBits();
7148 if (EltSz == 64)
7149 return false;
7150
7151 unsigned NumElts = VT.getVectorNumElements();
7152 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7153 return false;
7154
7155 for (unsigned i = 0; i < M.size(); i += NumElts) {
7156 WhichResult = SelectPairHalf(NumElts, M, i);
7157 for (unsigned j = 0; j < NumElts; j += 2) {
7158 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7159 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7160 return false;
7161 }
7162 }
7163
7164 if (M.size() == NumElts*2)
7165 WhichResult = 0;
7166
7167 return true;
7168}
7169
7170// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7171// that the mask elements are either all even and in steps of size 2 or all odd
7172// and in steps of size 2.
7173// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7174// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7175// v2={e,f,g,h}
7176// Requires similar checks to that of isVTRNMask with
7177// respect the how results are returned.
7178static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7179 unsigned EltSz = VT.getScalarSizeInBits();
7180 if (EltSz == 64)
7181 return false;
7182
7183 unsigned NumElts = VT.getVectorNumElements();
7184 if (M.size() != NumElts && M.size() != NumElts*2)
7185 return false;
7186
7187 for (unsigned i = 0; i < M.size(); i += NumElts) {
7188 WhichResult = SelectPairHalf(NumElts, M, i);
7189 for (unsigned j = 0; j < NumElts; ++j) {
7190 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7191 return false;
7192 }
7193 }
7194
7195 if (M.size() == NumElts*2)
7196 WhichResult = 0;
7197
7198 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7199 if (VT.is64BitVector() && EltSz == 32)
7200 return false;
7201
7202 return true;
7203}
7204
7205/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7206/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7207/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7208static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7209 unsigned EltSz = VT.getScalarSizeInBits();
7210 if (EltSz == 64)
7211 return false;
7212
7213 unsigned NumElts = VT.getVectorNumElements();
7214 if (M.size() != NumElts && M.size() != NumElts*2)
7215 return false;
7216
7217 unsigned Half = NumElts / 2;
7218 for (unsigned i = 0; i < M.size(); i += NumElts) {
7219 WhichResult = SelectPairHalf(NumElts, M, i);
7220 for (unsigned j = 0; j < NumElts; j += Half) {
7221 unsigned Idx = WhichResult;
7222 for (unsigned k = 0; k < Half; ++k) {
7223 int MIdx = M[i + j + k];
7224 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7225 return false;
7226 Idx += 2;
7227 }
7228 }
7229 }
7230
7231 if (M.size() == NumElts*2)
7232 WhichResult = 0;
7233
7234 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7235 if (VT.is64BitVector() && EltSz == 32)
7236 return false;
7237
7238 return true;
7239}
7240
7241// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7242// that pairs of elements of the shufflemask represent the same index in each
7243// vector incrementing sequentially through the vectors.
7244// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7245// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7246// v2={e,f,g,h}
7247// Requires similar checks to that of isVTRNMask with respect the how results
7248// are returned.
7249static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7250 unsigned EltSz = VT.getScalarSizeInBits();
7251 if (EltSz == 64)
7252 return false;
7253
7254 unsigned NumElts = VT.getVectorNumElements();
7255 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7256 return false;
7257
7258 for (unsigned i = 0; i < M.size(); i += NumElts) {
7259 WhichResult = SelectPairHalf(NumElts, M, i);
7260 unsigned Idx = WhichResult * NumElts / 2;
7261 for (unsigned j = 0; j < NumElts; j += 2) {
7262 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7263 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7264 return false;
7265 Idx += 1;
7266 }
7267 }
7268
7269 if (M.size() == NumElts*2)
7270 WhichResult = 0;
7271
7272 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7273 if (VT.is64BitVector() && EltSz == 32)
7274 return false;
7275
7276 return true;
7277}
7278
7279/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7280/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7281/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7282static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7283 unsigned EltSz = VT.getScalarSizeInBits();
7284 if (EltSz == 64)
7285 return false;
7286
7287 unsigned NumElts = VT.getVectorNumElements();
7288 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7289 return false;
7290
7291 for (unsigned i = 0; i < M.size(); i += NumElts) {
7292 WhichResult = SelectPairHalf(NumElts, M, i);
7293 unsigned Idx = WhichResult * NumElts / 2;
7294 for (unsigned j = 0; j < NumElts; j += 2) {
7295 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7296 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7297 return false;
7298 Idx += 1;
7299 }
7300 }
7301
7302 if (M.size() == NumElts*2)
7303 WhichResult = 0;
7304
7305 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7306 if (VT.is64BitVector() && EltSz == 32)
7307 return false;
7308
7309 return true;
7310}
7311
7312/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7313/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7314static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7315 unsigned &WhichResult,
7316 bool &isV_UNDEF) {
7317 isV_UNDEF = false;
7318 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7319 return ARMISD::VTRN;
7320 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7321 return ARMISD::VUZP;
7322 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7323 return ARMISD::VZIP;
7324
7325 isV_UNDEF = true;
7326 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7327 return ARMISD::VTRN;
7328 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7329 return ARMISD::VUZP;
7330 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7331 return ARMISD::VZIP;
7332
7333 return 0;
7334}
7335
7336/// \return true if this is a reverse operation on an vector.
7337static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7338 unsigned NumElts = VT.getVectorNumElements();
7339 // Make sure the mask has the right size.
7340 if (NumElts != M.size())
7341 return false;
7342
7343 // Look for <15, ..., 3, -1, 1, 0>.
7344 for (unsigned i = 0; i != NumElts; ++i)
7345 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7346 return false;
7347
7348 return true;
7349}
7350
7351static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7352 unsigned NumElts = VT.getVectorNumElements();
7353 // Make sure the mask has the right size.
7354 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7355 return false;
7356
7357 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7358 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7359 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7360 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7361 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7362 int Ofs = Top ? 1 : 0;
7363 int Upper = SingleSource ? 0 : NumElts;
7364 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7365 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7366 return false;
7367 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7368 return false;
7369 }
7370 return true;
7371}
7372
7373static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7374 unsigned NumElts = VT.getVectorNumElements();
7375 // Make sure the mask has the right size.
7376 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7377 return false;
7378
7379 // If Top
7380 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7381 // This inserts Input2 into Input1
7382 // else if not Top
7383 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7384 // This inserts Input1 into Input2
7385 unsigned Offset = Top ? 0 : 1;
7386 unsigned N = SingleSource ? 0 : NumElts;
7387 for (unsigned i = 0; i < NumElts; i += 2) {
7388 if (M[i] >= 0 && M[i] != (int)i)
7389 return false;
7390 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7391 return false;
7392 }
7393
7394 return true;
7395}
7396
7397static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7398 unsigned NumElts = ToVT.getVectorNumElements();
7399 if (NumElts != M.size())
7400 return false;
7401
7402 // Test if the Trunc can be convertible to a VMOVN with this shuffle. We are
7403 // looking for patterns of:
7404 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7405 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7406
7407 unsigned Off0 = rev ? NumElts / 2 : 0;
7408 unsigned Off1 = rev ? 0 : NumElts / 2;
7409 for (unsigned i = 0; i < NumElts; i += 2) {
7410 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7411 return false;
7412 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7413 return false;
7414 }
7415
7416 return true;
7417}
7418
7419// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7420// from a pair of inputs. For example:
7421// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7422// FP_ROUND(EXTRACT_ELT(Y, 0),
7423// FP_ROUND(EXTRACT_ELT(X, 1),
7424// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
// Only v8f16 results built from two v4f32 sources are handled, and only when
// MVE floating-point operations are available; every other case returns an
// empty SDValue. On success the result is two chained ARMISD::VCVTN nodes,
// the first converting Op0 (selector constant 0) into an undef vector and the
// second converting Op1 (selector constant 1) into that result.
// NOTE(review): listing line 7425, which carries the function name and its
// leading parameters, is absent from this listing; `BV` and `DAG` are used
// below, so presumably they are parameters -- confirm against the full file.
7426 const ARMSubtarget *ST) {
7427 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7428 if (!ST->hasMVEFloatOps())
7429 return SDValue();
7430
7431 SDLoc dl(BV);
7432 EVT VT = BV.getValueType();
7433 if (VT != MVT::v8f16)
7434 return SDValue();
7435
7436 // We are looking for a buildvector of fptrunc elements, where all the
7437 // elements are interleavingly extracted from two sources. Check the first two
7438 // items are valid enough and extract some info from them (they are checked
7439 // properly in the loop below).
// NOTE(review): listing lines 7441-7442 and 7445-7446 (the remaining clauses
// of the two conditions below) are missing here. Since the code afterwards
// reads getOperand(0).getOperand(0) of each element, they presumably verify
// the extract-element operand and its index -- confirm against the full file.
7440 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7443 return SDValue();
7444 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7447 return SDValue();
7448 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7449 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7450 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7451 return SDValue();
7452
7453 // Check all the values in the BuildVector line up with our expectations.
// Element 2*i must come from lane i of Op0 and element 2*i+1 from lane i of
// Op1, for i = 1..3 (the first pair was inspected above).
7454 for (unsigned i = 1; i < 4; i++) {
7455 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
// NOTE(review): listing line 7457 (one clause of this conjunction) is missing.
7456 return Trunc.getOpcode() == ISD::FP_ROUND &&
7458 Trunc.getOperand(0).getOperand(0) == Op &&
7459 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7460 };
7461 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7462 return SDValue();
7463 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7464 return SDValue();
7465 }
7466
7467 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7468 DAG.getConstant(0, dl, MVT::i32));
7469 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7470 DAG.getConstant(1, dl, MVT::i32));
7471}
7472
7473// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7474// from a single input on alternating lanes. For example:
7475// BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
7476// FP_EXTEND(EXTRACT_ELT(X, 2),
7477// FP_EXTEND(EXTRACT_ELT(X, 4), ...)
//
// Only a v4f32 result built from one v8f16 source is handled, and only when
// the subtarget reports MVE float ops. On a match the result is a single
// ARMISD::VCVTL node; in every other case an empty SDValue is returned.
//
// NOTE(review): the embedded listing numbers jump from 7477 to 7479, so the
// line that opens this signature is not present in this extract. Listing
// lines 7494, 7497 and 7505 are absent as well (see the notes below); restore
// them from the upstream file before building.
7479 const ARMSubtarget *ST) {
7480 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7481 if (!ST->hasMVEFloatOps())
7482 return SDValue();
7483
7484 SDLoc dl(BV);
7485 EVT VT = BV.getValueType();
7486 if (VT != MVT::v4f32)
7487 return SDValue();
7488
7489 // We are looking for a buildvector of fpext elements, where all the
7490 // elements are alternating lanes from a single source. For example <0,2,4,6>
7491 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7492 // info from them (they are checked properly in the loop below).
 // NOTE(review): the second clause of this condition (listing line 7494) is
 // missing from this extract.
7493 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7495 return SDValue();
7496 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
 // NOTE(review): the definition of Offset (listing line 7497) is missing from
 // this extract; presumably it is the lane index extracted by element 0.
7498 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7499 return SDValue();
7500
7501 // Check all the values in the BuildVector line up with our expectations.
 // Result lane i must extend source lane 2*i + Offset.
7502 for (unsigned i = 1; i < 4; i++) {
7503 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
 // NOTE(review): one clause of this conjunction (listing line 7505) is
 // missing from this extract.
7504 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7506 Trunc.getOperand(0).getOperand(0) == Op &&
7507 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7508 };
7509 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7510 return SDValue();
7511 }
7512
 // Offset (0 or 1) is passed through as the VCVTL index operand.
7513 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7514 DAG.getConstant(Offset, dl, MVT::i32));
7515}
7516
7517// If N is an integer constant that can be moved into a register in one
7518// instruction, return an SDValue of such a constant (will become a MOV
7519// instruction). Otherwise return null.
//
// Thumb1-only targets accept values (or complements) that fit in 8 bits; all
// other targets accept values whose encoding, or whose complement's encoding,
// is reported valid by ARM_AM::getSOImmVal.
//
// NOTE(review): the embedded listing numbers jump from 7519 to 7521, so the
// line that opens this signature is not present in this extract.
7521 const ARMSubtarget *ST, const SDLoc &dl) {
7522 uint64_t Val;
7523 if (!isa<ConstantSDNode>(N))
7524 return SDValue();
7525 Val = N->getAsZExtVal();
7526
7527 if (ST->isThumb1Only()) {
 // NOTE(review): Val is a zero-extended 64-bit value, so ~Val has its upper
 // bits set for any constant narrower than 64 bits and the second test
 // looks unreachable for such constants -- confirm whether that is intended.
7528 if (Val <= 255 || ~Val <= 255)
7529 return DAG.getConstant(Val, dl, MVT::i32);
7530 } else {
7531 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7532 return DAG.getConstant(Val, dl, MVT::i32);
7533 }
7534 return SDValue();
7535}
7536
// Lowers a BUILD_VECTOR whose result is a vector of i1 (an MVE predicate).
// Constant and undef lanes are folded into one i32 bit pattern that is cast
// to the predicate type; the remaining lanes are then inserted one by one.
// A non-constant value repeated in every defined lane is instead sign
// extended from i1 and cast directly. Returns an empty SDValue for lane
// counts other than 2, 4, 8 or 16.
//
// NOTE(review): the embedded listing numbers jump from 7536 to 7538, so the
// line that opens this signature is not present in this extract.
7538 const ARMSubtarget *ST) {
7539 SDLoc dl(Op);
7540 EVT VT = Op.getValueType();
7541
7542 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7543
7544 unsigned NumElts = VT.getVectorNumElements();
7545 unsigned BoolMask;
7546 unsigned BitsPerBool;
 // Each lane occupies 16 / NumElts bits of the constant; BoolMask is that
 // many low bits set.
7547 if (NumElts == 2) {
7548 BitsPerBool = 8;
7549 BoolMask = 0xff;
7550 } else if (NumElts == 4) {
7551 BitsPerBool = 4;
7552 BoolMask = 0xf;
7553 } else if (NumElts == 8) {
7554 BitsPerBool = 2;
7555 BoolMask = 0x3;
7556 } else if (NumElts == 16) {
7557 BitsPerBool = 1;
7558 BoolMask = 0x1;
7559 } else
7560 return SDValue();
7561
7562 // If this is a single value copied into all lanes (a splat), we can just sign
7563 // extend that single value
 // The splat test skips operand 0 itself and treats undef lanes as matching.
7564 SDValue FirstOp = Op.getOperand(0);
7565 if (!isa<ConstantSDNode>(FirstOp) &&
7566 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7567 return U.get().isUndef() || U.get() == FirstOp;
7568 })) {
7569 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7570 DAG.getValueType(MVT::i1));
7571 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7572 }
7573
7574 // First create base with bits set where known
 // Undef lanes are treated as false; non-constant lanes are left as zero
 // here and filled in by the insert loop below.
7575 unsigned Bits32 = 0;
7576 for (unsigned i = 0; i < NumElts; ++i) {
7577 SDValue V = Op.getOperand(i);
7578 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7579 continue;
7580 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7581 if (BitSet)
7582 Bits32 |= BoolMask << (i * BitsPerBool);
7583 }
7584
7585 // Add in unknown nodes
7586 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7587 DAG.getConstant(Bits32, dl, MVT::i32));
7588 for (unsigned i = 0; i < NumElts; ++i) {
7589 SDValue V = Op.getOperand(i);
7590 if (isa<ConstantSDNode>(V) || V.isUndef())
7591 continue;
7592 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7593 DAG.getConstant(i, dl, MVT::i32));
7594 }
7595
7596 return Base;
7597}
7598
// Tries to turn a BUILD_VECTOR whose lanes are Op[0], Op[0]+N, Op[0]+2N, ...
// into a single ARMISD::VIDUP node. Requires MVE integer ops and a step N of
// 1, 2, 4 or 8; returns an empty SDValue when the pattern does not match.
//
// NOTE(review): the embedded listing numbers jump from 7598 to 7600, so the
// line that opens this signature is not present in this extract. Listing
// lines 7612 and 7622 (one clause in each of the two ADD checks) are absent
// as well.
7600 const ARMSubtarget *ST) {
7601 if (!ST->hasMVEIntegerOps())
7602 return SDValue();
7603
7604 // We are looking for a buildvector where each element is Op[0] + i*N
7605 EVT VT = Op.getValueType();
7606 SDValue Op0 = Op.getOperand(0);
7607 unsigned NumElts = VT.getVectorNumElements();
7608
7609 // Get the increment value from operand 1
 // NOTE(review): the last clause of this condition (listing line 7612) is
 // missing from this extract.
7610 SDValue Op1 = Op.getOperand(1);
7611 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7613 return SDValue();
7614 unsigned N = Op1.getConstantOperandVal(1);
7615 if (N != 1 && N != 2 && N != 4 && N != 8)
7616 return SDValue();
7617
7618 // Check that each other operand matches
 // NOTE(review): the middle clause of this condition (listing line 7622) is
 // missing from this extract.
7619 for (unsigned I = 2; I < NumElts; I++) {
7620 SDValue OpI = Op.getOperand(I);
7621 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7623 OpI.getConstantOperandVal(1) != I * N)
7624 return SDValue();
7625 }
7626
 // The node is created with two result types, the vector type and an i32;
 // the returned SDValue refers to the first (vector) result.
7627 SDLoc DL(Op);
7628 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7629 DAG.getConstant(N, DL, MVT::i32));
7630}
7631
7632// Returns true if the operation N can be treated as qr instruction variant at
7633// operand Op.
//
// The first group of opcodes is accepted whatever position Op has. The
// subtract forms are accepted only when Op is the node's operand 1 (operand 2
// for the intrinsic forms).
7634static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7635 switch (N->getOpcode()) {
7636 case ISD::ADD:
7637 case ISD::MUL:
7638 case ISD::SADDSAT:
7639 case ISD::UADDSAT:
7640 case ISD::AVGFLOORS:
7641 case ISD::AVGFLOORU:
7642 return true;
7643 case ISD::SUB:
7644 case ISD::SSUBSAT:
7645 case ISD::USUBSAT:
7646 return N->getOperand(1).getNode() == Op;
 // NOTE(review): listing line 7647 is missing from this extract; it
 // presumably holds the case label for the intrinsic node kind whose
 // operand 0 is the intrinsic ID switched on below.
7648 switch (N->getConstantOperandVal(0)) {
7649 case Intrinsic::arm_mve_add_predicated:
7650 case Intrinsic::arm_mve_mul_predicated:
7651 case Intrinsic::arm_mve_qadd_predicated:
7652 case Intrinsic::arm_mve_vhadd:
7653 case Intrinsic::arm_mve_hadd_predicated:
7654 case Intrinsic::arm_mve_vqdmulh:
7655 case Intrinsic::arm_mve_qdmulh_predicated:
7656 case Intrinsic::arm_mve_vqrdmulh:
7657 case Intrinsic::arm_mve_qrdmulh_predicated:
7658 case Intrinsic::arm_mve_vqdmull:
7659 case Intrinsic::arm_mve_vqdmull_predicated:
7660 return true;
7661 case Intrinsic::arm_mve_sub_predicated:
7662 case Intrinsic::arm_mve_qsub_predicated:
7663 case Intrinsic::arm_mve_vhsub:
7664 case Intrinsic::arm_mve_hsub_predicated:
7665 return N->getOperand(2).getNode() == Op;
7666 default:
7667 return false;
7668 }
7669 default:
7670 return false;
7671 }
7672}
7673
7674// If this is a case we can't handle, return null and let the default
7675// expansion code take care of it.
//
// Strategies are tried in this order:
//  1. i1 vectors under MVE are handed to LowerBUILD_VECTOR_i1.
//  2. An arithmetic sequence is matched by LowerBUILD_VECTORToVIDUP.
//  3. Constant splats: VDUP for qr-only users, VMOV/VMVN immediates,
//     vmov.f32, then a plain MVE VDUP of the constant.
//  4. A single low element becomes SCALAR_TO_VECTOR.
//  5. A dominant value is splatted and the other lanes are inserted.
//  6. Shuffle reconstruction, then VCVT reconstruction.
//  7. On NEON, a 128-bit vector is split into two 64-bit halves.
//  8. 32/64-bit elements use ARMISD::BUILD_VECTOR.
//  9. Otherwise a chain of INSERT_VECTOR_ELT.
//
// NOTE(review): listing lines 7778, 7825, 7842, 7851-7852 and 7917 are
// missing from this extract (each gap is flagged below); restore them from
// the upstream file before building.
7676SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7677 const ARMSubtarget *ST) const {
7678 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7679 SDLoc dl(Op);
7680 EVT VT = Op.getValueType();
7681
7682 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7683 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7684
7685 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7686 return R;
7687
7688 APInt SplatBits, SplatUndef;
7689 unsigned SplatBitSize;
7690 bool HasAnyUndefs;
7691 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
 // Every bit undefined: the whole vector is undef.
7692 if (SplatUndef.isAllOnes())
7693 return DAG.getUNDEF(VT);
7694
7695 // If all the users of this constant splat are qr instruction variants,
7696 // generate a vdup of the constant.
7697 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7698 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7699 all_of(BVN->users(),
7700 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7701 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7702 : SplatBitSize == 16 ? MVT::v8i16
7703 : MVT::v16i8;
7704 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7705 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7706 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7707 }
7708
7709 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7710 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7711 // Check if an immediate VMOV works.
7712 EVT VmovVT;
7713 SDValue Val =
7714 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7715 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7716
7717 if (Val.getNode()) {
7718 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7719 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7720 }
7721
7722 // Try an immediate VMVN.
7723 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7724 Val = isVMOVModifiedImm(
7725 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7726 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7727 if (Val.getNode()) {
7728 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7729 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7730 }
7731
7732 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7733 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7734 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7735 if (ImmVal != -1) {
7736 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7737 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7738 }
7739 }
7740
7741 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7742 // type.
7743 if (ST->hasMVEIntegerOps() &&
7744 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7745 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7746 : SplatBitSize == 16 ? MVT::v8i16
7747 : MVT::v16i8;
7748 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7749 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7750 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7751 }
7752 }
7753 }
7754
7755 // Scan through the operands to see if only one value is used.
7756 //
7757 // As an optimisation, even if more than one value is used it may be more
7758 // profitable to splat with one value then change some lanes.
7759 //
7760 // Heuristically we decide to do this if the vector has a "dominant" value,
7761 // defined as splatted to more than half of the lanes.
7762 unsigned NumElts = VT.getVectorNumElements();
7763 bool isOnlyLowElement = true;
7764 bool usesOnlyOneValue = true;
7765 bool hasDominantValue = false;
7766 bool isConstant = true;
7767
7768 // Map of the number of times a particular SDValue appears in the
7769 // element list.
7770 DenseMap<SDValue, unsigned> ValueCounts;
7771 SDValue Value;
7772 for (unsigned i = 0; i < NumElts; ++i) {
7773 SDValue V = Op.getOperand(i);
7774 if (V.isUndef())
7775 continue;
7776 if (i > 0)
7777 isOnlyLowElement = false;
 // NOTE(review): listing line 7778 is missing from this extract;
 // presumably it is the condition guarding the assignment below.
7779 isConstant = false;
7780
7781 unsigned &Count = ValueCounts[V];
7782
7783 // Is this value dominant? (takes up more than half of the lanes)
7784 if (++Count > (NumElts / 2)) {
7785 hasDominantValue = true;
7786 Value = V;
7787 }
7788 }
7789 if (ValueCounts.size() != 1)
7790 usesOnlyOneValue = false;
 // Without a dominant value, fall back to whichever entry the map yields
 // first.
7791 if (!Value.getNode() && !ValueCounts.empty())
7792 Value = ValueCounts.begin()->first;
7793
 // No defined operand at all: the result is undef.
7794 if (ValueCounts.empty())
7795 return DAG.getUNDEF(VT);
7796
7797 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7798 // Keep going if we are hitting this case.
7799 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()) &&
7800 (VT != MVT::v8f16 || ST->hasFullFP16()))
7801 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7802
7803 unsigned EltSize = VT.getScalarSizeInBits();
7804
7805 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7806 // i32 and try again.
7807 if (hasDominantValue && EltSize <= 32) {
7808 if (!isConstant) {
7809 SDValue N;
7810
7811 // If we are VDUPing a value that comes directly from a vector, that will
7812 // cause an unnecessary move to and from a GPR, where instead we could
7813 // just use VDUPLANE. We can only do this if the lane being extracted
7814 // is at a constant index, as the VDUP from lane instructions only have
7815 // constant-index forms.
7816 ConstantSDNode *constIndex;
7817 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7818 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7819 // We need to create a new undef vector to use for the VDUPLANE if the
7820 // size of the vector from which we get the value is different than the
7821 // size of the vector that we need to create. We will insert the element
7822 // such that the register coalescer will remove unnecessary copies.
7823 if (VT != Value->getOperand(0).getValueType()) {
 // NOTE(review): the right-hand side of this modulo (listing line
 // 7825) is missing from this extract.
7824 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7826 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7827 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7828 Value, DAG.getConstant(index, dl, MVT::i32)),
7829 DAG.getConstant(index, dl, MVT::i32));
7830 } else
7831 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7832 Value->getOperand(0), Value->getOperand(1));
7833 } else
7834 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7835
7836 if (!usesOnlyOneValue) {
7837 // The dominant value was splatted as 'N', but we now have to insert
7838 // all differing elements.
7839 for (unsigned I = 0; I < NumElts; ++I) {
7840 if (Op.getOperand(I) == Value)
7841 continue;
 // NOTE(review): the declaration of Ops (listing line 7842) is missing
 // from this extract.
7843 Ops.push_back(N);
7844 Ops.push_back(Op.getOperand(I));
7845 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7846 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7847 }
7848 }
7849 return N;
7850 }
 // NOTE(review): listing lines 7851-7852 are missing from this extract;
 // presumably they open the floating-point-element branch closed at 7864
 // and declare the Ops container used below.
 // Constant FP vector: bitcast every element to the same-width integer,
 // lower that integer vector recursively, and bitcast the result back.
7853 MVT FVT = VT.getVectorElementType().getSimpleVT();
7854 assert(FVT == MVT::f32 || FVT == MVT::f16);
7855 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7856 for (unsigned i = 0; i < NumElts; ++i)
7857 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7858 Op.getOperand(i)));
7859 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7860 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7861 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7862 if (Val.getNode())
7863 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7864 }
7865 if (usesOnlyOneValue) {
7866 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7867 if (isConstant && Val.getNode())
7868 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7869 }
7870 }
7871
7872 // If all elements are constants and the case above didn't get hit, fall back
7873 // to the default expansion, which will generate a load from the constant
7874 // pool.
7875 if (isConstant)
7876 return SDValue();
7877
7878 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7879 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7880 // length <= 2.
7881 if (NumElts >= 4)
7882 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7883 return shuffle;
7884
7885 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7886 // VCVT's
 // NOTE(review): these two calls pass the member Subtarget while the rest of
 // the function uses the ST parameter -- confirm they always agree.
7887 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7888 return VCVT;
7889 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7890 return VCVT;
7891
7892 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7893 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7894 // into two 64-bit vectors; we might discover a better way to lower it.
7895 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7896 EVT ExtVT = VT.getVectorElementType();
7897 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7898 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
7899 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7900 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7901 SDValue Upper =
7902 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
7903 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7904 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
 // Only concatenate when both halves were lowered successfully.
7905 if (Lower && Upper)
7906 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7907 }
7908
7909 // Vectors with 32- or 64-bit elements can be built by directly assigning
7910 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7911 // will be legalized.
7912 if (EltSize >= 32) {
7913 // Do the expansion with floating-point types, since that is what the VFP
7914 // registers are defined to use, and since i64 is not legal.
7915 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7916 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
 // NOTE(review): the declaration of Ops (listing line 7917) is missing from
 // this extract.
7918 for (unsigned i = 0; i < NumElts; ++i)
7919 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7920 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7921 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7922 }
7923
7924 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7925 // know the default expansion would otherwise fall back on something even
7926 // worse. For a vector with one or two non-undef values, that's
7927 // scalar_to_vector for the elements followed by a shuffle (provided the
7928 // shuffle is valid for the target) and materialization element by element
7929 // on the stack followed by a load for everything else.
7930 if ((!isConstant && !usesOnlyOneValue) ||
7931 (VT == MVT::v8f16 && !ST->hasFullFP16())) {
7932 SDValue Vec = DAG.getUNDEF(VT);
7933 for (unsigned i = 0 ; i < NumElts; ++i) {
7934 SDValue V = Op.getOperand(i);
7935 if (V.isUndef())
7936 continue;
7937 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7938 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7939 }
7940 return Vec;
7941 }
7942
7943 return SDValue();
7944}
7945
7946// Gather data to see if the operation can be modelled as a
7947// shuffle in combination with VEXTs.
//
// Op must be a BUILD_VECTOR whose defined operands are all
// EXTRACT_VECTOR_ELT nodes with constant indices, drawn from at most two
// source vectors. Each source is padded, halved or VEXT-ed to the result
// width and cast to the smallest element type involved; a shuffle mask is
// then built and handed to buildLegalVectorShuffle. Returns an empty SDValue
// whenever one of these steps is not possible.
//
// NOTE(review): listing line 7977 (presumably the declaration of Sources) is
// missing from this extract; restore it from the upstream file before
// building.
7948SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7949 SelectionDAG &DAG) const {
7950 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7951 SDLoc dl(Op);
7952 EVT VT = Op.getValueType();
7953 unsigned NumElts = VT.getVectorNumElements();
7954
 // Book-keeping for one source vector feeding the BUILD_VECTOR.
7955 struct ShuffleSourceInfo {
7956 SDValue Vec;
 // Smallest and largest lane index extracted from Vec.
7957 unsigned MinElt = std::numeric_limits<unsigned>::max();
7958 unsigned MaxElt = 0;
7959
7960 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7961 // be compatible with the shuffle we intend to construct. As a result
7962 // ShuffleVec will be some sliding window into the original Vec.
7963 SDValue ShuffleVec;
7964
7965 // Code should guarantee that element i in Vec starts at element "WindowBase
7966 // + i * WindowScale in ShuffleVec".
7967 int WindowBase = 0;
7968 int WindowScale = 1;
7969
7970 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7971
 // Lets llvm::find look a source up by its original vector.
7972 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7973 };
7974
7975 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7976 // node.
7978 for (unsigned i = 0; i < NumElts; ++i) {
7979 SDValue V = Op.getOperand(i);
7980 if (V.isUndef())
7981 continue;
7982 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7983 // A shuffle can only come from building a vector from various
7984 // elements of other vectors.
7985 return SDValue();
7986 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7987 // Furthermore, shuffles require a constant mask, whereas extractelts
7988 // accept variable indices.
7989 return SDValue();
7990 }
7991
7992 // Add this element source to the list if it's not already there.
7993 SDValue SourceVec = V.getOperand(0);
7994 auto Source = llvm::find(Sources, SourceVec);
7995 if (Source == Sources.end())
7996 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7997
7998 // Update the minimum and maximum lane number seen.
7999 unsigned EltNo = V.getConstantOperandVal(1);
8000 Source->MinElt = std::min(Source->MinElt, EltNo);
8001 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8002 }
8003
8004 // Currently only do something sane when at most two source vectors
8005 // are involved.
8006 if (Sources.size() > 2)
8007 return SDValue();
8008
8009 // Find out the smallest element size among result and two sources, and use
8010 // it as element size to build the shuffle_vector.
8011 EVT SmallestEltTy = VT.getVectorElementType();
8012 for (auto &Source : Sources) {
8013 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8014 if (SrcEltTy.bitsLT(SmallestEltTy))
8015 SmallestEltTy = SrcEltTy;
8016 }
 // ResMultiplier is the number of shuffle lanes per result element; NumElts
 // is redefined here as the lane count of the shuffle type.
8017 unsigned ResMultiplier =
8018 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8019 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8020 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8021
8022 // If the source vector is too wide or too narrow, we may nevertheless be able
8023 // to construct a compatible shuffle either by concatenating it with UNDEF or
8024 // extracting a suitable range of elements.
8025 for (auto &Src : Sources) {
8026 EVT SrcVT = Src.ShuffleVec.getValueType();
8027
8028 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8029 uint64_t VTSize = VT.getFixedSizeInBits();
8030 if (SrcVTSize == VTSize)
8031 continue;
8032
8033 // This stage of the search produces a source with the same element type as
8034 // the original, but with a total width matching the BUILD_VECTOR output.
8035 EVT EltVT = SrcVT.getVectorElementType();
8036 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8037 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8038
8039 if (SrcVTSize < VTSize) {
 // Only a source exactly half the result width can be padded.
8040 if (2 * SrcVTSize != VTSize)
8041 return SDValue();
8042 // We can pad out the smaller vector for free, so if it's part of a
8043 // shuffle...
8044 Src.ShuffleVec =
8045 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8046 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8047 continue;
8048 }
8049
 // Only a source exactly twice the result width can be narrowed.
8050 if (SrcVTSize != 2 * VTSize)
8051 return SDValue();
8052
8053 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8054 // Span too large for a VEXT to cope
8055 return SDValue();
8056 }
8057
8058 if (Src.MinElt >= NumSrcElts) {
8059 // The extraction can just take the second half
8060 Src.ShuffleVec =
8061 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8062 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8063 Src.WindowBase = -NumSrcElts;
8064 } else if (Src.MaxElt < NumSrcElts) {
8065 // The extraction can just take the first half
8066 Src.ShuffleVec =
8067 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8068 DAG.getConstant(0, dl, MVT::i32));
8069 } else {
8070 // An actual VEXT is needed
8071 SDValue VEXTSrc1 =
8072 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8073 DAG.getConstant(0, dl, MVT::i32));
8074 SDValue VEXTSrc2 =
8075 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8076 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8077
8078 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8079 VEXTSrc2,
8080 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8081 Src.WindowBase = -Src.MinElt;
8082 }
8083 }
8084
8085 // Another possible incompatibility occurs from the vector element types. We
8086 // can fix this by bitcasting the source vectors to the same type we intend
8087 // for the shuffle.
8088 for (auto &Src : Sources) {
8089 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8090 if (SrcEltTy == SmallestEltTy)
8091 continue;
8092 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8093 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
 // One original element now spans WindowScale shuffle lanes, so the window
 // base is rescaled to match.
8094 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8095 Src.WindowBase *= Src.WindowScale;
8096 }
8097
8098 // Final check before we try to actually produce a shuffle.
8099 LLVM_DEBUG({
8100 for (auto Src : Sources)
8101 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8102 });
8103
8104 // The stars all align, our next step is to produce the mask for the shuffle.
 // Lanes not written below keep the initial -1 (undefined) entry.
8105 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8106 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8107 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8108 SDValue Entry = Op.getOperand(i);
8109 if (Entry.isUndef())
8110 continue;
8111
8112 auto Src = llvm::find(Sources, Entry.getOperand(0));
8113 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8114
8115 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8116 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8117 // segment.
8118 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8119 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8120 VT.getScalarSizeInBits());
8121 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8122
8123 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8124 // starting at the appropriate offset.
8125 int *LaneMask = &Mask[i * ResMultiplier];
8126
 // Indices into the second source are offset by NumElts.
8127 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8128 ExtractBase += NumElts * (Src - Sources.begin());
8129 for (int j = 0; j < LanesDefined; ++j)
8130 LaneMask[j] = ExtractBase + j;
8131 }
8132
8133
8134 // We can't handle more than two sources. This should have already
8135 // been checked before this point.
8136 assert(Sources.size() <= 2 && "Too many sources!");
8137
 // Unused shuffle inputs stay undef.
8138 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8139 for (unsigned i = 0; i < Sources.size(); ++i)
8140 ShuffleOps[i] = Sources[i].ShuffleVec;
8141
8142 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8143 ShuffleOps[1], Mask, DAG);
8144 if (!Shuffle)
8145 return SDValue();
8146 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8147}
8148
// Operation numbers stored in perfect-shuffle table entries.
// NOTE(review): the enum's opening line (listing line 8149) and the
// enumerators on listing lines 8151-8158 are missing from this extract. Code
// later in the file refers to OP_VREV, OP_VDUP0..OP_VDUP3 and
// OP_VEXT1..OP_VEXT3, which presumably live in that gap; restore them from
// the upstream file before building.
8150 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8159 OP_VUZPL, // VUZP, left result
8160 OP_VUZPR, // VUZP, right result
8161 OP_VZIPL, // VZIP, left result
8162 OP_VZIPR, // VZIP, right result
8163 OP_VTRNL, // VTRN, left result
8164 OP_VTRNR // VTRN, right result
8165};
8166
8167static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8168 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8169 switch (OpNum) {
8170 case OP_COPY:
8171 case OP_VREV:
8172 case OP_VDUP0:
8173 case OP_VDUP1:
8174 case OP_VDUP2:
8175 case OP_VDUP3:
8176 return true;
8177 }
8178 return false;
8179}
8180
8181/// isShuffleMaskLegal - Targets can use this to indicate that they only
8182/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8183/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8184/// are assumed to be legal.
///
/// Four-element 64/128-bit vectors are first looked up in the perfect-shuffle
/// table; all types then fall through to a series of named mask patterns.
///
/// NOTE(review): the signature (listing line 8185) and one clause of the
/// pattern chain (listing line 8211) are missing from this extract; restore
/// them from the upstream file before building.
8186 if (VT.getVectorNumElements() == 4 &&
8187 (VT.is128BitVector() || VT.is64BitVector())) {
 // Undefined mask entries are encoded as 8 for the table index.
8188 unsigned PFIndexes[4];
8189 for (unsigned i = 0; i != 4; ++i) {
8190 if (M[i] < 0)
8191 PFIndexes[i] = 8;
8192 else
8193 PFIndexes[i] = M[i];
8194 }
8195
8196 // Compute the index in the perfect shuffle table.
 // The four entries form a base-9 number, most significant digit first.
8197 unsigned PFTableIndex =
8198 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8199 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
 // The cost is held in the top two bits of the entry.
8200 unsigned Cost = (PFEntry >> 30);
8201
 // Without NEON, only the operations accepted by isLegalMVEShuffleOp count.
8202 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8203 return true;
8204 }
8205
8206 bool ReverseVEXT, isV_UNDEF;
8207 unsigned Imm, WhichResult;
8208
8209 unsigned EltSize = VT.getScalarSizeInBits();
 // NOTE(review): listing line 8211 (one clause of this condition) is missing
 // from this extract.
8210 if (EltSize >= 32 ||
8212 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8213 isVREVMask(M, VT, 64) ||
8214 isVREVMask(M, VT, 32) ||
8215 isVREVMask(M, VT, 16))
8216 return true;
8217 else if (Subtarget->hasNEON() &&
8218 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8219 isVTBLMask(M, VT) ||
8220 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8221 return true;
8222 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8223 isReverseMask(M, VT))
8224 return true;
8225 else if (Subtarget->hasMVEIntegerOps() &&
8226 (isVMOVNMask(M, VT, true, false) ||
8227 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8228 return true;
8229 else if (Subtarget->hasMVEIntegerOps() &&
8230 (isTruncMask(M, VT, false, false) ||
8231 isTruncMask(M, VT, false, true) ||
8232 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8233 return true;
8234 else
8235 return false;
8236}
8237
8238/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8239/// the specified operations to build the shuffle.
///
/// An entry packs an operation number (4 bits from bit 26) and two 13-bit
/// table indices naming the operands. The operands are generated recursively;
/// OP_COPY ends the recursion by returning LHS or RHS unchanged.
///
/// NOTE(review): the line that opens this signature (listing line 8240) is
/// missing from this extract; restore it from the upstream file before
/// building.
8241 SDValue RHS, SelectionDAG &DAG,
8242 const SDLoc &dl) {
8243 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8244 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8245 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8246
8247 if (OpNum == OP_COPY) {
 // (1*9+2)*9+3 is the base-9 encoding of mask <0,1,2,3>, i.e. LHS as is;
 // ((4*9+5)*9+6)*9+7 encodes <4,5,6,7>, i.e. RHS as is.
8248 if (LHSID == (1*9+2)*9+3) return LHS;
8249 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8250 return RHS;
8251 }
8252
8253 SDValue OpLHS, OpRHS;
8254 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8255 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8256 EVT VT = OpLHS.getValueType();
8257
8258 switch (OpNum) {
8259 default: llvm_unreachable("Unknown shuffle opcode!");
8260 case OP_VREV:
8261 // VREV divides the vector in half and swaps within the half.
8262 if (VT.getScalarSizeInBits() == 32)
8263 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8264 // vrev <4 x i16> -> VREV32
8265 if (VT.getScalarSizeInBits() == 16)
8266 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8267 // vrev <4 x i8> -> VREV16
8268 assert(VT.getScalarSizeInBits() == 8);
8269 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8270 case OP_VDUP0:
8271 case OP_VDUP1:
8272 case OP_VDUP2:
8273 case OP_VDUP3:
 // The lane to duplicate is the distance from OP_VDUP0.
8274 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8275 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8276 case OP_VEXT1:
8277 case OP_VEXT2:
8278 case OP_VEXT3:
 // The immediate is 1, 2 or 3 for OP_VEXT1, OP_VEXT2, OP_VEXT3.
8279 return DAG.getNode(ARMISD::VEXT, dl, VT,
8280 OpLHS, OpRHS,
8281 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
 // The remaining operations create two-result nodes; the L variant takes
 // result 0 and the R variant result 1.
8282 case OP_VUZPL:
8283 case OP_VUZPR:
8284 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8285 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8286 case OP_VZIPL:
8287 case OP_VZIPR:
8288 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8289 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8290 case OP_VTRNL:
8291 case OP_VTRNR:
8292 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8293 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8294 }
8295}
8296
// Lowers a v8i8 shuffle to a table lookup: the shuffle mask becomes a v8i8
// BUILD_VECTOR of index constants, fed to VTBL1 when the second input is
// undef and to VTBL2 otherwise.
//
// NOTE(review): the line that opens this signature (listing line 8297) is
// missing from this extract, so the function name and its first parameter(s)
// are not visible here; restore it from the upstream file before building.
8298 ArrayRef<int> ShuffleMask,
8299 SelectionDAG &DAG) {
8300 // Check to see if we can use the VTBL instruction.
8301 SDValue V1 = Op.getOperand(0);
8302 SDValue V2 = Op.getOperand(1);
8303 SDLoc DL(Op);
8304
 // Mask entries are emitted as signed constants, so a negative (undefined)
 // entry is passed through as is.
8305 SmallVector<SDValue, 8> VTBLMask;
8306 for (int I : ShuffleMask)
8307 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8308
8309 if (V2.getNode()->isUndef())
8310 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8311 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8312
8313 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8314 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8315}
8316
// Lowers a full reversal of a v8i16, v8f16 or v16i8 vector as a VREV64
// followed by a shuffle that swaps the two halves of the result.
//
// NOTE(review): the signature (listing line 8317) is missing from this
// extract, so the function name and parameter list are not visible here;
// restore it from the upstream file before building.
8318 SDLoc DL(Op);
8319 EVT VT = Op.getValueType();
8320
8321 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8322 "Expect an v8i16/v16i8 type");
8323 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8324 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8325 // extract the first 8 bytes into the top double word and the last 8 bytes
8326 // into the bottom double word, through a new vector shuffle that will be
8327 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
 // NewMask is <N/2, ..., N-1, 0, ..., N/2-1> for an N-element vector.
8328 std::vector<int> NewMask;
8329 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8330 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8331 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8332 NewMask.push_back(i);
8333 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8334}
8335
// Maps an i1 predicate vector type to a vector type with the same number of
// lanes: v2i1 -> v2f64, v4i1 -> v4i32, v8i1 -> v8i16, v16i1 -> v16i8.
// Any other type hits llvm_unreachable.
// NOTE(review): the signature line is missing from this listing; this is
// presumably getVectorTyFromPredicateVector(EVT VT) - confirm.
8337 switch (VT.getSimpleVT().SimpleTy) {
8338 case MVT::v2i1:
8339 return MVT::v2f64;
8340 case MVT::v4i1:
8341 return MVT::v4i32;
8342 case MVT::v8i1:
8343 return MVT::v8i16;
8344 case MVT::v16i1:
8345 return MVT::v16i8;
8346 default:
8347 llvm_unreachable("Unexpected vector predicate type");
8348 }
8349}
8350
// PromoteMVEPredVector (name taken from its call sites; the first signature
// line is missing from this listing). Call sites pass (dl, Pred, VT, DAG).
// Expands a predicate vector into an ordinary vector: a v16i8 VSELECT between
// an all-ones and an all-zeroes constant, controlled by the predicate viewed
// as v16i1, which is then bitcast to NewVT.
8352 SelectionDAG &DAG) {
8353 // Converting from boolean predicates to integers involves creating a vector
8354 // of all ones or all zeroes and selecting the lanes based upon the real
8355 // predicate.
// NOTE(review): the declaration line of AllOnes is missing from this listing;
// the next line is its initialiser.
8357 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8358 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8359
8360 SDValue AllZeroes =
8361 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8362 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8363
8364 // Get full vector type from predicate type
// NOTE(review): the statement defining NewVT (used in the final bitcast) is
// missing from this listing.
8366
8367 SDValue RecastV1;
8368 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8369 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8370 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8371 // since we know in hardware the sizes are really the same.
8372 if (VT != MVT::v16i1)
8373 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8374 else
8375 RecastV1 = Pred;
8376
8377 // Select either all ones or zeroes depending upon the real predicate bits.
8378 SDValue PredAsVector =
8379 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8380
8381 // Recast our new predicate-as-integer v16i8 vector into something
8382 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8383 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8384}
8385
// LowerVECTOR_SHUFFLE_i1 (name taken from its call site; the first signature
// line and the declaration of SVN are missing from this listing).
// Lowers a shuffle of MVE predicate (i1) vectors. A full reversal is done on
// the predicate's i32 image with a bit reverse and shift; every other mask is
// handled by promoting both predicates to ordinary vectors, shuffling those,
// and comparing the result against zero to get a predicate back.
8387 const ARMSubtarget *ST) {
8388 EVT VT = Op.getValueType();
8390 ArrayRef<int> ShuffleMask = SVN->getMask();
8391
8392 assert(ST->hasMVEIntegerOps() &&
8393 "No support for vector shuffle of boolean predicates");
8394
8395 SDValue V1 = Op.getOperand(0);
8396 SDValue V2 = Op.getOperand(1);
8397 SDLoc dl(Op);
// Reverse mask: cast the predicate to i32, reverse all 32 bits, then shift
// right by 16 before casting back (NOTE(review): presumably because the
// predicate occupies the low 16 bits of the i32 - confirm).
8398 if (isReverseMask(ShuffleMask, VT)) {
8399 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8400 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8401 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8402 DAG.getConstant(16, dl, MVT::i32));
8403 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8404 }
8405
8406 // Until we can come up with optimised cases for every single vector
8407 // shuffle in existence we have chosen the least painful strategy. This is
8408 // to essentially promote the boolean predicate to a 8-bit integer, where
8409 // each predicate represents a byte. Then we fall back on a normal integer
8410 // vector shuffle and convert the result back into a predicate vector. In
8411 // many cases the generated code might be even better than scalar code
8412 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8413 // fields in a register into 8 other arbitrary 2-bit fields!
8414 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8415 EVT NewVT = PredAsVector1.getValueType();
// An undef second operand is not promoted; an undef of the promoted type is
// used directly.
8416 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8417 : PromoteMVEPredVector(dl, V2, VT, DAG);
8418 assert(PredAsVector2.getValueType() == NewVT &&
8419 "Expected identical vector type in expanded i1 shuffle!");
8420
8421 // Do the shuffle!
8422 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8423 PredAsVector2, ShuffleMask);
8424
8425 // Now return the result of comparing the shuffled vector with zero,
8426 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8427 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8428 if (VT == MVT::v2i1) {
8429 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8430 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8431 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8432 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8433 }
8434 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8435 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8436}
8437
// LowerVECTOR_SHUFFLEUsingMovs (name taken from its call site; the first
// signature line is missing from this listing).
// Splits the result into four 32-bit quarters. Each quarter whose mask entries
// select one whole, aligned 32-bit lane of an input is produced by extracting
// that lane from the input bitcast to v4f32. Remaining quarters are taken from
// a reduced shuffle of the original operands. Returns an empty SDValue when
// the element size is 32 bits or more, or when no quarter matches.
8439 ArrayRef<int> ShuffleMask,
8440 SelectionDAG &DAG) {
8441 // Attempt to lower the vector shuffle using as many whole register movs as
8442 // possible. This is useful for types smaller than 32bits, which would
8443 // often otherwise become a series of GPR movs.
8444 SDLoc dl(Op);
8445 EVT VT = Op.getValueType();
8446 if (VT.getScalarSizeInBits() >= 32)
8447 return SDValue();
8448
8449 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8450 "Unexpected vector type");
8451 int NumElts = VT.getVectorNumElements();
8452 int QuarterSize = NumElts / 4;
8453 // The four final parts of the vector, as 32-bit (f32) values
8454 SDValue Parts[4];
8455
8456 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8457 // <u,u,u,u>), returning the vmov lane index
// The returned index counts Length-sized lanes across both inputs
// concatenated, so values of 4 and above refer to the second operand (see the
// "Elt >= 4" adjustment in the loop below). Returns -1 when there is no match.
8458 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8459 // Detect which mov lane this would be from the first non-undef element.
8460 int MovIdx = -1;
8461 for (int i = 0; i < Length; i++) {
8462 if (ShuffleMask[Start + i] >= 0) {
8463 if (ShuffleMask[Start + i] % Length != i)
8464 return -1;
8465 MovIdx = ShuffleMask[Start + i] / Length;
8466 break;
8467 }
8468 }
8469 // If all items are undef, leave this for other combines
8470 if (MovIdx == -1)
8471 return -1;
8472 // Check the remaining values are the correct part of the same mov
8473 for (int i = 1; i < Length; i++) {
8474 if (ShuffleMask[Start + i] >= 0 &&
8475 (ShuffleMask[Start + i] / Length != MovIdx ||
8476 ShuffleMask[Start + i] % Length != i))
8477 return -1;
8478 }
8479 return MovIdx;
8480 };
8481
8482 for (int Part = 0; Part < 4; ++Part) {
8483 // Does this part look like a mov
8484 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8485 if (Elt != -1) {
8486 SDValue Input = Op->getOperand(0);
8487 if (Elt >= 4) {
8488 Input = Op->getOperand(1);
8489 Elt -= 4;
8490 }
8491 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8492 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8493 DAG.getConstant(Elt, dl, MVT::i32));
8494 }
8495 }
8496
8497 // Nothing interesting found, just return
8498 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8499 return SDValue();
8500
8501 // The other parts need to be built with the old shuffle vector, cast to a
8502 // v4f32 and extract_vector_elts
8503 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
// Quarters already covered by a lane move are marked undef (-1) in the
// reduced mask so the new shuffle only has to produce the remaining quarters.
8504 SmallVector<int, 16> NewShuffleMask;
8505 for (int Part = 0; Part < 4; ++Part)
8506 for (int i = 0; i < QuarterSize; i++)
8507 NewShuffleMask.push_back(
8508 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8509 SDValue NewShuffle = DAG.getVectorShuffle(
8510 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8511 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8512
8513 for (int Part = 0; Part < 4; ++Part)
8514 if (!Parts[Part])
8515 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8516 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8517 }
8518 // Build a vector out of the various parts and bitcast it back to the original
8519 // type.
8520 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8521 return DAG.getBitcast(VT, NewVec);
8522}
8523
// LowerVECTOR_SHUFFLEUsingOneOff (name taken from its call site; the first
// signature line is missing from this listing).
// If the mask is an identity of V1 or of V2 except for exactly one element,
// lowers the shuffle to an EXTRACT_VECTOR_ELT of the out-of-place element
// followed by an INSERT_VECTOR_ELT into the identity source. Returns an empty
// SDValue when the mask does not have that form.
8525 ArrayRef<int> ShuffleMask,
8526 SelectionDAG &DAG) {
8527 SDValue V1 = Op.getOperand(0);
8528 SDValue V2 = Op.getOperand(1);
8529 EVT VT = Op.getValueType();
8530 unsigned NumElts = VT.getVectorNumElements();
8531
8532 // A One-Off Identity mask is one that is mostly an identity mask from a
8533 // single source but contains a single element out-of-place, either from a
8534 // different vector or from another position in the same vector. As opposed to
8535 // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
8536 // pair directly.
// BaseOffset selects which source the identity is measured against (0 for the
// first operand, NumElts for the second). On success OffElement holds the
// result position of the single mismatching entry. Masks with two or fewer
// defined entries are rejected. The VT parameter is not used by the body.
8537 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8538 int &OffElement) {
8539 OffElement = -1;
8540 int NonUndef = 0;
8541 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8542 if (Mask[i] == -1)
8543 continue;
8544 NonUndef++;
8545 if (Mask[i] != i + BaseOffset) {
8546 if (OffElement == -1)
8547 OffElement = i;
8548 else
8549 return false;
8550 }
8551 }
8552 return NonUndef > 2 && OffElement != -1;
8553 };
8554 int OffElement;
8555 SDValue VInput;
8556 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8557 VInput = V1;
8558 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8559 VInput = V2;
8560 else
8561 return SDValue();
8562
8563 SDLoc dl(Op);
// i8 and i16 elements are extracted as i32; other element types are extracted
// in their own scalar type.
8564 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8565 ? MVT::i32
8566 : VT.getScalarType();
8567 SDValue Elt = DAG.getNode(
8568 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8569 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8570 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8571 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8572 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8573}
8574
// Main custom lowering for vector shuffles.
// NOTE(review): the first signature line, the declaration of SVN and the
// declaration of Ops are missing from this listing; the function is presumably
// LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST).
//
// Strategies are tried in this order, and the first that applies returns:
//  1. MVE i1 vectors: LowerVECTOR_SHUFFLE_i1.
//  2. Element size <= 32: splat (VDUP / VDUPLANE), NEON VEXT, VREV64/32/16,
//     NEON single-source VEXT, NEON two-result shuffles, MVE VMOVN, and NEON
//     two-result shuffles seen through a CONCAT_VECTORS operand.
//  3. MVE with element size <= 32 (v8f16 only with full FP16): one-off
//     identity insert, then truncating masks via MVETRUNC.
//  4. Four-element vectors: the perfect-shuffle table.
//  5. Element size >= 32: ARMISD::BUILD_VECTOR of extracted elements.
//  6. Full reversal of v8i16 / v8f16 / v16i8.
//  7. NEON v8i8: table lookup via LowerVECTOR_SHUFFLEv8i8.
//  8. MVE: LowerVECTOR_SHUFFLEUsingMovs.
//  9. v8f16 without full FP16: redo the shuffle as v8i16.
// If nothing applies an empty SDValue is returned.
8576 const ARMSubtarget *ST) {
8577 SDValue V1 = Op.getOperand(0);
8578 SDValue V2 = Op.getOperand(1);
8579 SDLoc dl(Op);
8580 EVT VT = Op.getValueType();
8582 unsigned EltSize = VT.getScalarSizeInBits();
8583
8584 if (ST->hasMVEIntegerOps() && EltSize == 1)
8585 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8586
8587 // Convert shuffles that are directly supported on NEON to target-specific
8588 // DAG nodes, instead of keeping them as shuffles and matching them again
8589 // during code selection. This is more efficient and avoids the possibility
8590 // of inconsistencies between legalization and selection.
8591 // FIXME: floating-point vectors should be canonicalized to integer vectors
8592 // of the same type so that they get CSEd properly.
8593 ArrayRef<int> ShuffleMask = SVN->getMask();
8594
8595 if (EltSize <= 32) {
8596 if (SVN->isSplat()) {
8597 int Lane = SVN->getSplatIndex();
8598 // If this is undef splat, generate it via "just" vdup, if possible.
8599 if (Lane == -1) Lane = 0;
8600
8601 // Test if V1 is a SCALAR_TO_VECTOR.
8602 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8603 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8604 }
8605 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8606 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8607 // reaches it).
8608 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8609 !isa<ConstantSDNode>(V1.getOperand(0))) {
8610 bool IsScalarToVector = true;
8611 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8612 if (!V1.getOperand(i).isUndef()) {
8613 IsScalarToVector = false;
8614 break;
8615 }
8616 if (IsScalarToVector)
8617 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8618 }
8619 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8620 DAG.getConstant(Lane, dl, MVT::i32));
8621 }
8622
8623 bool ReverseVEXT = false;
8624 unsigned Imm = 0;
8625 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8626 if (ReverseVEXT)
8627 std::swap(V1, V2);
8628 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8629 DAG.getConstant(Imm, dl, MVT::i32));
8630 }
8631
8632 if (isVREVMask(ShuffleMask, VT, 64))
8633 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8634 if (isVREVMask(ShuffleMask, VT, 32))
8635 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8636 if (isVREVMask(ShuffleMask, VT, 16))
8637 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8638
8639 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8640 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8641 DAG.getConstant(Imm, dl, MVT::i32));
8642 }
8643
8644 // Check for Neon shuffles that modify both input vectors in place.
8645 // If both results are used, i.e., if there are two shuffles with the same
8646 // source operands and with masks corresponding to both results of one of
8647 // these operations, DAG memoization will ensure that a single node is
8648 // used for both shuffles.
8649 unsigned WhichResult = 0;
8650 bool isV_UNDEF = false;
8651 if (ST->hasNEON()) {
8652 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8653 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8654 if (isV_UNDEF)
8655 V2 = V1;
8656 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8657 .getValue(WhichResult);
8658 }
8659 }
// MVE VMOVN forms. Note the operand order differs between the three cases:
// (V2, V1) with immediate 0, (V1, V2) with immediate 1, and (V1, V1) with
// immediate 1 for the single-source form.
8660 if (ST->hasMVEIntegerOps()) {
8661 if (isVMOVNMask(ShuffleMask, VT, false, false))
8662 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8663 DAG.getConstant(0, dl, MVT::i32));
8664 if (isVMOVNMask(ShuffleMask, VT, true, false))
8665 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8666 DAG.getConstant(1, dl, MVT::i32));
8667 if (isVMOVNMask(ShuffleMask, VT, true, true))
8668 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8669 DAG.getConstant(1, dl, MVT::i32));
8670 }
8671
8672 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8673 // shuffles that produce a result larger than their operands with:
8674 // shuffle(concat(v1, undef), concat(v2, undef))
8675 // ->
8676 // shuffle(concat(v1, v2), undef)
8677 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8678 //
8679 // This is useful in the general case, but there are special cases where
8680 // native shuffles produce larger results: the two-result ops.
8681 //
8682 // Look through the concat when lowering them:
8683 // shuffle(concat(v1, v2), undef)
8684 // ->
8685 // concat(VZIP(v1, v2):0, :1)
8686 //
8687 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8688 SDValue SubV1 = V1->getOperand(0);
8689 SDValue SubV2 = V1->getOperand(1);
8690 EVT SubVT = SubV1.getValueType();
8691
8692 // We expect these to have been canonicalized to -1.
8693 assert(llvm::all_of(ShuffleMask, [&](int i) {
8694 return i < (int)VT.getVectorNumElements();
8695 }) && "Unexpected shuffle index into UNDEF operand!");
8696
8697 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8698 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8699 if (isV_UNDEF)
8700 SubV2 = SubV1;
8701 assert((WhichResult == 0) &&
8702 "In-place shuffle of concat can only have one result!");
8703 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8704 SubV1, SubV2);
8705 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8706 Res.getValue(1));
8707 }
8708 }
8709 }
8710
8711 if (ST->hasMVEIntegerOps() && EltSize <= 32 &&
8712 (ST->hasFullFP16() || VT != MVT::v8f16)) {
8713 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8714 return V;
8715
// Truncating masks: reinterpret the inputs as vectors of double-width integer
// elements, shift right by EltSize when the top halves are wanted, and combine
// the two with MVETRUNC.
8716 for (bool Top : {false, true}) {
8717 for (bool SingleSource : {false, true}) {
8718 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8719 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8720 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8721 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8722 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8723 SingleSource ? V1 : V2);
8724 if (Top) {
8725 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8726 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8727 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8728 }
8729 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8730 }
8731 }
8732 }
8733 }
8734
8735 // If the shuffle is not directly supported and it has 4 elements, use
8736 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8737 unsigned NumElts = VT.getVectorNumElements();
8738 if (NumElts == 4) {
// Each mask entry becomes a base-9 digit: 0-7 for a defined index, 8 for
// undef.
8739 unsigned PFIndexes[4];
8740 for (unsigned i = 0; i != 4; ++i) {
8741 if (ShuffleMask[i] < 0)
8742 PFIndexes[i] = 8;
8743 else
8744 PFIndexes[i] = ShuffleMask[i];
8745 }
8746
8747 // Compute the index in the perfect shuffle table.
8748 unsigned PFTableIndex =
8749 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8750 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
// The cost is read from the top two bits of the table entry.
8751 unsigned Cost = (PFEntry >> 30);
8752
8753 if (Cost <= 4) {
8754 if (ST->hasNEON())
8755 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
// Without NEON the entry is used only if it and both of its sub-entries (the
// two 13-bit IDs in the entry) pass isLegalMVEShuffleOp.
8756 else if (isLegalMVEShuffleOp(PFEntry)) {
8757 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8758 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8759 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8760 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8761 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8762 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8763 }
8764 }
8765 }
8766
8767 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8768 if (EltSize >= 32) {
8769 // Do the expansion with floating-point types, since that is what the VFP
8770 // registers are defined to use, and since i64 is not legal.
8771 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8772 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8773 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8774 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
// NOTE(review): the declaration of Ops is missing from this listing.
// The lane within the chosen source is computed as ShuffleMask[i] &
// (NumElts-1), which equals ShuffleMask[i] % NumElts only when NumElts is a
// power of two.
8776 for (unsigned i = 0; i < NumElts; ++i) {
8777 if (ShuffleMask[i] < 0)
8778 Ops.push_back(DAG.getUNDEF(EltVT));
8779 else
8780 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8781 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8782 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8783 dl, MVT::i32)));
8784 }
8785 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8786 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8787 }
8788
8789 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8790 isReverseMask(ShuffleMask, VT))
8791 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8792
8793 if (ST->hasNEON() && VT == MVT::v8i8)
8794 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8795 return NewOp;
8796
8797 if (ST->hasMVEIntegerOps())
8798 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8799 return NewOp;
8800
8801 // Lower v8f16 via v8i16 to avoid invalid f16 nodes.
8802 if (VT == MVT::v8f16 && !ST->hasFullFP16()) {
8803 SDValue BC0 =
8804 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v8i16, Op.getOperand(0));
8805 SDValue BC1 =
8806 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v8i16, Op.getOperand(1));
8807 SDValue Shuf = DAG.getVectorShuffle(MVT::v8i16, dl, BC0, BC1, ShuffleMask);
8808 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuf);
8809 }
8810
8811 return SDValue();
8812}
8813
// LowerINSERT_VECTOR_ELT_i1 (name taken from its call site and assert message;
// the first signature line is missing from this listing).
// Inserts a boolean into an MVE predicate vector: the predicate is cast to
// i32, the sign-extended i1 value is inserted into the lane's bit field with
// ARMISD::BFI, and the result is cast back to the predicate type. The lane
// index is read as a constant operand.
8815 const ARMSubtarget *ST) {
8816 EVT VecVT = Op.getOperand(0).getValueType();
8817 SDLoc dl(Op);
8818
8819 assert(ST->hasMVEIntegerOps() &&
8820 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8821
8822 SDValue Conv =
8823 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8824 unsigned Lane = Op.getConstantOperandVal(2);
// NOTE(review): the initialiser of LaneWidth is missing from this listing.
8825 unsigned LaneWidth =
// Mask covers the LaneWidth bits belonging to the lane; its complement is
// what is passed to BFI below.
8827 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8828 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8829 Op.getOperand(1), DAG.getValueType(MVT::i1));
8830 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8831 DAG.getConstant(~Mask, dl, MVT::i32));
8832 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8833}
8834
// Custom lowering for INSERT_VECTOR_ELT.
//  - Non-constant lane index: returns an empty SDValue.
//  - MVE i1 vectors: delegated to LowerINSERT_VECTOR_ELT_i1.
//  - Element types matching the type-action test below: the insertion is
//    redone on same-width integer types with bitcasts around it.
//  - Otherwise Op is returned unchanged.
// NOTE(review): two lines are missing from this listing - the right-hand side
// of the getTypeAction comparison and the tail of the assert. The in-code
// comment indicates the test is about f16 being promoted to f32.
8835SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8836 SelectionDAG &DAG) const {
8837 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8838 SDValue Lane = Op.getOperand(2);
8839 if (!isa<ConstantSDNode>(Lane))
8840 return SDValue();
8841
8842 SDValue Elt = Op.getOperand(1);
8843 EVT EltVT = Elt.getValueType();
8844
8845 if (Subtarget->hasMVEIntegerOps() &&
8846 Op.getValueType().getScalarSizeInBits() == 1)
8847 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8848
8849 if (getTypeAction(*DAG.getContext(), EltVT) ==
8851 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8852 // but the type system will try to do that if we don't intervene.
8853 // Reinterpret any such vector-element insertion as one with the
8854 // corresponding integer types.
8855
8856 SDLoc dl(Op);
8857
8858 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8859 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8861
8862 SDValue VecIn = Op.getOperand(0);
8863 EVT VecVT = VecIn.getValueType();
8864 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8865 VecVT.getVectorNumElements());
8866
8867 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8868 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8869 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8870 IVecIn, IElt, Lane);
8871 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8872 }
8873
8874 return Op;
8875}
8876
// LowerEXTRACT_VECTOR_ELT_i1 (name taken from its call site; the first
// signature line is missing from this listing).
// Extracts a lane from an MVE predicate vector by casting the predicate to
// i32 and shifting right by Lane * LaneWidth. The shifted i32 is returned
// as-is; bits above the selected lane are not masked off here.
8878 const ARMSubtarget *ST) {
8879 EVT VecVT = Op.getOperand(0).getValueType();
8880 SDLoc dl(Op);
8881
// NOTE(review): the message below names LowerINSERT_VECTOR_ELT_i1 although
// this is the extract path - looks like a copy-paste slip; left untouched
// because it is a runtime string.
8882 assert(ST->hasMVEIntegerOps() &&
8883 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8884
8885 SDValue Conv =
8886 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8887 unsigned Lane = Op.getConstantOperandVal(1);
// NOTE(review): the initialiser of LaneWidth is missing from this listing.
8888 unsigned LaneWidth =
8890 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8891 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8892 return Shift;
8893}
8894
// Custom lowering for EXTRACT_VECTOR_ELT.
// NOTE(review): the first signature line is missing from this listing;
// presumably LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, ...).
//  - Non-constant lane index: returns an empty SDValue.
//  - MVE i1 vectors: delegated to LowerEXTRACT_VECTOR_ELT_i1.
//  - i32 result from a vector with elements narrower than 32 bits: emitted as
//    ARMISD::VGETLANEu.
//  - Otherwise Op is returned unchanged.
8896 const ARMSubtarget *ST) {
8897 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8898 SDValue Lane = Op.getOperand(1);
8899 if (!isa<ConstantSDNode>(Lane))
8900 return SDValue();
8901
8902 SDValue Vec = Op.getOperand(0);
8903 EVT VT = Vec.getValueType();
8904
8905 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8906 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8907
8908 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8909 SDLoc dl(Op);
8910 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8911 }
8912
8913 return Op;
8914}
8915
// LowerCONCAT_VECTORS_i1 (name taken from its call site; the first signature
// line is missing from this listing).
// Lowers a CONCAT_VECTORS of MVE predicate vectors. Operands are combined
// pairwise, halving the operand list each round until one value remains. The
// operand count is asserted to be a power of two.
8917 const ARMSubtarget *ST) {
8918 SDLoc dl(Op);
8919 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8920 "Unexpected custom CONCAT_VECTORS lowering");
8921 assert(isPowerOf2_32(Op.getNumOperands()) &&
8922 "Unexpected custom CONCAT_VECTORS lowering");
8923 assert(ST->hasMVEIntegerOps() &&
8924 "CONCAT_VECTORS lowering only supported for MVE");
8925
// ConcatPair joins two predicates of the same type (v2i1, v4i1 or v8i1) into
// one predicate with twice as many lanes.
8926 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8927 EVT Op1VT = V1.getValueType();
8928 EVT Op2VT = V2.getValueType();
8929 assert(Op1VT == Op2VT && "Operand types don't match!");
8930 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
8931 "Unexpected i1 concat operations!");
8932 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8933
8934 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8935 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8936
8937 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8938 // promoted to v8i16, etc.
// NOTE(review): the initialiser of ElType is missing from this listing.
8939 MVT ElType =
8941 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8942
8943 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8944 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
8945 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
8946 // ConcatVT.
8947 SDValue ConVec =
8948 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
8949 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8950 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8951 }
8952
// From here on only v2i1 operands remain: v4i1 and v8i1 returned above and the
// assert admits no other types.
8953 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8954 // to be the right size for the destination. For example, if Op1 is v4i1
8955 // then the promoted vector is v4i32. The result of concatenation gives a
8956 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8957 // needs truncating to i16 and inserting in the result.
// ExtractInto copies every element of NewV into ConVec starting at position j
// and advances j. A v2f64 input is viewed as v4i32 and only every second i32
// (ExtScale == 2) is read.
8958 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8959 EVT NewVT = NewV.getValueType();
8960 EVT ConcatVT = ConVec.getValueType();
8961 unsigned ExtScale = 1;
8962 if (NewVT == MVT::v2f64) {
8963 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
8964 ExtScale = 2;
8965 }
8966 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8967 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8968 DAG.getIntPtrConstant(i * ExtScale, dl));
8969 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8970 DAG.getConstant(j, dl, MVT::i32));
8971 }
8972 return ConVec;
8973 };
8974 unsigned j = 0;
8975 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8976 ConVec = ExtractInto(NewV1, ConVec, j);
8977 ConVec = ExtractInto(NewV2, ConVec, j);
8978
8979 // Now return the result of comparing the subvector with zero, which will
8980 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8981 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8982 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8983 };
8984
8985 // Concat each pair of subvectors and pack into the lower half of the array.
8986 SmallVector<SDValue> ConcatOps(Op->ops());
8987 while (ConcatOps.size() > 1) {
8988 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8989 SDValue V1 = ConcatOps[I];
8990 SDValue V2 = ConcatOps[I + 1];
8991 ConcatOps[I / 2] = ConcatPair(V1, V2);
8992 }
8993 ConcatOps.resize(ConcatOps.size() / 2);
8994 }
8995 return ConcatOps[0];
8996}
8997
// Custom lowering for CONCAT_VECTORS.
// NOTE(review): the first signature line is missing from this listing;
// presumably LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, ...).
// MVE i1 results are delegated to LowerCONCAT_VECTORS_i1. Otherwise the node
// must have exactly two operands and a 128-bit result: each non-undef operand
// is bitcast to f64 and inserted into a v2f64, which is bitcast to the result
// type.
8999 const ARMSubtarget *ST) {
9000 EVT VT = Op->getValueType(0);
9001 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9002 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9003
9004 // The only time a CONCAT_VECTORS operation can have legal types is when
9005 // two 64-bit vectors are concatenated to a 128-bit vector.
9006 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9007 "unexpected CONCAT_VECTORS");
9008 SDLoc dl(Op);
9009 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9010 SDValue Op0 = Op.getOperand(0);
9011 SDValue Op1 = Op.getOperand(1);
// Undef halves are skipped so the matching v2f64 lane stays undef.
9012 if (!Op0.isUndef())
9013 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9014 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9015 DAG.getIntPtrConstant(0, dl));
9016 if (!Op1.isUndef())
9017 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9018 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9019 DAG.getIntPtrConstant(1, dl));
9020 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9021}
9022
// Custom lowering for EXTRACT_SUBVECTOR on MVE predicate (i1) vectors.
// NOTE(review): the first signature line and the declaration of ElType are
// missing from this listing; presumably LowerEXTRACT_SUBVECTOR(SDValue Op,
// SelectionDAG &DAG, ...).
// The source predicate is promoted to an ordinary vector, the requested lanes
// are copied one by one into a fresh vector, and that vector is compared
// against zero (NE) to rebuild a predicate.
9024 const ARMSubtarget *ST) {
9025 SDValue V1 = Op.getOperand(0);
9026 SDValue V2 = Op.getOperand(1);
9027 SDLoc dl(Op);
9028 EVT VT = Op.getValueType();
9029 EVT Op1VT = V1.getValueType();
9030 unsigned NumElts = VT.getVectorNumElements();
// The second operand is the constant start index of the subvector.
9031 unsigned Index = V2->getAsZExtVal();
9032
9033 assert(VT.getScalarSizeInBits() == 1 &&
9034 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9035 assert(ST->hasMVEIntegerOps() &&
9036 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9037
9038 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9039
9040 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9041 // promoted to v8i16, etc.
9042
9044
// Two-lane result: each extracted element is written to two adjacent i32
// lanes of a v4i32, compared as v4i1, and the compare result is cast to v2i1.
9045 if (NumElts == 2) {
9046 EVT SubVT = MVT::v4i32;
9047 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9048 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9049 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9050 DAG.getIntPtrConstant(i, dl));
9051 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9052 DAG.getConstant(j, dl, MVT::i32));
9053 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9054 DAG.getConstant(j + 1, dl, MVT::i32));
9055 }
9056 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9057 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9058 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9059 }
9060
9061 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9062 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9063 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9064 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9065 DAG.getIntPtrConstant(i, dl));
9066 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9067 DAG.getConstant(j, dl, MVT::i32));
9068 }
9069
9070 // Now return the result of comparing the subvector with zero,
9071 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9072 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9073 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9074}
9075
9076// Turn a truncate to a predicate (an i1 vector) into icmp ne (and(x, 1), 0).
// LowerTruncatei1 (name taken from its call site; the first signature line is
// missing from this listing). Only bit 0 of each source element decides the
// resulting predicate lane. The result type must be v16i1, v8i1 or v4i1.
9078 const ARMSubtarget *ST) {
9079 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9080 EVT VT = N->getValueType(0);
9081 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9082 "Expected a vector i1 type!");
9083 SDValue Op = N->getOperand(0);
9084 EVT FromVT = Op.getValueType();
9085 SDLoc DL(N);
9086
9087 SDValue And =
9088 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9089 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9090 DAG.getCondCode(ISD::SETNE));
9091}
9092
9094 const ARMSubtarget *Subtarget) {
9095 if (!Subtarget->hasMVEIntegerOps())
9096 return SDValue();
9097
9098 EVT ToVT = N->getValueType(0);
9099 if (ToVT.getScalarType() == MVT::i1)
9100 return LowerTruncatei1(N, DAG, Subtarget);
9101
9102 // MVE does not have a single instruction to perform the truncation of a v4i32
9103 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9104 // Most of the instructions in MVE follow the 'Beats' system, where moving
9105 // values from different lanes is usually something that the instructions
9106 // avoid.
9107 //
9108 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9109 // which take a the top/bottom half of a larger lane and extend it (or do the
9110 // opposite, truncating into the top/bottom lane from a larger lane). Note
9111 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9112 // bottom 16bits from each vector lane. This works really well with T/B
9113 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9114 // to move order.
9115 //
9116 // But truncates and sext/zext are always going to be fairly common from llvm.
9117 // We have several options for how to deal with them:
9118 // - Wherever possible combine them into an instruction that makes them
9119 // "free". This includes loads/stores, which can perform the trunc as part
9120 // of the memory operation. Or certain shuffles that can be turned into
9121 // VMOVN/VMOVL.
9122 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9123 // trunc(mul(sext(a), sext(b))) may become
9124 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9125 // this case can use VMULL). This is performed in the
9126 // MVELaneInterleavingPass.
9127 // - Otherwise we have an option. By default we would expand the
9128 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9129 // registers. One for each vector lane in the vector. This can obviously be
9130 // very expensive.
9131 // - The other option is to use the fact that loads/store can extend/truncate
9132 // to turn a trunc into two truncating stack stores and a stack reload. This
9133 // becomes 3 back-to-back memory operations, but at least that is less than
9134 // all the insert/extracts.
9135 //
9136 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9137 // are either optimized where they can be, or eventually lowered into stack
9138 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9139 // too early, where other instructions would be better, and stops us from
9140 // having to reconstruct multiple buildvector shuffles into loads/stores.
9141 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9142 return SDValue();
9143 EVT FromVT = N->getOperand(0).getValueType();
9144 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9145 return SDValue();
9146
9147 SDValue Lo, Hi;
9148 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9149 SDLoc DL(N);
9150 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9151}
9152
// Custom lowering of the extending node N on MVE targets. Only v8i16/v16i8
// sources widened to v16i32/v8i32/v16i16 are handled; for any other type, or
// when the subtarget has no MVE integer ops, an empty SDValue is returned.
// NOTE(review): listing line 9153 (the start of the signature: return type,
// function name and leading parameters) is absent from this extract.
9154 const ARMSubtarget *Subtarget) {
9155 if (!Subtarget->hasMVEIntegerOps())
9156 return SDValue();
9157
9158 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9159
9160 EVT ToVT = N->getValueType(0);
9161 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9162 return SDValue();
9163 SDValue Op = N->getOperand(0);
9164 EVT FromVT = Op.getValueType();
9165 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9166 return SDValue();
9167
9168 SDLoc DL(N);
     // Each of the two results of the new node holds half of the elements.
9169 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
     // An i8 -> i32 extend is done in two steps: first to v8i16 halves here,
     // then each half is extended again to v8i32 further down.
9170 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9171 ExtVT = MVT::v8i16;
9172
     // NOTE(review): listing line 9174 (the initializer of Opcode) is absent
     // from this extract.
9173 unsigned Opcode =
9175 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9176 SDValue Ext1 = Ext.getValue(1);
9177
     // Second step of the i8 -> i32 case: re-apply the original extend opcode
     // to both v8i16 halves.
9178 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9179 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9180 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9181 }
9182
     // Glue the two halves back together into the requested result type.
9183 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9184}
9185
9186/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9187/// element has been zero/sign-extended, depending on the isSigned parameter,
9188/// from an integer type half its size.
/// A v2i64 value is accepted in the form of a BITCAST from a v4i32
/// BUILD_VECTOR; in that case the high 32-bit words are compared against the
/// low ones (signed) or against zero (unsigned).
/// NOTE(review): listing lines 9189 (start of the signature), 9200-9203 (the
/// definitions of Lo0/Hi0/Lo1/Hi1) and 9222 (the opening of the per-element
/// check that defines C) are absent from this extract.
9190 bool isSigned) {
9191 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9192 EVT VT = N->getValueType(0);
9193 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9194 SDNode *BVN = N->getOperand(0).getNode();
9195 if (BVN->getValueType(0) != MVT::v4i32 ||
9196 BVN->getOpcode() != ISD::BUILD_VECTOR)
9197 return false;
     // Index of the low/high 32-bit word within each 64-bit element depends on
     // the target endianness.
9198 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9199 unsigned HiElt = 1 - LoElt;
     // NOTE(review): Lo0/Hi0/Lo1/Hi1 are defined on the missing lines
     // 9200-9203; presumably they are the constant operands selected with
     // LoElt/HiElt - confirm against the full file.
9204 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9205 return false;
9206 if (isSigned) {
     // Sign-extended: the high word must equal the low value shifted right by
     // 32, for both 64-bit elements.
9207 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9208 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9209 return true;
9210 } else {
     // Zero-extended: both high words must be zero.
9211 if (Hi0->isZero() && Hi1->isZero())
9212 return true;
9213 }
9214 return false;
9215 }
9216
9217 if (N->getOpcode() != ISD::BUILD_VECTOR)
9218 return false;
9219
     // Every operand must be a constant that fits in half the element width.
9220 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9221 SDNode *Elt = N->getOperand(i).getNode();
9223 unsigned EltSize = VT.getScalarSizeInBits();
9224 unsigned HalfSize = EltSize / 2;
9225 if (isSigned) {
9226 if (!isIntN(HalfSize, C->getSExtValue()))
9227 return false;
9228 } else {
9229 if (!isUIntN(HalfSize, C->getZExtValue()))
9230 return false;
9231 }
9232 continue;
9233 }
9234 return false;
9235 }
9236
9237 return true;
9238}
9239
9240/// isSignExtended - Check if a node is a vector value that is sign-extended
9241/// or a constant BUILD_VECTOR with sign-extended elements.
/// Accepts a SIGN_EXTEND node, a sign-extending load, or anything that
/// isExtendedBUILD_VECTOR recognises with isSigned == true.
/// NOTE(review): listing line 9242 (the signature) is absent from this extract.
9243 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9244 return true;
9245 if (isExtendedBUILD_VECTOR(N, DAG, true))
9246 return true;
9247 return false;
9248}
9249
9250/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9251/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
/// NOTE(review): listing lines 9252 (the signature) and 9254 (the last operand
/// of the || chain below) are absent from this extract.
9253 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9255 return true;
9256 if (isExtendedBUILD_VECTOR(N, DAG, false))
9257 return true;
9258 return false;
9259}
9260
9261static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9262 if (OrigVT.getSizeInBits() >= 64)
9263 return OrigVT;
9264
9265 assert(OrigVT.isSimple() && "Expecting a simple value type");
9266
9267 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9268 switch (OrigSimpleTy) {
9269 default: llvm_unreachable("Unexpected Vector Type");
9270 case MVT::v2i8:
9271 case MVT::v2i16:
9272 return MVT::v2i32;
9273 case MVT::v4i8:
9274 return MVT::v4i16;
9275 }
9276}
9277
9278/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9279/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9280/// We insert the required extension here to get the vector to fill a D register.
/// Returns N unchanged when OrigTy is already at least 64 bits wide; otherwise
/// returns an ExtOpcode node extending N to getExtensionTo64Bits(OrigTy).
/// NOTE(review): listing line 9281 (start of the signature) is absent from
/// this extract.
9282 const EVT &OrigTy,
9283 const EVT &ExtTy,
9284 unsigned ExtOpcode) {
9285 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9286 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9287 // 64-bits we need to insert a new extension so that it will be 64-bits.
9288 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9289 if (OrigTy.getSizeInBits() >= 64)
9290 return N;
9291
9292 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9293 EVT NewVT = getExtensionTo64Bits(OrigTy);
9294
9295 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9296}
9297
9298/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9299/// does not do any sign/zero extension. If the original vector is less
9300/// than 64 bits, an appropriate extension will be added after the load to
9301/// reach a total size of 64 bits. We have to add the extension separately
9302/// because ARM does not have a sign/zero extending load for vectors.
/// NOTE(review): listing line 9303 (the signature) is absent from this extract.
9304 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9305
9306 // The load already has the right type.
9307 if (ExtendedTy == LD->getMemoryVT())
9308 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9309 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9310 LD->getMemOperand()->getFlags());
9311
9312 // We need to create a zextload/sextload. We cannot just create a load
9313 // followed by a sext/zext node because LowerMUL is also run during normal
9314 // operation legalization where we can't create illegal types.
9315 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9316 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9317 LD->getMemoryVT(), LD->getAlign(),
9318 LD->getMemOperand()->getFlags());
9319}
9320
9321/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9322/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9323/// the unextended value. The unextended vector should be 64 bits so that it can
9324/// be used as an operand to a VMULL instruction. If the original vector size
9325/// before extension is less than 64 bits we add an extension to resize
9326/// the vector to 64 bits.
/// NOTE(review): listing lines 9327 (the signature), 9353 (start of the assert
/// in the BITCAST case) and 9366 (the declaration of Ops) are absent from this
/// extract.
9328 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9329 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9330 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9331 N->getOperand(0)->getValueType(0),
9332 N->getValueType(0),
9333 N->getOpcode());
9334
9335 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9336 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9337 "Expected extending load");
9338
     // Replace the extending load: its chain result is redirected to the new
     // load and its value result to an explicit extend of the new load, so
     // other users of the old load keep seeing the same value.
9339 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9340 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9341 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9342 SDValue extLoad =
9343 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9344 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9345
9346 return newLoad;
9347 }
9348
9349 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9350 // have been legalized as a BITCAST from v4i32.
9351 if (N->getOpcode() == ISD::BITCAST) {
9352 SDNode *BVN = N->getOperand(0).getNode();
9354 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
     // Keep only the low 32-bit word of each 64-bit element.
9355 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9356 return DAG.getBuildVector(
9357 MVT::v2i32, SDLoc(N),
9358 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9359 }
9360 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9361 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9362 EVT VT = N->getValueType(0);
9363 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9364 unsigned NumElts = VT.getVectorNumElements();
9365 MVT TruncVT = MVT::getIntegerVT(EltSize);
9367 SDLoc dl(N);
9368 for (unsigned i = 0; i != NumElts; ++i) {
9369 const APInt &CInt = N->getConstantOperandAPInt(i);
9370 // Element types smaller than 32 bits are not legal, so use i32 elements.
9371 // The values are implicitly truncated so sext vs. zext doesn't matter.
9372 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9373 }
9374 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9375}
9376
9377static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9378 unsigned Opcode = N->getOpcode();
9379 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9380 SDNode *N0 = N->getOperand(0).getNode();
9381 SDNode *N1 = N->getOperand(1).getNode();
9382 return N0->hasOneUse() && N1->hasOneUse() &&
9383 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9384 }
9385 return false;
9386}
9387
9388static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9389 unsigned Opcode = N->getOpcode();
9390 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9391 SDNode *N0 = N->getOperand(0).getNode();
9392 SDNode *N1 = N->getOperand(1).getNode();
9393 return N0->hasOneUse() && N1->hasOneUse() &&
9394 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9395 }
9396 return false;
9397}
9398
// Custom lowering for a 128-bit integer vector multiply (Op). When both
// operands are sign- or zero-extended values the multiply becomes a
// VMULLs/VMULLu on the unextended operands. A multiply of an extended add/sub
// by an extended value is split into two VMULLs recombined with the original
// add/sub opcode. If no pattern matches, v2i64 returns an empty SDValue (to be
// expanded) and every other type returns Op unchanged.
// NOTE(review): listing lines 9399 (the signature) and 9450 (start of the
// assert on the operand types) are absent from this extract.
9400 // Multiplications are only custom-lowered for 128-bit vectors so that
9401 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9402 EVT VT = Op.getValueType();
9403 assert(VT.is128BitVector() && VT.isInteger() &&
9404 "unexpected type for custom-lowering ISD::MUL");
9405 SDNode *N0 = Op.getOperand(0).getNode();
9406 SDNode *N1 = Op.getOperand(1).getNode();
     // NewOpc stays 0 until a VMULL form has been selected.
9407 unsigned NewOpc = 0;
9408 bool isMLA = false;
9409 bool isN0SExt = isSignExtended(N0, DAG);
9410 bool isN1SExt = isSignExtended(N1, DAG);
9411 if (isN0SExt && isN1SExt)
9412 NewOpc = ARMISD::VMULLs;
9413 else {
9414 bool isN0ZExt = isZeroExtended(N0, DAG);
9415 bool isN1ZExt = isZeroExtended(N1, DAG);
9416 if (isN0ZExt && isN1ZExt)
9417 NewOpc = ARMISD::VMULLu;
9418 else if (isN1SExt || isN1ZExt) {
9419 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9420 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9421 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9422 NewOpc = ARMISD::VMULLs;
9423 isMLA = true;
9424 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9425 NewOpc = ARMISD::VMULLu;
9426 isMLA = true;
9427 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
     // Canonicalize so that N0 is always the add/sub and N1 the plain extend.
9428 std::swap(N0, N1);
9429 NewOpc = ARMISD::VMULLu;
9430 isMLA = true;
9431 }
9432 }
9433
9434 if (!NewOpc) {
9435 if (VT == MVT::v2i64)
9436 // Fall through to expand this. It is not legal.
9437 return SDValue();
9438 else
9439 // Other vector multiplications are legal.
9440 return Op;
9441 }
9442 }
9443
9444 // Legalize to a VMULL instruction.
9445 SDLoc DL(Op);
9446 SDValue Op0;
9447 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9448 if (!isMLA) {
9449 Op0 = SkipExtensionForVMULL(N0, DAG);
9451 Op1.getValueType().is64BitVector() &&
9452 "unexpected types for extended operands to VMULL");
9453 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9454 }
9455
9456 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9457 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9458 // vmull q0, d4, d6
9459 // vmlal q0, d5, d6
9460 // is faster than
9461 // vaddl q0, d4, d5
9462 // vmovl q1, d6
9463 // vmul q0, q0, q1
9464 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9465 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9466 EVT Op1VT = Op1.getValueType();
     // Both add/sub inputs are bitcast to Op1's type before forming the VMULLs.
9467 return DAG.getNode(N0->getOpcode(), DL, VT,
9468 DAG.getNode(NewOpc, DL, VT,
9469 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9470 DAG.getNode(NewOpc, DL, VT,
9471 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9472}
9473
// Signed division of X by Y through a floating-point reciprocal estimate with
// no refinement step. Both inputs are sign-extended to v4i32 and the result is
// returned as v4i16.
// NOTE(review): listing line 9474 (start of the signature) is absent from this
// extract; the call sites in this file suggest this is LowerSDIV_v4i8 taking
// (X, Y, dl, DAG) - confirm against the full file.
9475 SelectionDAG &DAG) {
9476 // TODO: Should this propagate fast-math-flags?
9477
9478 // Convert to float
9479 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9480 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9481 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9482 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9483 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9484 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9485 // Get reciprocal estimate.
9486 // float4 recip = vrecpeq_f32(yf);
9487 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9488 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9489 Y);
9490 // Because char has a smaller range than uchar, we can actually get away
9491 // without any newton steps. This requires that we use a weird bias
9492 // of 0xb000, however (again, this has been exhaustively tested).
9493 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9494 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
     // The bias is added to the raw bit pattern of the float product, hence the
     // bitcasts around the integer ADD.
9495 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9496 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9497 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9498 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9499 // Convert back to short.
9500 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9501 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9502 return X;
9503}
9504
// Signed division of N0 by N1 through a floating-point reciprocal estimate
// refined by one vrecps step. Both inputs are sign-extended to v4i32 and the
// result is returned as v4i16.
// NOTE(review): listing line 9505 (start of the signature) is absent from this
// extract; the call sites in this file suggest this is LowerSDIV_v4i16 taking
// (N0, N1, dl, DAG) - confirm against the full file.
9506 SelectionDAG &DAG) {
9507 // TODO: Should this propagate fast-math-flags?
9508
9509 SDValue N2;
9510 // Convert to float.
9511 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9512 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9513 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9514 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9515 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9516 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9517
9518 // Use reciprocal estimate and one refinement step.
9519 // float4 recip = vrecpeq_f32(yf);
9520 // recip *= vrecpsq_f32(yf, recip);
9521 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9522 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9523 N1);
9524 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9525 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9526 N1, N2);
9527 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9528 // Because short has a smaller range than ushort, we can actually get away
9529 // with only a single newton step. This requires that we use a weird bias
9530 // of 0x89, however (again, this has been exhaustively tested).
9531 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9532 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
     // The bias is added to the raw bit pattern of the float product, hence the
     // bitcasts around the integer ADD.
9533 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9534 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9535 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9536 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9537 // Convert back to integer and return.
9538 // return vmovn_s32(vcvt_s32_f32(result));
9539 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9540 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9541 return N0;
9542}
9543
// Custom lowering of a v4i16 or v8i8 signed divide. v8i8 is sign-extended to
// v8i16, split into two v4i16 halves that are each divided with
// LowerSDIV_v4i8, then concatenated and truncated back to v8i8. v4i16 goes
// straight to LowerSDIV_v4i16.
// NOTE(review): listing line 9544 (start of the signature) is absent from this
// extract.
9545 const ARMSubtarget *ST) {
9546 EVT VT = Op.getValueType();
9547 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9548 "unexpected type for custom-lowering ISD::SDIV");
9549
9550 SDLoc dl(Op);
9551 SDValue N0 = Op.getOperand(0);
9552 SDValue N1 = Op.getOperand(1);
9553 SDValue N2, N3;
9554
9555 if (VT == MVT::v8i8) {
9556 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9557 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9558
     // N2/N3 take the upper four lanes, N0/N1 are narrowed to the lower four.
9559 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9560 DAG.getIntPtrConstant(4, dl));
9561 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9562 DAG.getIntPtrConstant(4, dl));
9563 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9564 DAG.getIntPtrConstant(0, dl));
9565 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9566 DAG.getIntPtrConstant(0, dl));
9567
9568 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9569 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9570
9571 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9572 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9573
9574 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9575 return N0;
9576 }
9577 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9578}
9579
// Custom lowering of a v4i16 or v8i8 unsigned divide. v8i8 is zero-extended to
// v8i16, split into two v4i16 halves divided with the signed v4i16 helper,
// concatenated, and narrowed back to v8i8 with the vqmovnsu intrinsic. v4i16
// is handled inline with a reciprocal estimate and two refinement steps.
// NOTE(review): listing line 9580 (start of the signature) is absent from this
// extract.
9581 const ARMSubtarget *ST) {
9582 // TODO: Should this propagate fast-math-flags?
9583 EVT VT = Op.getValueType();
9584 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9585 "unexpected type for custom-lowering ISD::UDIV");
9586
9587 SDLoc dl(Op);
9588 SDValue N0 = Op.getOperand(0);
9589 SDValue N1 = Op.getOperand(1);
9590 SDValue N2, N3;
9591
9592 if (VT == MVT::v8i8) {
9593 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9594 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9595
     // N2/N3 take the upper four lanes, N0/N1 are narrowed to the lower four.
9596 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9597 DAG.getIntPtrConstant(4, dl));
9598 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9599 DAG.getIntPtrConstant(4, dl));
9600 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9601 DAG.getIntPtrConstant(0, dl));
9602 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9603 DAG.getIntPtrConstant(0, dl));
9604
     // NOTE(review): the signed helper is used for an unsigned divide;
     // presumably fine because the zero-extended i8 inputs are non-negative
     // as i16 - confirm.
9605 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9606 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9607
9608 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9609 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9610
9611 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9612 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9613 MVT::i32),
9614 N0);
9615 return N0;
9616 }
9617
9618 // v4i16 udiv ... Convert to float.
9619 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9620 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9621 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9622 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9623 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
     // BN1 keeps the float divisor alive while N1/N2 are reused as temporaries.
9624 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9625
9626 // Use reciprocal estimate and two refinement steps.
9627 // float4 recip = vrecpeq_f32(yf);
9628 // recip *= vrecpsq_f32(yf, recip);
9629 // recip *= vrecpsq_f32(yf, recip);
9630 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9631 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9632 BN1);
9633 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9634 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9635 BN1, N2);
9636 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9637 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9638 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9639 BN1, N2);
9640 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9641 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9642 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9643 // and that it will never cause us to return an answer too large).
9644 // float4 result = as_float4(as_int4(xf*recip) + 2);
9645 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9646 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9647 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9648 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9649 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9650 // Convert back to integer and return.
9651 // return vmovn_u32(vcvt_s32_f32(result));
9652 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9653 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9654 return N0;
9655}
9656
// Lowers a three-operand add/sub-with-carry node (LHS, RHS, carry-in) to the
// given ARM Opcode. The carry-in is converted with valueToCarryFlag, and the
// flag result is converted back to a value of the node's second result type:
// via overflowFlagToValue when IsSigned, otherwise via carryFlagToValue. The
// carry is inverted on the way in and out when Opcode is ARMISD::SUBE.
// NOTE(review): listing line 9657 (start of the signature) is absent from this
// extract.
9658 unsigned Opcode, bool IsSigned) {
9659 EVT VT0 = Op.getValue(0).getValueType();
9660 EVT VT1 = Op.getValue(1).getValueType();
9661
9662 bool InvertCarry = Opcode == ARMISD::SUBE;
9663 SDValue OpLHS = Op.getOperand(0);
9664 SDValue OpRHS = Op.getOperand(1);
9665 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
9666
9667 SDLoc DL(Op);
9668
     // Result 0 is the arithmetic value, result 1 the i32 flag output.
9669 SDValue Result = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::i32), OpLHS,
9670 OpRHS, OpCarryIn);
9671
9672 SDValue OutFlag =
9673 IsSigned ? overflowFlagToValue(Result.getValue(1), VT1, DAG)
9674 : carryFlagToValue(Result.getValue(1), VT1, DAG, InvertCarry);
9675
9676 return DAG.getMergeValues({Result, OutFlag}, DL);
9677}
9678
// Emits a call to the SDIVREM/UDIVREM libcall (I32 or I64 variant, chosen by
// Signed and the type of Op) on the given Chain, and returns the first element
// of the pair produced by LowerCallTo. Op must be of type i32 or i64.
// NOTE(review): listing lines 9697 (the declaration of Args) and 9707 (the
// first argument of setCallee) are absent from this extract.
9679SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9680 bool Signed,
9681 SDValue &Chain) const {
9682 EVT VT = Op.getValueType();
9683 assert((VT == MVT::i32 || VT == MVT::i64) &&
9684 "unexpected type for custom lowering DIV");
9685 SDLoc dl(Op);
9686
9687 const auto &DL = DAG.getDataLayout();
9688 RTLIB::Libcall LC;
9689 if (Signed)
9690 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9691 else
9692 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9693
9694 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
9695 SDValue ES = DAG.getExternalSymbol(LCImpl, getPointerTy(DL));
9696
9698
     // Arguments are passed in reverse order: operand 1 first, then operand 0.
9699 for (auto AI : {1, 0}) {
9700 SDValue Operand = Op.getOperand(AI);
9701 Args.emplace_back(Operand,
9702 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9703 }
9704
9705 CallLoweringInfo CLI(DAG);
9706 CLI.setDebugLoc(dl).setChain(Chain).setCallee(
9708 VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args));
9709
9710 return LowerCallTo(CLI).first;
9711}
9712
9713// This is a code size optimisation: return the original SDIV node to
9714// DAGCombiner when we don't want to expand SDIV into a sequence of
9715// instructions, and an empty node otherwise which will cause the
9716// SDIV to be expanded in DAGCombine.
9717SDValue
9718ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9719 SelectionDAG &DAG,
9720 SmallVectorImpl<SDNode *> &Created) const {
9721 // TODO: Support SREM
9722 if (N->getOpcode() != ISD::SDIV)
9723 return SDValue();
9724
9725 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9726 const bool MinSize = ST.hasMinSize();
9727 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9728 : ST.hasDivideInARMMode();
9729
9730 // Don't touch vector types; rewriting this may lead to scalarizing
9731 // the int divs.
9732 if (N->getOperand(0).getValueType().isVector())
9733 return SDValue();
9734
9735 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9736 // hwdiv support for this to be really profitable.
9737 if (!(MinSize && HasDivide))
9738 return SDValue();
9739
9740 // ARM mode is a bit simpler than Thumb: we can handle large power
9741 // of 2 immediates with 1 mov instruction; no further checks required,
9742 // just return the sdiv node.
9743 if (!ST.isThumb())
9744 return SDValue(N, 0);
9745
9746 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9747 // and thus lose the code size benefits of a MOVS that requires only 2.
9748 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9749 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9750 if (Divisor.sgt(128))
9751 return SDValue();
9752
9753 return SDValue(N, 0);
9754}
9755
9756SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9757 bool Signed) const {
9758 assert(Op.getValueType() == MVT::i32 &&
9759 "unexpected type for custom lowering DIV");
9760 SDLoc dl(Op);
9761
9762 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9763 DAG.getEntryNode(), Op.getOperand(1));
9764
9765 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9766}
9767
// Builds a WIN__DBZCHK node, chained on InChain, for operand 1 of N. For an
// i32 node the operand is checked directly; otherwise it is split into two
// i32 halves and the check is applied to their OR.
// NOTE(review): listing line 9768 (the signature) is absent from this extract;
// a call in this file suggests the name WinDBZCheckDenominator with parameters
// (DAG, N, InChain) - confirm against the full file.
9769 SDLoc DL(N);
9770 SDValue Op = N->getOperand(1);
9771 if (N->getValueType(0) == MVT::i32)
9772 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9773 SDValue Lo, Hi;
9774 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9775 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9776 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9777}
9778
// Expands an i64 divide for Windows: emits the divide-by-zero check on the
// denominator, calls the division libcall, splits the i64 result into two i32
// halves and appends their BUILD_PAIR to Results.
// NOTE(review): listing line 9781 (the end of the parameter list, which
// declares Results) is absent from this extract.
9779void ARMTargetLowering::ExpandDIV_Windows(
9780 SDValue Op, SelectionDAG &DAG, bool Signed,
9782 const auto &DL = DAG.getDataLayout();
9783
9784 assert(Op.getValueType() == MVT::i64 &&
9785 "unexpected type for custom lowering DIV");
9786 SDLoc dl(Op);
9787
9788 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9789
9790 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9791
     // Low half by truncation, high half by shifting right 32 bits first.
9792 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9793 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9794 DAG.getConstant(32, dl, getPointerTy(DL)));
9795 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9796
9797 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9798}
9799
// Lowers an i32/i64 load that may be unaligned. Returns a (value, chain) pair,
// or a pair of empty SDValues when nothing needs to be done (other memory
// types, or an access the target allows as-is).
// NOTE(review): listing lines 9832-9833 (the rest of the ExtType initializer)
// are absent from this extract.
9800std::pair<SDValue, SDValue>
9801ARMTargetLowering::LowerAEABIUnalignedLoad(SDValue Op,
9802 SelectionDAG &DAG) const {
9803 // If we have an unaligned load from a i32 or i64 that would normally be
9804 // split into separate ldrb's, we can use the __aeabi_uread4/__aeabi_uread8
9805 // functions instead.
9806 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9807 EVT MemVT = LD->getMemoryVT();
9808 if (MemVT != MVT::i32 && MemVT != MVT::i64)
9809 return std::make_pair(SDValue(), SDValue());
9810
9811 const auto &MF = DAG.getMachineFunction();
9812 unsigned AS = LD->getAddressSpace();
9813 Align Alignment = LD->getAlign();
9814 const DataLayout &DL = DAG.getDataLayout();
9815 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9816
     // The libcall is only used for minsize functions on subtargets without
     // unaligned memory access, and only for alignments of at most 2.
9817 if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
9818 Alignment <= llvm::Align(2)) {
9819
9820 RTLIB::Libcall LC =
9821 (MemVT == MVT::i32) ? RTLIB::AEABI_UREAD4 : RTLIB::AEABI_UREAD8;
9822
9823 MakeLibCallOptions Opts;
9824 SDLoc dl(Op);
9825
9826 auto Pair = makeLibCall(DAG, LC, MemVT.getSimpleVT(), LD->getBasePtr(),
9827 Opts, dl, LD->getChain());
9828
9829 // If necessary, extend the node to 64bit
9830 if (LD->getExtensionType() != ISD::NON_EXTLOAD) {
9831 unsigned ExtType = LD->getExtensionType() == ISD::SEXTLOAD
9834 SDValue EN = DAG.getNode(ExtType, dl, LD->getValueType(0), Pair.first);
9835 Pair.first = EN;
9836 }
9837 return Pair;
9838 }
9839
9840 // Default expand to individual loads
9841 if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
9842 return expandUnalignedLoad(LD, DAG);
9843 return std::make_pair(SDValue(), SDValue());
9844}
9845
9846SDValue ARMTargetLowering::LowerAEABIUnalignedStore(SDValue Op,
9847 SelectionDAG &DAG) const {
9848 // If we have an unaligned store to a i32 or i64 that would normally be
9849 // split into separate ldrb's, we can use the __aeabi_uwrite4/__aeabi_uwrite8
9850 // functions instead.
9851 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9852 EVT MemVT = ST->getMemoryVT();
9853 if (MemVT != MVT::i32 && MemVT != MVT::i64)
9854 return SDValue();
9855
9856 const auto &MF = DAG.getMachineFunction();
9857 unsigned AS = ST->getAddressSpace();
9858 Align Alignment = ST->getAlign();
9859 const DataLayout &DL = DAG.getDataLayout();
9860 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9861
9862 if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
9863 Alignment <= llvm::Align(2)) {
9864
9865 SDLoc dl(Op);
9866
9867 // If necessary, trunc the value to 32bit
9868 SDValue StoreVal = ST->getOperand(1);
9869 if (ST->isTruncatingStore())
9870 StoreVal = DAG.getNode(ISD::TRUNCATE, dl, MemVT, ST->getOperand(1));
9871
9872 RTLIB::Libcall LC =
9873 (MemVT == MVT::i32) ? RTLIB::AEABI_UWRITE4 : RTLIB::AEABI_UWRITE8;
9874
9875 MakeLibCallOptions Opts;
9876 auto CallResult =
9877 makeLibCall(DAG, LC, MVT::isVoid, {StoreVal, ST->getBasePtr()}, Opts,
9878 dl, ST->getChain());
9879
9880 return CallResult.second;
9881 }
9882
9883 // Default expand to individual stores
9884 if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
9885 return expandUnalignedStore(ST, DAG);
9886 return SDValue();
9887}
9888
// Lowers a non-extending, unindexed load of an MVE predicate type
// (v2i1/v4i1/v8i1/v16i1) into a scalar i32 extending load followed by a
// PREDICATE_CAST to v16i1 and, for the narrower types, an EXTRACT_SUBVECTOR.
// Returns the predicate merged with the new load's chain.
// NOTE(review): listing lines 9889 (the signature) and 9913 (arguments of
// getExtLoad between the base pointer and the memory operand) are absent from
// this extract.
9890 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9891 EVT MemVT = LD->getMemoryVT();
9892 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9893 MemVT == MVT::v16i1) &&
9894 "Expected a predicate type!");
9895 assert(MemVT == Op.getValueType());
9896 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9897 "Expected a non-extending load");
9898 assert(LD->isUnindexed() && "Expected a unindexed load");
9899
9900 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
9901 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9902 // need to make sure that 8/4/2 bits are actually loaded into the correct
9903 // place, which means loading the value and then shuffling the values into
9904 // the bottom bits of the predicate.
9905 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
9906 // for BE).
9907 // Speaking of BE, apparently the rest of llvm will assume a reverse order to
9908 // a natural VMSR(load), so needs to be reversed.
9909
9910 SDLoc dl(Op);
9911 SDValue Load = DAG.getExtLoad(
9912 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
9914 LD->getMemOperand());
9915 SDValue Val = Load;
     // Big-endian: bit-reverse the loaded word and shift the MemVT-sized field
     // back down to the bottom bits.
9916 if (DAG.getDataLayout().isBigEndian())
9917 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
9918 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
9919 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
9920 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
9921 if (MemVT != MVT::v16i1)
9922 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9923 DAG.getConstant(0, dl, MVT::i32));
9924 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9925}
9926
// Custom lowering of load N; replacement values are appended to Results
// (value first, then chain). A volatile i64 load with sufficient alignment on
// a V5TE, non-Thumb1 subtarget becomes an ARMISD::LDRD whose two i32 results
// are combined with BUILD_PAIR. Other i32/i64 loads are passed to
// LowerAEABIUnalignedLoad, and Results is left untouched if that produces
// nothing.
// NOTE(review): listing line 9937 (the start of the statement that defines
// Result) is absent from this extract.
9927void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9928 SelectionDAG &DAG) const {
9929 LoadSDNode *LD = cast<LoadSDNode>(N);
9930 EVT MemVT = LD->getMemoryVT();
9931
9932 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9933 !Subtarget->isThumb1Only() && LD->isVolatile() &&
9934 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9935 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9936 SDLoc dl(N);
9938 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9939 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
     // Which LDRD result is the low half depends on endianness.
9940 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
9941 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
9942 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
9943 Results.append({Pair, Result.getValue(2)});
9944 } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
9945 auto Pair = LowerAEABIUnalignedLoad(SDValue(N, 0), DAG);
9946 if (Pair.first) {
9947 Results.push_back(Pair.first);
9948 Results.push_back(Pair.second);
9949 }
9950 }
9951}
9952
// Lowers a non-truncating, unindexed store of an MVE predicate type
// (v2i1/v4i1/v8i1/v16i1): the predicate is widened to v16i1 if necessary, cast
// to an i32 with PREDICATE_CAST and written with a truncating scalar store.
// NOTE(review): listing lines 9953 (the signature), 9968 (the declaration of
// Ops) and 9987 (arguments of getTruncStore between the base pointer and the
// memory operand) are absent from this extract; a call in this file suggests
// the name LowerPredicateStore - confirm against the full file.
9954 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9955 EVT MemVT = ST->getMemoryVT();
9956 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9957 MemVT == MVT::v16i1) &&
9958 "Expected a predicate type!");
9959 assert(MemVT == ST->getValue().getValueType());
9960 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
9961 assert(ST->isUnindexed() && "Expected a unindexed store");
9962
9963 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
9964 // top bits unset and a scalar store.
9965 SDLoc dl(Op);
9966 SDValue Build = ST->getValue();
9967 if (MemVT != MVT::v16i1) {
9969 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
     // On big-endian targets the elements are taken in reverse order.
9970 unsigned Elt = DAG.getDataLayout().isBigEndian()
9971 ? MemVT.getVectorNumElements() - I - 1
9972 : I;
9973 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
9974 DAG.getConstant(Elt, dl, MVT::i32)));
9975 }
     // Pad the remaining lanes up to 16 with undef.
9976 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
9977 Ops.push_back(DAG.getUNDEF(MVT::i32));
9978 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
9979 }
9980 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
     // A full v16i1 on big-endian is bit-reversed and shifted down by 16.
9981 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
9982 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
9983 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
9984 DAG.getConstant(16, dl, MVT::i32));
9985 return DAG.getTruncStore(
9986 ST->getChain(), dl, GRP, ST->getBasePtr(),
9988 ST->getMemOperand());
9989}
9990
9991SDValue ARMTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG,
9992 const ARMSubtarget *Subtarget) const {
9993 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9994 EVT MemVT = ST->getMemoryVT();
9995
9996 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9997 !Subtarget->isThumb1Only() && ST->isVolatile() &&
9998 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9999 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10000 SDNode *N = Op.getNode();
10001 SDLoc dl(N);
10002
10003 SDValue Lo = DAG.getNode(
10004 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10005 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10006 MVT::i32));
10007 SDValue Hi = DAG.getNode(
10008 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10009 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10010 MVT::i32));
10011
10012 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10013 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10014 MemVT, ST->getMemOperand());
10015 } else if (Subtarget->hasMVEIntegerOps() &&
10016 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10017 MemVT == MVT::v16i1))) {
10018 return LowerPredicateStore(Op, DAG);
10019 } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
10020 return LowerAEABIUnalignedStore(Op, DAG);
10021 }
10022 return SDValue();
10023}
10024
10025static bool isZeroVector(SDValue N) {
10026 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10027 (N->getOpcode() == ARMISD::VMOVIMM &&
10028 isNullConstant(N->getOperand(0))));
10029}
10030
// Body of the masked-load lowering (presumably LowerMLOAD, as dispatched for
// ISD::MLOAD; the signature and the declaration of N are not visible in this
// excerpt). The load is re-emitted with an all-zero passthru, and a VSELECT
// is added when the original passthru carried real data.
10033 MVT VT = Op.getSimpleValueType();
10034 SDValue Mask = N->getMask();
10035 SDValue PassThru = N->getPassThru();
10036 SDLoc dl(Op);
10037
// A zero passthru needs no rewriting; return the node unchanged.
10038 if (isZeroVector(PassThru))
10039 return Op;
10040
10041 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10042 // zero too, and other values are lowered to a select.
10043 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10044 DAG.getTargetConstant(0, dl, MVT::i32));
// Recreate the masked load with the same operands except for the passthru.
10045 SDValue NewLoad = DAG.getMaskedLoad(
10046 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10047 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10048 N->getExtensionType(), N->isExpandingLoad());
10049 SDValue Combo = NewLoad;
// A zero vector hidden behind a BITCAST / VECTOR_REG_CAST also counts as a
// zero passthru, so no select is needed for it either.
10050 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10051 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10052 isZeroVector(PassThru->getOperand(0));
10053 if (!PassThru.isUndef() && !PassThruIsCastZero)
10054 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
// Return the (possibly selected) value together with the new load's chain.
10055 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10056}
10057
// Expands a VECREDUCE_{FADD,FMUL,MUL,AND,OR,XOR,FMAX,FMIN} node into vector
// reverse/combine steps followed by scalar operations on extracted lanes
// (presumably LowerVecReduce; the first signature line is not visible in this
// excerpt). Returns an empty SDValue when the subtarget lacks MVE integer ops.
10059 const ARMSubtarget *ST) {
10060 if (!ST->hasMVEIntegerOps())
10061 return SDValue();
10062
10063 SDLoc dl(Op);
// Map the reduction opcode to the binary operation used to combine lanes.
10064 unsigned BaseOpcode = 0;
10065 switch (Op->getOpcode()) {
10066 default: llvm_unreachable("Expected VECREDUCE opcode");
10067 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10068 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10069 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10070 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10071 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10072 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10073 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10074 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10075 }
10076
10077 SDValue Op0 = Op->getOperand(0);
10078 EVT VT = Op0.getValueType();
10079 EVT EltVT = VT.getVectorElementType();
10080 unsigned NumElts = VT.getVectorNumElements();
10081 unsigned NumActiveLanes = NumElts;
10082
10083 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10084 NumActiveLanes == 2) &&
10085 "Only expected a power 2 vector size");
10086
10087 // Combine X with Rev(X) using BaseOpcode until 4 items remain. Going down to
10088 // 4 vector elements allows us to easily extract elements from the lanes.
10089 while (NumActiveLanes > 4) {
10090 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10091 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10092 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10093 NumActiveLanes /= 2;
10094 }
10095
10096 SDValue Res;
10097 if (NumActiveLanes == 4) {
10098 // Combine the remaining 4 partial results as (E0 op E1) op (E2 op E3).
// The partial results live at lane indices that are multiples of NumElts / 4.
10099 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10100 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10101 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10102 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10103 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10104 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10105 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10106 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10107 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10108 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10109 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10110 } else {
// Two-lane input: combine lanes 0 and 1 directly.
10111 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10112 DAG.getConstant(0, dl, MVT::i32));
10113 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10114 DAG.getConstant(1, dl, MVT::i32));
10115 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10116 }
10117
10118 // Result type may be wider than element type.
10119 if (EltVT != Op->getValueType(0))
10120 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10121 return Res;
10122}
10123
// Floating-point front end for the vector-reduction lowering (presumably
// LowerVecReduceF; the first signature line is not visible in this excerpt).
// Returns an empty SDValue without MVE float ops, otherwise defers to
// LowerVecReduce.
10125 const ARMSubtarget *ST) {
10126 if (!ST->hasMVEFloatOps())
10127 return SDValue();
10128 return LowerVecReduce(Op, DAG, ST);
10129}
10130
// Lowers an integer min/max vector reduction with NEON pairwise min/max
// intrinsics (presumably LowerVecReduceMinMax; the first signature line and
// the case labels of both switches are not visible in this excerpt).
// Returns an empty SDValue when the subtarget has no NEON.
10132 const ARMSubtarget *ST) {
10133 if (!ST->hasNEON())
10134 return SDValue();
10135
10136 SDLoc dl(Op);
10137 SDValue Op0 = Op->getOperand(0);
10138 EVT VT = Op0.getValueType();
10139 EVT EltVT = VT.getVectorElementType();
10140
// Pick the pairwise intrinsic (unsigned/signed min/max) for this reduction.
10141 unsigned PairwiseIntrinsic = 0;
10142 switch (Op->getOpcode()) {
10143 default:
10144 llvm_unreachable("Expected VECREDUCE opcode");
10146 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10147 break;
10149 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10150 break;
10152 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10153 break;
10155 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10156 break;
10157 }
10158 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10159
10160 unsigned NumElts = VT.getVectorNumElements();
10161 unsigned NumActiveLanes = NumElts;
10162
10163 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10164 NumActiveLanes == 2) &&
10165 "Only expected a power 2 vector size");
10166
10167 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10168 if (VT.is128BitVector()) {
10169 SDValue Lo, Hi;
10170 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10171 VT = Lo.getValueType();
10172 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10173 NumActiveLanes /= 2;
10174 }
10175
10176 // Use pairwise reductions until one lane remains
// Each step feeds the vector to the intrinsic as both operands.
10177 while (NumActiveLanes > 1) {
10178 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10179 NumActiveLanes /= 2;
10180 }
10181
// The final reduced value is read from lane 0.
10182 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10183 DAG.getConstant(0, dl, MVT::i32));
10184
10185 // Result type may be wider than element type.
// The extension kind (zero or sign) is chosen per reduction opcode.
10186 if (EltVT != Op.getValueType()) {
10187 unsigned Extend = 0;
10188 switch (Op->getOpcode()) {
10189 default:
10190 llvm_unreachable("Expected VECREDUCE opcode");
10193 Extend = ISD::ZERO_EXTEND;
10194 break;
10197 Extend = ISD::SIGN_EXTEND;
10198 break;
10199 }
10200 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10201 }
10202 return Res;
10203}
10204
// Body of the atomic load/store legality check (presumably
// LowerAtomicLoadStore; the signature line is not visible in this excerpt).
// Returns an empty SDValue for orderings stronger than monotonic and the
// unchanged node otherwise.
10206 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10207 // Acquire/Release load/store is not legal for targets without a dmb or
10208 // equivalent available.
10209 return SDValue();
10210
10211 // Monotonic load/store is legal for all targets.
10212 return Op;
10213}
10214
// Replaces a cycle-counter read with an arm_mrc intrinsic call (presumably
// ReplaceREADCYCLECOUNTER; the first signature lines are not visible in this
// excerpt). Two values are appended to Results: an i64 whose low half is the
// 32-bit counter and whose high half is zero, and the intrinsic's out-chain.
10217 SelectionDAG &DAG,
10218 const ARMSubtarget *Subtarget) {
10219 SDLoc DL(N);
10220 // Under Power Management extensions, the cycle-count is:
10221 // mrc p15, #0, <Rt>, c9, c13, #0
// The constants below supply the 15, 0, 9, 13, 0 fields of that instruction.
10222 SDValue Ops[] = { N->getOperand(0), // Chain
10223 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10224 DAG.getTargetConstant(15, DL, MVT::i32),
10225 DAG.getTargetConstant(0, DL, MVT::i32),
10226 DAG.getTargetConstant(9, DL, MVT::i32),
10227 DAG.getTargetConstant(13, DL, MVT::i32),
10228 DAG.getTargetConstant(0, DL, MVT::i32)
10229 };
10230
10231 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10232 DAG.getVTList(MVT::i32, MVT::Other), Ops);
// Widen the 32-bit count to i64 by pairing it with a zero high half.
10233 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10234 DAG.getConstant(0, DL, MVT::i32)));
10235 Results.push_back(Cycles32.getValue(1));
10236}
10237
// Builds a REG_SEQUENCE machine node in the GPRPair register class with V0
// placed in sub-register gsub_0 and V1 in gsub_1, and returns its untyped
// result (presumably createGPRPairNode2xi32; the first signature line is not
// visible in this excerpt).
10239 SDValue V1) {
10240 SDLoc dl(V0.getNode());
10241 SDValue RegClass =
10242 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10243 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10244 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
// Operand layout: register class, then (value, sub-register index) pairs.
10245 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10246 return SDValue(
10247 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10248}
10249
// Splits the scalar V into two i32 halves and packs them into a GPR pair via
// createGPRPairNode2xi32 (presumably createGPRPairNodei64; the signature line
// is not visible in this excerpt). On big-endian targets the halves are
// swapped before the pair is built.
10251 SDLoc dl(V.getNode());
10252 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10253 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10254 if (isBigEndian)
10255 std::swap(VLo, VHi);
10256 return createGPRPairNode2xi32(DAG, VLo, VHi);
10257}
10258
// Replaces the results of a 64-bit atomic compare-and-swap node with an
// ARM::CMP_SWAP_64 machine node (the first signature lines, including the
// function name, are not visible in this excerpt). Two values are appended to
// Results: the i64 rebuilt from result 0 of the machine node, and its chain
// (result 2).
10261 SelectionDAG &DAG) {
10262 assert(N->getValueType(0) == MVT::i64 &&
10263 "AtomicCmpSwap on types less than 64 should be legal");
// Every non-chain operand is passed as a GPR pair; the pointer is paired with
// an undef i32.
10264 SDValue Ops[] = {
10265 createGPRPairNode2xi32(DAG, N->getOperand(1),
10266 DAG.getUNDEF(MVT::i32)), // pointer, temp
10267 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10268 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10269 N->getOperand(0), // chain in
10270 };
10271 SDNode *CmpSwap = DAG.getMachineNode(
10272 ARM::CMP_SWAP_64, SDLoc(N),
10273 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10274
// Carry the original memory operand over to the new machine node.
10275 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10276 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10277
10278 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10279
// Extract the two 32-bit halves of result 0; which sub-register holds the low
// half depends on endianness.
10280 SDValue Lo =
10281 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10282 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10283 SDValue Hi =
10284 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10285 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10286 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10287 Results.push_back(SDValue(CmpSwap, 2));
10288}
10289
10290SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10291 SDLoc dl(Op);
10292 EVT VT = Op.getValueType();
10293 SDValue Chain = Op.getOperand(0);
10294 SDValue LHS = Op.getOperand(1);
10295 SDValue RHS = Op.getOperand(2);
10296 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10297 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10298
10299 // If we don't have instructions of this float type then soften to a libcall
10300 // and use SETCC instead.
10301 if (isUnsupportedFloatingType(LHS.getValueType())) {
10302 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10303 Chain, IsSignaling);
10304 if (!RHS.getNode()) {
10305 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10306 CC = ISD::SETNE;
10307 }
10308 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10309 DAG.getCondCode(CC));
10310 return DAG.getMergeValues({Result, Chain}, dl);
10311 }
10312
10313 ARMCC::CondCodes CondCode, CondCode2;
10314 FPCCToARMCC(CC, CondCode, CondCode2);
10315
10316 SDValue True = DAG.getConstant(1, dl, VT);
10317 SDValue False = DAG.getConstant(0, dl, VT);
10318 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10319 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10320 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10321 if (CondCode2 != ARMCC::AL) {
10322 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10323 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10324 }
10325 return DAG.getMergeValues({Result, Chain}, dl);
10326}
10327
10328SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10329 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10330
10331 EVT VT = getPointerTy(DAG.getDataLayout());
10332 int FI = MFI.CreateFixedObject(4, 0, false);
10333 return DAG.getFrameIndex(FI, VT);
10334}
10335
10336SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10337 SelectionDAG &DAG) const {
10338 SDLoc DL(Op);
10339 MakeLibCallOptions CallOptions;
10340 MVT SVT = Op.getOperand(0).getSimpleValueType();
10341 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10342 SDValue Res =
10343 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10344 return DAG.getBitcast(MVT::i32, Res);
10345}
10346
// Custom-lowers the three-way compares ISD::SCMP / ISD::UCMP. The result is
// computed as an i32 (-1, 0 or 1) and then sign-extended or truncated to the
// node's result type. Thumb1 unsigned compares use a subtract-with-carry
// sequence; everything else uses a flag-setting subtract (or add) followed by
// two conditional moves.
10347SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10348 SDLoc dl(Op);
10349 SDValue LHS = Op.getOperand(0);
10350 SDValue RHS = Op.getOperand(1);
10351
10352 // Determine if this is signed or unsigned comparison
10353 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10354
10355 // Special case for Thumb1 UCMP only
10356 if (!IsSigned && Subtarget->isThumb1Only()) {
10357 // For Thumb unsigned comparison, use this sequence:
10358 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10359 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10360 // cmp r1, r0 ; compare RHS with LHS
10361 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10362 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10363
10364 // First subtraction: LHS - RHS
10365 SDValue Sub1WithFlags = DAG.getNode(
10366 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10367 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10368 SDValue Flags1 = Sub1WithFlags.getValue(1);
10369
10370 // SUBE: Sub1Result - Sub1Result - !carry
10371 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10372 SDValue Sbc1 =
10373 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10374 Sub1Result, Sub1Result, Flags1);
10375 SDValue Sbc1Result = Sbc1.getValue(0);
10376
10377 // Second comparison: RHS vs LHS (reverse comparison)
10378 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10379
10380 // SUBE: RHS - RHS - !carry
10381 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10382 SDValue Sbc2 = DAG.getNode(
10383 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10384 SDValue Sbc2Result = Sbc2.getValue(0);
10385
10386 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed).
// With the values above this is -1 - 0 = -1 for LHS < RHS, 0 - (-1) = 1 for
// LHS > RHS, and 0 - 0 = 0 when the operands are equal.
10387 SDValue Result =
10388 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10389 if (Op.getValueType() != MVT::i32)
10390 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10391
10392 return Result;
10393 }
10394
10395 // Otherwise emit the equivalent of this ARM assembly pattern:
10396 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10397 // movgt r0, #1 ; if LHS > RHS, set result to 1
10398 // ; (GT for signed, HI for unsigned)
10399 // mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT signed, LO unsigned)
10400 // ; if LHS == RHS, result remains 0 from the subs
10401
10402 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10403 unsigned Opcode = ARMISD::SUBC;
10404
10405 // Check if RHS is a subtraction against 0: (0 - X)
10406 if (RHS.getOpcode() == ISD::SUB) {
10407 SDValue SubLHS = RHS.getOperand(0);
10408 SDValue SubRHS = RHS.getOperand(1);
10409
10410 // Check if it's 0 - X
10411 if (isNullConstant(SubLHS)) {
10412 bool CanUseAdd = false;
10413 if (IsSigned) {
10414 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
// NOTE(review): one line of the condition below (between computeKnownBits and
// isMinSignedValue) is not visible in this excerpt.
10415 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10417 .isMinSignedValue()) {
10418 CanUseAdd = true;
10419 }
10420 } else {
10421 // For UCMP: only if X is known to never be zero
10422 if (DAG.isKnownNeverZero(SubRHS)) {
10423 CanUseAdd = true;
10424 }
10425 }
10426
10427 if (CanUseAdd) {
10428 Opcode = ARMISD::ADDC;
10429 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10430 // LHS - (0 - X)
10431 }
10432 }
10433 }
10434
10435 // Generate the operation with flags
10436 SDValue OpWithFlags =
10437 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10438
10439 SDValue OpResult = OpWithFlags.getValue(0);
10440 SDValue Flags = OpWithFlags.getValue(1);
10441
10442 // Constants for conditional moves
10443 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10444 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10445
10446 // Select condition codes based on signed vs unsigned
10447 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10448 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10449
10450 // First conditional move: if greater than, set to 1
10451 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10452 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10453 GTCondValue, Flags);
10454
10455 // Second conditional move: if less than, set to -1
10456 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10457 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10458 LTCondValue, Flags);
10459
// Adjust the i32 result to the width the node actually produces.
10460 if (Op.getValueType() != MVT::i32)
10461 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10462
10463 return Result2;
10464}
10465
// Central dispatch for custom lowering: switches on the node's opcode and
// forwards to the matching Lower* helper (presumably
// ARMTargetLowering::LowerOperation; the signature line is not visible in
// this excerpt). Opcodes without a case hit llvm_unreachable.
// NOTE(review): several case labels are not visible in this excerpt, so a
// few return statements below appear without their labels.
10467 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10468 switch (Op.getOpcode()) {
10469 default: llvm_unreachable("Don't know how to custom lower this!");
10470 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10471 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10472 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10473 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10474 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10475 case ISD::SELECT: return LowerSELECT(Op, DAG);
10476 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10477 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10478 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10479 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10480 case ISD::VASTART: return LowerVASTART(Op, DAG);
10481 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10482 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10483 case ISD::SINT_TO_FP:
10484 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10487 case ISD::FP_TO_SINT:
10488 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10490 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10491 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10492 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10493 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10494 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10495 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10496 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10497 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10498 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10499 Subtarget);
10500 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10501 case ISD::SHL:
10502 case ISD::SRL:
10503 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10504 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10505 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10506 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10507 case ISD::SRL_PARTS:
10508 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10509 case ISD::CTTZ:
10510 case ISD::CTTZ_ZERO_POISON: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10511 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10512 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10513 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10514 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10515 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10516 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10517 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10518 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10519 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10520 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10521 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10522 case ISD::SIGN_EXTEND:
10523 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10524 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10525 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10526 case ISD::SET_FPMODE:
10527 return LowerSET_FPMODE(Op, DAG);
10528 case ISD::RESET_FPMODE:
10529 return LowerRESET_FPMODE(Op, DAG);
10530 case ISD::MUL: return LowerMUL(Op, DAG);
// Scalar divisions on Windows targets take a dedicated lowering path.
10531 case ISD::SDIV:
10532 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10533 !Op.getValueType().isVector())
10534 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10535 return LowerSDIV(Op, DAG, Subtarget);
10536 case ISD::UDIV:
10537 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10538 !Op.getValueType().isVector())
10539 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10540 return LowerUDIV(Op, DAG, Subtarget);
10541 case ISD::UADDO_CARRY:
10542 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::ADDE, false /*unsigned*/);
10543 case ISD::USUBO_CARRY:
10544 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::SUBE, false /*unsigned*/);
10545 case ISD::SADDO_CARRY:
10546 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::ADDE, true /*signed*/);
10547 case ISD::SSUBO_CARRY:
10548 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::SUBE, true /*signed*/);
10549 case ISD::UADDO:
10550 case ISD::USUBO:
10551 case ISD::UMULO:
10552 case ISD::SADDO:
10553 case ISD::SSUBO:
10554 case ISD::SMULO:
10555 return LowerALUO(Op, DAG);
10556 case ISD::SADDSAT:
10557 case ISD::SSUBSAT:
10558 case ISD::UADDSAT:
10559 case ISD::USUBSAT:
10560 return LowerADDSUBSAT(Op, DAG, Subtarget);
// Loads: MVE predicate vectors get their own lowering; everything else is
// offered to the AEABI unaligned-load helper, and an empty SDValue is returned
// if that helper produces nothing.
10561 case ISD::LOAD: {
10562 auto *LD = cast<LoadSDNode>(Op);
10563 EVT MemVT = LD->getMemoryVT();
10564 if (Subtarget->hasMVEIntegerOps() &&
10565 (MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10566 MemVT == MVT::v16i1))
10567 return LowerPredicateLoad(Op, DAG);
10568
10569 auto Pair = LowerAEABIUnalignedLoad(Op, DAG);
10570 if (Pair.first)
10571 return DAG.getMergeValues({Pair.first, Pair.second}, SDLoc(Pair.first));
10572 return SDValue();
10573 }
10574 case ISD::STORE:
10575 return LowerSTORE(Op, DAG, Subtarget);
10576 case ISD::MLOAD:
10577 return LowerMLOAD(Op, DAG);
10578 case ISD::VECREDUCE_MUL:
10579 case ISD::VECREDUCE_AND:
10580 case ISD::VECREDUCE_OR:
10581 case ISD::VECREDUCE_XOR:
10582 return LowerVecReduce(Op, DAG, Subtarget);
10587 return LowerVecReduceF(Op, DAG, Subtarget);
10592 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10593 case ISD::ATOMIC_LOAD:
10594 case ISD::ATOMIC_STORE:
10595 return LowerAtomicLoadStore(Op, DAG);
10596 case ISD::SDIVREM:
10597 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10599 if (getTargetMachine().getTargetTriple().isOSWindows())
10600 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10601 llvm_unreachable("Don't know how to custom lower this!");
10603 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10605 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10606 case ISD::STRICT_FSETCC:
10607 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10608 case ISD::SPONENTRY:
10609 return LowerSPONENTRY(Op, DAG);
10610 case ISD::FP_TO_BF16:
10611 return LowerFP_TO_BF16(Op, DAG);
10612 case ARMISD::WIN__DBZCHK: return SDValue();
10613 case ISD::UCMP:
10614 case ISD::SCMP:
10615 return LowerCMP(Op, DAG);
10616 case ISD::ABS:
10617 return LowerABS(Op, DAG);
// Strict rounding-to-integer ops on f16/bf16: extend the input to f32 with
// STRICT_FP_EXTEND, then re-emit the same opcode on the extended value, using
// the extend's chain as the new node's chain.
10618 case ISD::STRICT_LROUND:
10620 case ISD::STRICT_LRINT:
10621 case ISD::STRICT_LLRINT: {
10622 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10623 Op.getOperand(1).getValueType() == MVT::bf16) &&
10624 "Expected custom lowering of rounding operations only for f16");
10625 SDLoc DL(Op);
10626 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10627 {Op.getOperand(0), Op.getOperand(1)});
10628 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10629 {Ext.getValue(1), Ext.getValue(0)});
10630 }
10631 }
10632}
10633
// Replaces the i64 result of the arm_smlald / arm_smlaldx / arm_smlsld /
// arm_smlsldx intrinsics with the matching ARMISD node that produces two i32
// results (presumably ReplaceLongIntrinsic; the first signature line is not
// visible in this excerpt). Any other intrinsic leaves Results untouched.
10635 SelectionDAG &DAG) {
// Operand 0 of the node holds the intrinsic ID.
10636 unsigned IntNo = N->getConstantOperandVal(0);
10637 unsigned Opc = 0;
10638 if (IntNo == Intrinsic::arm_smlald)
10639 Opc = ARMISD::SMLALD;
10640 else if (IntNo == Intrinsic::arm_smlaldx)
10641 Opc = ARMISD::SMLALDX;
10642 else if (IntNo == Intrinsic::arm_smlsld)
10643 Opc = ARMISD::SMLSLD;
10644 else if (IntNo == Intrinsic::arm_smlsldx)
10645 Opc = ARMISD::SMLSLDX;
10646 else
10647 return;
10648
10649 SDLoc dl(N);
// Operand 3 is split into two i32 halves that become the node's last two
// operands.
10650 SDValue Lo, Hi;
10651 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10652
10653 SDValue LongMul = DAG.getNode(Opc, dl,
10654 DAG.getVTList(MVT::i32, MVT::i32),
10655 N->getOperand(1), N->getOperand(2),
10656 Lo, Hi);
// Reassemble the two i32 results into the i64 value callers expect.
10657 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10658 LongMul.getValue(0), LongMul.getValue(1)));
10659}
10660
10661/// ReplaceNodeResults - Replace the results of a node that has an illegal
10662/// result type with new values built out of custom code.
// Replacement values are appended to Results. Cases that produce a single
// value store it in Res, which is pushed after the switch if it is non-null;
// cases that fill Results themselves return directly.
// NOTE(review): the signature and several case labels/bodies are not visible
// in this excerpt.
10665 SelectionDAG &DAG) const {
10666 SDValue Res;
10667 switch (N->getOpcode()) {
10668 default:
10669 llvm_unreachable("Don't know how to custom expand this!");
10670 case ISD::READ_REGISTER:
10672 break;
10673 case ISD::BITCAST:
10674 Res = ExpandBITCAST(N, DAG, Subtarget);
10675 break;
10676 case ISD::SRL:
10677 case ISD::SRA:
10678 case ISD::SHL:
10679 Res = Expand64BitShift(N, DAG, Subtarget);
10680 break;
10681 case ISD::SREM:
10682 case ISD::UREM:
10683 Res = LowerREM(N, DAG);
10684 break;
// DivRem yields two values; both are pushed here and the function returns
// without reaching the common tail.
10685 case ISD::SDIVREM:
10686 case ISD::UDIVREM:
10687 Res = LowerDivRem(SDValue(N, 0), DAG);
10688 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10689 Results.push_back(Res.getValue(0));
10690 Results.push_back(Res.getValue(1));
10691 return;
10692 case ISD::SADDSAT:
10693 case ISD::SSUBSAT:
10694 case ISD::UADDSAT:
10695 case ISD::USUBSAT:
10696 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10697 break;
10699 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10700 return;
10701 case ISD::UDIV:
10702 case ISD::SDIV:
10703 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
10704 "can only expand DIV on Windows");
10705 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10706 Results);
10709 return;
10711 return ReplaceLongIntrinsic(N, Results, DAG);
// LowerLOAD appends to Results itself; Res stays empty for this case.
10712 case ISD::LOAD:
10713 LowerLOAD(N, Results, DAG);
10714 break;
10715 case ISD::STORE:
10716 Res = LowerAEABIUnalignedStore(SDValue(N, 0), DAG);
10717 break;
10718 case ISD::TRUNCATE:
10719 Res = LowerTruncate(N, DAG, Subtarget);
10720 break;
10721 case ISD::SIGN_EXTEND:
10722 case ISD::ZERO_EXTEND:
10723 Res = LowerVectorExtend(N, DAG, Subtarget);
10724 break;
10727 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10728 break;
10729 }
// Common tail: publish the single replacement value, if one was produced.
10730 if (Res.getNode())
10731 Results.push_back(Res);
10732}
10733
10734//===----------------------------------------------------------------------===//
10735// ARM Scheduler Hooks
10736//===----------------------------------------------------------------------===//
10737
10738/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10739/// registers the function context.
// The visible code loads the address of DispatchBB from a constant-pool entry,
// makes it PC-relative with a PICADD, and stores it at offset 36 from frame
// index FI. Separate instruction sequences are emitted for Thumb2, Thumb1 and
// ARM mode; all instructions are inserted before MI.
// NOTE(review): many lines (the MBB parameter, the declarations of MCP, AFI
// and CPV, the memory-operand initializers and several builder calls) are not
// visible in this excerpt.
10740void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10742 MachineBasicBlock *DispatchBB,
10743 int FI) const {
10744 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10745 "ROPI/RWPI not currently supported with SjLj");
10746 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10747 DebugLoc dl = MI.getDebugLoc();
10748 MachineFunction *MF = MBB->getParent();
10749 MachineRegisterInfo *MRI = &MF->getRegInfo();
10752 const Function &F = MF->getFunction();
10753
10754 bool isThumb = Subtarget->isThumb();
10755 bool isThumb2 = Subtarget->isThumb2();
10756
// The PC adjustment passed to the constant-pool entry is 4 in Thumb modes and
// 8 in ARM mode.
10757 unsigned PCLabelId = AFI->createPICLabelUId();
10758 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10760 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10761 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10762
10763 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10764 : &ARM::GPRRegClass;
10765
10766 // Grab constant pool and fixed stack memory operands.
10767 MachineMemOperand *CPMMO =
10770
10771 MachineMemOperand *FIMMOSt =
10774
10775 // Load the address of the dispatch MBB into the jump buffer.
// NOTE(review): the assembly sketches below show the store at [$jbuf, #+4],
// while the code stores at offset 36 from FI; presumably jbuf starts at offset
// 32 within the function context - confirm.
10776 if (isThumb2) {
10777 // Incoming value: jbuf
10778 // ldr.n r5, LCPI1_1
10779 // orr r5, r5, #1
10780 // add r5, pc
10781 // str r5, [$jbuf, #+4] ; &jbuf[1]
10782 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10783 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10785 .addMemOperand(CPMMO)
10787 // Set the low bit because of thumb mode.
10788 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10789 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10790 .addReg(NewVReg1, RegState::Kill)
10791 .addImm(0x01)
10793 .add(condCodeOp());
10794 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10795 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10796 .addReg(NewVReg2, RegState::Kill)
10797 .addImm(PCLabelId);
10798 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10799 .addReg(NewVReg3, RegState::Kill)
10800 .addFrameIndex(FI)
10801 .addImm(36) // &jbuf[1] :: pc
10802 .addMemOperand(FIMMOSt)
10804 } else if (isThumb) {
10805 // Incoming value: jbuf
10806 // ldr.n r1, LCPI1_4
10807 // add r1, pc
10808 // mov r2, #1
10809 // orrs r1, r2
10810 // add r2, $jbuf, #+4 ; &jbuf[1]
10811 // str r1, [r2]
10812 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10813 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10815 .addMemOperand(CPMMO)
10817 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10818 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10819 .addReg(NewVReg1, RegState::Kill)
10820 .addImm(PCLabelId);
10821 // Set the low bit because of thumb mode.
10822 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10823 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10824 .addReg(ARM::CPSR, RegState::Define)
10825 .addImm(1)
10827 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10828 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10829 .addReg(ARM::CPSR, RegState::Define)
10830 .addReg(NewVReg2, RegState::Kill)
10831 .addReg(NewVReg3, RegState::Kill)
// Thumb1 materializes the destination address in a register first and then
// stores through it with a zero immediate offset.
10833 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10834 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10835 .addFrameIndex(FI)
10836 .addImm(36); // &jbuf[1] :: pc
10837 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10838 .addReg(NewVReg4, RegState::Kill)
10839 .addReg(NewVReg5, RegState::Kill)
10840 .addImm(0)
10841 .addMemOperand(FIMMOSt)
10843 } else {
10844 // Incoming value: jbuf
10845 // ldr r1, LCPI1_1
10846 // add r1, pc, r1
10847 // str r1, [$jbuf, #+4] ; &jbuf[1]
10848 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10849 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10851 .addImm(0)
10852 .addMemOperand(CPMMO)
10854 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10855 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10856 .addReg(NewVReg1, RegState::Kill)
10857 .addImm(PCLabelId)
10859 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10860 .addReg(NewVReg2, RegState::Kill)
10861 .addFrameIndex(FI)
10862 .addImm(36) // &jbuf[1] :: pc
10863 .addMemOperand(FIMMOSt)
10865 }
10866}
10867
// Build the SjLj exception dispatch code for the function containing MBB.
//
// Steps visible in this function:
//  1. Group every EH-pad block by the call-site numbers registered for its
//     EH_LABEL and flatten them, in call-site order, into a jump table.
//  2. Create DispatchBB (marked as an EH pad), DispContBB and TrapBB, append
//     them to the function and call SetupEntryBlockForSjLj() with DispatchBB.
//  3. DispatchBB loads a value from the function-context frame slot FI,
//     compares it against the number of landing pads and conditionally
//     branches to TrapBB; DispContBB uses that value to index the jump table.
//  4. Every predecessor of a landing pad loses its EH-pad successors and gets
//     DispatchBB (with zero probability) instead. The former landing pads are
//     then un-marked as EH pads and MI is erased.
10868void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10869 MachineBasicBlock *MBB) const {
10870 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10871 DebugLoc dl = MI.getDebugLoc();
10872 MachineFunction *MF = MBB->getParent();
10873 MachineRegisterInfo *MRI = &MF->getRegInfo();
10874 MachineFrameInfo &MFI = MF->getFrameInfo();
10875 int FI = MFI.getFunctionContextIndex();
10876
// Virtual registers created below use tGPR in any Thumb mode, GPRnopc otherwise.
10877 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10878 : &ARM::GPRnopcRegClass;
10879
10880 // Get a mapping of the call site numbers to all of the landing pads they're
10881 // associated with.
10882 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10883 unsigned MaxCSNum = 0;
10884 for (MachineBasicBlock &BB : *MF) {
10885 if (!BB.isEHPad())
10886 continue;
10887
10888 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10889 // pad.
10890 for (MachineInstr &II : BB) {
10891 if (!II.isEHLabel())
10892 continue;
10893
10894 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10895 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10896
10897 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10898 for (unsigned Idx : CallSiteIdxs) {
10899 CallSiteNumToLPad[Idx].push_back(&BB);
10900 MaxCSNum = std::max(MaxCSNum, Idx);
10901 }
// Only the first EH_LABEL with registered call sites is used for this block.
10902 break;
10903 }
10904 }
10905
10906 // Get an ordered list of the machine basic blocks for the jump table.
10907 std::vector<MachineBasicBlock*> LPadList;
10908 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10909 LPadList.reserve(CallSiteNumToLPad.size());
// Walk call-site numbers 1..MaxCSNum in order; the predecessors of each
// landing pad are recorded as the blocks whose successors get rewritten below.
10910 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10911 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10912 for (MachineBasicBlock *MBB : MBBList) {
10913 LPadList.push_back(MBB);
10914 InvokeBBs.insert_range(MBB->predecessors());
10915 }
10916 }
10917
10918 assert(!LPadList.empty() &&
10919 "No landing pad destinations for the dispatch jump table!");
10920
10921 // Create the jump table and associated information.
10922 MachineJumpTableInfo *JTI =
10923 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10924 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10925
10926 // Create the MBBs for the dispatch code.
10927
10928 // Shove the dispatch's address into the return slot in the function context.
10929 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10930 DispatchBB->setIsEHPad();
10931
// TrapBB holds a single trap instruction and is the target of the
// conditional branch emitted after the landing-pad-count compare.
10932 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10933
10934 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10935 DispatchBB->addSuccessor(TrapBB);
10936
10937 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10938 DispatchBB->addSuccessor(DispContBB);
10939
10940 // Insert the new MBBs at the end of the function.
10941 MF->insert(MF->end(), DispatchBB);
10942 MF->insert(MF->end(), DispContBB);
10943 MF->insert(MF->end(), TrapBB);
10944
10945 // Insert code into the entry block that creates and registers the function
10946 // context.
10947 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10948
10949 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10952
10953 MachineInstrBuilder MIB;
10954 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10955
10956 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10957 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10958
10959 // Add a register mask with no preserved registers. This results in all
10960 // registers being marked as clobbered. This can't work if the dispatch block
10961 // is in a Thumb1 function and is linked with ARM code which uses the FP
10962 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10964
10965 bool IsPositionIndependent = isPositionIndependent();
10966 unsigned NumLPads = LPadList.size();
// Thumb2 sequence: load the value from the function context, compare it with
// the landing pad count (immediate form below 256, otherwise MOVW/MOVT into a
// register), branch to TrapBB, then compute the jump-table target.
10967 if (Subtarget->isThumb2()) {
10968 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10969 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10970 .addFrameIndex(FI)
10971 .addImm(4)
10972 .addMemOperand(FIMMOLd)
10974
10975 if (NumLPads < 256) {
10976 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10977 .addReg(NewVReg1)
10978 .addImm(LPadList.size())
10980 } else {
10981 Register VReg1 = MRI->createVirtualRegister(TRC);
10982 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10983 .addImm(NumLPads & 0xFFFF)
10985
// The upper half is only materialized when it is non-zero.
10986 unsigned VReg2 = VReg1;
10987 if ((NumLPads & 0xFFFF0000) != 0) {
10988 VReg2 = MRI->createVirtualRegister(TRC);
10989 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10990 .addReg(VReg1)
10991 .addImm(NumLPads >> 16)
10993 }
10994
10995 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10996 .addReg(NewVReg1)
10997 .addReg(VReg2)
10999 }
11000
11001 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11002 .addMBB(TrapBB)
11004 .addReg(ARM::CPSR);
11005
11006 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11007 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11008 .addJumpTableIndex(MJTI)
11010
11011 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11012 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11013 .addReg(NewVReg3, RegState::Kill)
11014 .addReg(NewVReg1)
11017 .add(condCodeOp());
11018
11019 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11020 .addReg(NewVReg4, RegState::Kill)
11021 .addReg(NewVReg1)
11022 .addJumpTableIndex(MJTI);
// Thumb (non-Thumb2) sequence: large landing pad counts come from the
// constant pool; the index is scaled with an explicit LSL #2, added to the
// jump-table base, the entry is loaded, and for position-independent code the
// table base is added to the loaded entry before the indirect branch.
11023 } else if (Subtarget->isThumb()) {
11024 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11025 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11026 .addFrameIndex(FI)
11027 .addImm(1)
11028 .addMemOperand(FIMMOLd)
11030
11031 if (NumLPads < 256) {
11032 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11033 .addReg(NewVReg1)
11034 .addImm(NumLPads)
11036 } else {
11037 MachineConstantPool *ConstantPool = MF->getConstantPool();
11038 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11039 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11040
11041 // MachineConstantPool wants an explicit alignment.
11042 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11043 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11044
11045 Register VReg1 = MRI->createVirtualRegister(TRC);
11046 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11047 .addReg(VReg1, RegState::Define)
11050 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11051 .addReg(NewVReg1)
11052 .addReg(VReg1)
11054 }
11055
11056 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11057 .addMBB(TrapBB)
11059 .addReg(ARM::CPSR);
11060
11061 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11062 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11063 .addReg(ARM::CPSR, RegState::Define)
11064 .addReg(NewVReg1)
11065 .addImm(2)
11067
11068 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11069 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11070 .addJumpTableIndex(MJTI)
11072
11073 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11074 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11075 .addReg(ARM::CPSR, RegState::Define)
11076 .addReg(NewVReg2, RegState::Kill)
11077 .addReg(NewVReg3)
11079
11080 MachineMemOperand *JTMMOLd =
11081 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11083
11084 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11085 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11086 .addReg(NewVReg4, RegState::Kill)
11087 .addImm(0)
11088 .addMemOperand(JTMMOLd)
11090
11091 unsigned NewVReg6 = NewVReg5;
11092 if (IsPositionIndependent) {
11093 NewVReg6 = MRI->createVirtualRegister(TRC);
11094 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11095 .addReg(ARM::CPSR, RegState::Define)
11096 .addReg(NewVReg5, RegState::Kill)
11097 .addReg(NewVReg3)
11099 }
11100
11101 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11102 .addReg(NewVReg6, RegState::Kill)
11103 .addJumpTableIndex(MJTI);
// ARM sequence: the landing pad count is compared as an immediate, via
// MOVi16/MOVTi16 when v6T2 ops are available and the count fits in 16 bits,
// or via a constant-pool load otherwise.
11104 } else {
11105 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11106 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11107 .addFrameIndex(FI)
11108 .addImm(4)
11109 .addMemOperand(FIMMOLd)
11111
11112 if (NumLPads < 256) {
11113 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11114 .addReg(NewVReg1)
11115 .addImm(NumLPads)
11117 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11118 Register VReg1 = MRI->createVirtualRegister(TRC);
11119 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11120 .addImm(NumLPads & 0xFFFF)
11122
// NOTE(review): this branch is guarded by isUInt<16>(NumLPads), so the
// upper-half test below looks like it can never be true here - confirm.
11123 unsigned VReg2 = VReg1;
11124 if ((NumLPads & 0xFFFF0000) != 0) {
11125 VReg2 = MRI->createVirtualRegister(TRC);
11126 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11127 .addReg(VReg1)
11128 .addImm(NumLPads >> 16)
11130 }
11131
11132 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11133 .addReg(NewVReg1)
11134 .addReg(VReg2)
11136 } else {
11137 MachineConstantPool *ConstantPool = MF->getConstantPool();
11138 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11139 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11140
11141 // MachineConstantPool wants an explicit alignment.
11142 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11143 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11144
11145 Register VReg1 = MRI->createVirtualRegister(TRC);
11146 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11147 .addReg(VReg1, RegState::Define)
11149 .addImm(0)
11151 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11152 .addReg(NewVReg1)
11153 .addReg(VReg1, RegState::Kill)
11155 }
11156
11157 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11158 .addMBB(TrapBB)
11160 .addReg(ARM::CPSR);
11161
11162 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11163 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11164 .addReg(NewVReg1)
11167 .add(condCodeOp());
11168 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11169 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11170 .addJumpTableIndex(MJTI)
11172
11173 MachineMemOperand *JTMMOLd =
11174 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11176 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11177 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11178 .addReg(NewVReg3, RegState::Kill)
11179 .addReg(NewVReg4)
11180 .addImm(0)
11181 .addMemOperand(JTMMOLd)
11183
// Position-independent code branches through BR_JTadd, which also takes the
// jump-table base register; otherwise the loaded entry is used directly.
11184 if (IsPositionIndependent) {
11185 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11186 .addReg(NewVReg5, RegState::Kill)
11187 .addReg(NewVReg4)
11188 .addJumpTableIndex(MJTI);
11189 } else {
11190 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11191 .addReg(NewVReg5, RegState::Kill)
11192 .addJumpTableIndex(MJTI);
11193 }
11194 }
11195
11196 // Add the jump table entries as successors to the MBB.
// SeenMBBs filters duplicates so each landing pad is added as a successor once.
11197 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11198 for (MachineBasicBlock *CurMBB : LPadList) {
11199 if (SeenMBBs.insert(CurMBB).second)
11200 DispContBB->addSuccessor(CurMBB);
11201 }
11202
11203 // N.B. the order the invoke BBs are processed in doesn't matter here.
11204 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11206 for (MachineBasicBlock *BB : InvokeBBs) {
11207
11208 // Remove the landing pad successor from the invoke block and replace it
11209 // with the new dispatch block.
// A copy of the successor list is iterated because removeSuccessor() mutates
// the block's own list.
11210 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11211 while (!Successors.empty()) {
11212 MachineBasicBlock *SMBB = Successors.pop_back_val();
11213 if (SMBB->isEHPad()) {
11214 BB->removeSuccessor(SMBB);
11215 MBBLPads.push_back(SMBB);
11216 }
11217 }
11218
11219 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11220 BB->normalizeSuccProbs();
11221
11222 // Find the invoke call and mark all of the callee-saved registers as
11223 // 'implicit defined' so that they're spilled. This prevents code from
11224 // moving instructions to before the EH block, where they will never be
11225 // executed.
11227 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11228 if (!II->isCall()) continue;
11229
// Collect every register operand already present on the call.
11230 DenseSet<unsigned> DefRegs;
11232 OI = II->operands_begin(), OE = II->operands_end();
11233 OI != OE; ++OI) {
11234 if (!OI->isReg()) continue;
11235 DefRegs.insert(OI->getReg());
11236 }
11237
11238 MachineInstrBuilder MIB(*MF, &*II);
11239
// SavedRegs is a zero-terminated list; registers outside the class usable in
// the current instruction set mode are skipped.
11240 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11241 unsigned Reg = SavedRegs[i];
11242 if (Subtarget->isThumb2() &&
11243 !ARM::tGPRRegClass.contains(Reg) &&
11244 !ARM::hGPRRegClass.contains(Reg))
11245 continue;
11246 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11247 continue;
11248 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11249 continue;
11250 if (!DefRegs.contains(Reg))
11252 }
11253
// Only the last call in the block (first found scanning backwards) is handled.
11254 break;
11255 }
11256 }
11257
11258 // Mark all former landing pads as non-landing pads. The dispatch is the only
11259 // landing pad now.
11260 for (MachineBasicBlock *MBBLPad : MBBLPads)
11261 MBBLPad->setIsEHPad(false);
11262
11263 // The instruction is gone now.
11264 MI.eraseFromParent();
11265}
11266
// Return the first successor of MBB that is different from Succ. Reaching the
// end of the successor list without finding one is treated as unreachable.
11267static
11269 for (MachineBasicBlock *S : MBB->successors())
11270 if (S != Succ)
11271 return S;
11272 llvm_unreachable("Expecting a BB with two successors!");
11273}
11274
11275/// Return the load opcode for a given load size. If load size >= 8,
11276/// neon opcode will be returned.
11277static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11278 if (LdSize >= 8)
11279 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11280 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11281 if (IsThumb1)
11282 return LdSize == 4 ? ARM::tLDRi
11283 : LdSize == 2 ? ARM::tLDRHi
11284 : LdSize == 1 ? ARM::tLDRBi : 0;
11285 if (IsThumb2)
11286 return LdSize == 4 ? ARM::t2LDR_POST
11287 : LdSize == 2 ? ARM::t2LDRH_POST
11288 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11289 return LdSize == 4 ? ARM::LDR_POST_IMM
11290 : LdSize == 2 ? ARM::LDRH_POST
11291 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11292}
11293
11294/// Return the store opcode for a given store size. If store size >= 8,
11295/// neon opcode will be returned.
11296static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11297 if (StSize >= 8)
11298 return StSize == 16 ? ARM::VST1q32wb_fixed
11299 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11300 if (IsThumb1)
11301 return StSize == 4 ? ARM::tSTRi
11302 : StSize == 2 ? ARM::tSTRHi
11303 : StSize == 1 ? ARM::tSTRBi : 0;
11304 if (IsThumb2)
11305 return StSize == 4 ? ARM::t2STR_POST
11306 : StSize == 2 ? ARM::t2STRH_POST
11307 : StSize == 1 ? ARM::t2STRB_POST : 0;
11308 return StSize == 4 ? ARM::STR_POST_IMM
11309 : StSize == 2 ? ARM::STRH_POST
11310 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11311}
11312
11313/// Emit a post-increment load operation with given size. The instructions
11314/// will be added to BB at Pos.
///
/// Data receives the loaded value, AddrIn is the address register that is
/// read and AddrOut is the register defined with the updated address. The
/// opcode comes from getLdOpcode(LdSize, IsThumb1, IsThumb2) and must be
/// non-zero. The operand layout depends on the variant:
///  - LdSize >= 8: AddrOut is an extra def, followed by AddrIn and imm 0.
///  - Thumb1: a plain load from AddrIn with offset 0, followed by a separate
///    tADDi8 that computes AddrOut from AddrIn and LdSize.
///  - Thumb2: AddrOut is an extra def, followed by AddrIn and imm LdSize.
///  - ARM: as Thumb2, with an additional register operand of 0 before the
///    immediate.
11316 const TargetInstrInfo *TII, const DebugLoc &dl,
11317 unsigned LdSize, unsigned Data, unsigned AddrIn,
11318 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11319 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11320 assert(LdOpc != 0 && "Should have a load opcode");
11321 if (LdSize >= 8) {
11322 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11323 .addReg(AddrOut, RegState::Define)
11324 .addReg(AddrIn)
11325 .addImm(0)
11327 } else if (IsThumb1) {
11328 // load + update AddrIn
11329 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11330 .addReg(AddrIn)
11331 .addImm(0)
11333 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11334 .add(t1CondCodeOp())
11335 .addReg(AddrIn)
11336 .addImm(LdSize)
11338 } else if (IsThumb2) {
11339 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11340 .addReg(AddrOut, RegState::Define)
11341 .addReg(AddrIn)
11342 .addImm(LdSize)
11344 } else { // arm
11345 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11346 .addReg(AddrOut, RegState::Define)
11347 .addReg(AddrIn)
11348 .addReg(0)
11349 .addImm(LdSize)
11351 }
11352}
11353
11354/// Emit a post-increment store operation with given size. The instructions
11355/// will be added to BB at Pos.
///
/// Data is the register being stored, AddrIn is the address register that is
/// read and AddrOut is the register defined with the updated address. The
/// opcode comes from getStOpcode(StSize, IsThumb1, IsThumb2) and must be
/// non-zero. The operand layout depends on the variant:
///  - StSize >= 8: AddrOut is the def, followed by AddrIn, imm 0 and Data.
///  - Thumb1: a plain store of Data to AddrIn with offset 0, followed by a
///    separate tADDi8 that computes AddrOut from AddrIn and StSize.
///  - Thumb2: AddrOut is the def, followed by Data, AddrIn and imm StSize.
///  - ARM: as Thumb2, with an additional register operand of 0 before the
///    immediate.
11357 const TargetInstrInfo *TII, const DebugLoc &dl,
11358 unsigned StSize, unsigned Data, unsigned AddrIn,
11359 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11360 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11361 assert(StOpc != 0 && "Should have a store opcode");
11362 if (StSize >= 8) {
11363 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11364 .addReg(AddrIn)
11365 .addImm(0)
11366 .addReg(Data)
11368 } else if (IsThumb1) {
11369 // store + update AddrIn
11370 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11371 .addReg(Data)
11372 .addReg(AddrIn)
11373 .addImm(0)
11375 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11376 .add(t1CondCodeOp())
11377 .addReg(AddrIn)
11378 .addImm(StSize)
11380 } else if (IsThumb2) {
11381 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11382 .addReg(Data)
11383 .addReg(AddrIn)
11384 .addImm(StSize)
11386 } else { // arm
11387 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11388 .addReg(Data)
11389 .addReg(AddrIn)
11390 .addReg(0)
11391 .addImm(StSize)
11393 }
11394}
11395
// Expand the struct-byval copy pseudo MI into real load/store instructions.
//
// Sizes up to Subtarget->getMaxInlineSizeThreshold() are copied with an
// unrolled sequence inserted before MI, and the original block is returned.
// Larger sizes are copied by a new loop block followed by an exit block that
// receives the rest of BB and the byte-wise epilogue; in that case the exit
// block is returned. MI is erased in both cases.
11397ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11398 MachineBasicBlock *BB) const {
11399 // Operands read from this pseudo instruction: 0 = dst, 1 = src, 2 = size,
// 3 = alignment (the last two are immediates).
11400 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11401 // Otherwise, we will generate unrolled scalar copies.
11402 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11403 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11405
11406 Register dest = MI.getOperand(0).getReg();
11407 Register src = MI.getOperand(1).getReg();
11408 unsigned SizeVal = MI.getOperand(2).getImm();
11409 unsigned Alignment = MI.getOperand(3).getImm();
11410 DebugLoc dl = MI.getDebugLoc();
11411
11412 MachineFunction *MF = BB->getParent();
11413 MachineRegisterInfo &MRI = MF->getRegInfo();
11414 unsigned UnitSize = 0;
11415 const TargetRegisterClass *TRC = nullptr;
11416 const TargetRegisterClass *VecTRC = nullptr;
11417
11418 bool IsThumb1 = Subtarget->isThumb1Only();
11419 bool IsThumb2 = Subtarget->isThumb2();
11420 bool IsThumb = Subtarget->isThumb();
11421
// Choose the copy unit from the alignment: bit 0 set -> 1 byte, bit 1 set ->
// 2 bytes, otherwise 16 or 8 bytes with NEON when permitted, else 4 bytes.
11422 if (Alignment & 1) {
11423 UnitSize = 1;
11424 } else if (Alignment & 2) {
11425 UnitSize = 2;
11426 } else {
11427 // Check whether we can use NEON instructions.
11428 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11429 Subtarget->hasNEON()) {
11430 if ((Alignment % 16 == 0) && SizeVal >= 16)
11431 UnitSize = 16;
11432 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11433 UnitSize = 8;
11434 }
11435 // Can't use NEON instructions.
11436 if (UnitSize == 0)
11437 UnitSize = 4;
11438 }
11439
11440 // Select the correct opcode and register class for unit size load/store
11441 bool IsNeon = UnitSize >= 8;
11442 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11443 if (IsNeon)
11444 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11445 : UnitSize == 8 ? &ARM::DPRRegClass
11446 : nullptr;
11447
// LoopSize is the part copied in whole units; BytesLeft is the remainder that
// is copied one byte at a time.
11448 unsigned BytesLeft = SizeVal % UnitSize;
11449 unsigned LoopSize = SizeVal - BytesLeft;
11450
11451 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11452 // Use LDR and STR to copy.
11453 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11454 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11455 unsigned srcIn = src;
11456 unsigned destIn = dest;
11457 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11458 Register srcOut = MRI.createVirtualRegister(TRC);
11459 Register destOut = MRI.createVirtualRegister(TRC);
11460 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11461 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11462 IsThumb1, IsThumb2);
11463 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11464 IsThumb1, IsThumb2);
// Chain the updated addresses into the next copy.
11465 srcIn = srcOut;
11466 destIn = destOut;
11467 }
11468
11469 // Handle the leftover bytes with LDRB and STRB.
11470 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11471 // [destOut] = STRB_POST(scratch, destIn, 1)
11472 for (unsigned i = 0; i < BytesLeft; i++) {
11473 Register srcOut = MRI.createVirtualRegister(TRC);
11474 Register destOut = MRI.createVirtualRegister(TRC);
11475 Register scratch = MRI.createVirtualRegister(TRC);
11476 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11477 IsThumb1, IsThumb2);
11478 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11479 IsThumb1, IsThumb2);
11480 srcIn = srcOut;
11481 destIn = destOut;
11482 }
11483 MI.eraseFromParent(); // The instruction is gone now.
11484 return BB;
11485 }
11486
11487 // Expand the pseudo op to a loop.
11488 // thisMBB:
11489 // ...
11490 // movw varEnd, # --> with thumb2
11491 // movt varEnd, #
11492 // ldrcp varEnd, idx --> without thumb2
11493 // fallthrough --> loopMBB
11494 // loopMBB:
11495 // PHI varPhi, varEnd, varLoop
11496 // PHI srcPhi, src, srcLoop
11497 // PHI destPhi, dst, destLoop
11498 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11499 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11500 // subs varLoop, varPhi, #UnitSize
11501 // bne loopMBB
11502 // fallthrough --> exitMBB
11503 // exitMBB:
11504 // epilogue to handle left-over bytes
11505 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11506 // [destOut] = STRB_POST(scratch, destLoop, 1)
11507 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11508 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11509 MF->insert(It, loopMBB);
11510 MF->insert(It, exitMBB);
11511
11512 // Set the call frame size on entry to the new basic blocks.
11513 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11514 loopMBB->setCallFrameSize(CallFrameSize);
11515 exitMBB->setCallFrameSize(CallFrameSize);
11516
11517 // Transfer the remainder of BB and its successor edges to exitMBB.
11518 exitMBB->splice(exitMBB->begin(), BB,
11519 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11521
11522 // Load an immediate to varEnd.
// Three strategies: a 32-bit move pseudo when movt is usable, tMOVi32imm for
// execute-only code, and otherwise a constant-pool load.
11523 Register varEnd = MRI.createVirtualRegister(TRC);
11524 if (Subtarget->useMovt()) {
11525 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11526 varEnd)
11527 .addImm(LoopSize);
11528 } else if (Subtarget->genExecuteOnly()) {
11529 assert(IsThumb && "Non-thumb expected to have used movt");
11530 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11531 } else {
11532 MachineConstantPool *ConstantPool = MF->getConstantPool();
11534 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11535
11536 // MachineConstantPool wants an explicit alignment.
11537 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11538 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11539 MachineMemOperand *CPMMO =
11542
11543 if (IsThumb)
11544 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11545 .addReg(varEnd, RegState::Define)
11548 .addMemOperand(CPMMO);
11549 else
11550 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11551 .addReg(varEnd, RegState::Define)
11553 .addImm(0)
11555 .addMemOperand(CPMMO);
11556 }
11557 BB->addSuccessor(loopMBB);
11558
11559 // Generate the loop body:
11560 // varPhi = PHI(varLoop, varEnd)
11561 // srcPhi = PHI(srcLoop, src)
11562 // destPhi = PHI(destLoop, dst)
11563 MachineBasicBlock *entryBB = BB;
11564 BB = loopMBB;
11565 Register varLoop = MRI.createVirtualRegister(TRC);
11566 Register varPhi = MRI.createVirtualRegister(TRC);
11567 Register srcLoop = MRI.createVirtualRegister(TRC);
11568 Register srcPhi = MRI.createVirtualRegister(TRC);
11569 Register destLoop = MRI.createVirtualRegister(TRC);
11570 Register destPhi = MRI.createVirtualRegister(TRC);
11571
// Each PHI merges the value from the loop back-edge with the value coming
// from the entry block.
11572 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11573 .addReg(varLoop).addMBB(loopMBB)
11574 .addReg(varEnd).addMBB(entryBB);
11575 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11576 .addReg(srcLoop).addMBB(loopMBB)
11577 .addReg(src).addMBB(entryBB);
11578 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11579 .addReg(destLoop).addMBB(loopMBB)
11580 .addReg(dest).addMBB(entryBB);
11581
11582 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11583 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11584 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11585 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11586 IsThumb1, IsThumb2);
11587 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11588 IsThumb1, IsThumb2);
11589
11590 // Decrement loop variable by UnitSize.
11591 if (IsThumb1) {
11592 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11593 .add(t1CondCodeOp())
11594 .addReg(varPhi)
11595 .addImm(UnitSize)
11597 } else {
11598 MachineInstrBuilder MIB =
11599 BuildMI(*BB, BB->end(), dl,
11600 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11601 MIB.addReg(varPhi)
11602 .addImm(UnitSize)
11604 .add(condCodeOp());
// Rewrite operand 5 (added by condCodeOp() above) into a CPSR def so the
// conditional branch below can test the result of the subtraction.
11605 MIB->getOperand(5).setReg(ARM::CPSR);
11606 MIB->getOperand(5).setIsDef(true);
11607 }
11608 BuildMI(*BB, BB->end(), dl,
11609 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11610 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11611
11612 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11613 BB->addSuccessor(loopMBB);
11614 BB->addSuccessor(exitMBB);
11615
11616 // Add epilogue to handle BytesLeft.
11617 BB = exitMBB;
// The epilogue is inserted in front of the instructions spliced over from the
// original block.
11618 auto StartOfExit = exitMBB->begin();
11619
11620 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11621 // [destOut] = STRB_POST(scratch, destLoop, 1)
11622 unsigned srcIn = srcLoop;
11623 unsigned destIn = destLoop;
11624 for (unsigned i = 0; i < BytesLeft; i++) {
11625 Register srcOut = MRI.createVirtualRegister(TRC);
11626 Register destOut = MRI.createVirtualRegister(TRC);
11627 Register scratch = MRI.createVirtualRegister(TRC);
11628 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11629 IsThumb1, IsThumb2);
11630 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11631 IsThumb1, IsThumb2);
11632 srcIn = srcOut;
11633 destIn = destOut;
11634 }
11635
11636 MI.eraseFromParent(); // The instruction is gone now.
11637 return BB;
11638}
11639
// Expand the stack-probe pseudo MI for Windows on ARM (Thumb2 only, both
// enforced by the asserts below).
//
// The STACK_PROBE libcall is referenced according to the code model: a direct
// tBL for Small/Medium/Kernel, or the symbol address materialized into a
// register with t2MOVi32imm for Large. Afterwards SP is decremented by R4 with
// t2SUBrr, MI is erased and MBB is returned. A fatal usage error is reported
// when no implementation of the libcall is available.
11641ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11642 MachineBasicBlock *MBB) const {
11643 const TargetMachine &TM = getTargetMachine();
11644 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11645 DebugLoc DL = MI.getDebugLoc();
11646
11647 assert(TM.getTargetTriple().isOSWindows() &&
11648 "__chkstk is only supported on Windows");
11649 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11650
11651 // __chkstk takes the number of words to allocate on the stack in R4, and
11652 // returns the stack adjustment in number of bytes in R4. This will not
11653 // clobber any other registers (other than the obvious lr).
11654 //
11655 // Although, technically, IP should be considered a register which may be
11656 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11657 // thumb-2 environment, so there is no interworking required. As a result, we
11658 // do not expect a veneer to be emitted by the linker, clobbering IP.
11659 //
11660 // Each module receives its own copy of __chkstk, so no import thunk is
11661 // required, again, ensuring that IP is not clobbered.
11662 //
11663 // Finally, although some linkers may theoretically provide a trampoline for
11664 // out of range calls (which is quite common due to a 32M range limitation of
11665 // branches for Thumb), we can generate the long-call version via
11666 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11667 // IP.
11668
11669 RTLIB::LibcallImpl ChkStkLibcall = getLibcallImpl(RTLIB::STACK_PROBE);
11670 if (ChkStkLibcall == RTLIB::Unsupported)
11671 reportFatalUsageError("no available implementation of __chkstk");
11672
11673 const char *ChkStk = getLibcallImplName(ChkStkLibcall).data();
11674 switch (TM.getCodeModel()) {
11675 case CodeModel::Tiny:
11676 llvm_unreachable("Tiny code model not available on ARM.");
11677 case CodeModel::Small:
11678 case CodeModel::Medium:
11679 case CodeModel::Kernel:
// Direct call to the probe symbol.
11680 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11682 .addExternalSymbol(ChkStk)
11685 .addReg(ARM::R12,
11687 .addReg(ARM::CPSR,
11689 break;
11690 case CodeModel::Large: {
// Long-call form: the symbol address is first loaded into a virtual register.
11691 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11692 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11693
11694 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11695 .addExternalSymbol(ChkStk);
11701 .addReg(ARM::R12,
11703 .addReg(ARM::CPSR,
11705 break;
11706 }
11707 }
11708
// SP = SP - R4; both source registers are marked killed.
11709 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11710 .addReg(ARM::SP, RegState::Kill)
11711 .addReg(ARM::R4, RegState::Kill)
11714 .add(condCodeOp());
11715
11716 MI.eraseFromParent();
11717 return MBB;
11718}
11719
// Expand the divide-by-zero check pseudo MI.
//
// The instructions following MI are moved into a new continuation block
// (ContBB) placed right after MBB, and a trap block containing t__brkdiv0 is
// appended to the function. MBB then compares MI's operand 0 against 0 and
// conditionally branches to the trap block. MI is erased and ContBB is
// returned.
11721ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11722 MachineBasicBlock *MBB) const {
11723 DebugLoc DL = MI.getDebugLoc();
11724 MachineFunction *MF = MBB->getParent();
11725 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11726
// Split MBB after MI: everything behind MI moves to ContBB.
11727 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11728 MF->insert(++MBB->getIterator(), ContBB);
11729 ContBB->splice(ContBB->begin(), MBB,
11730 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11732 MBB->addSuccessor(ContBB);
11733
11734 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11735 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11736 MF->push_back(TrapBB);
11737 MBB->addSuccessor(TrapBB);
11738
// Compare the checked register with zero and branch to the trap block.
11739 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11740 .addReg(MI.getOperand(0).getReg())
11741 .addImm(0)
11743 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11744 .addMBB(TrapBB)
11746 .addReg(ARM::CPSR);
11747
11748 MI.eraseFromParent();
11749 return ContBB;
11750}
11751
11752// The CPSR operand of SelectItr might be missing a kill marker
11753// because there were multiple uses of CPSR, and ISel didn't know
11754// which to mark. Figure out whether SelectItr should have had a
11755// kill marker, and set it if it should. Returns the correct kill
11756// marker value.
//
// Returns false, without modifying SelectItr, when a later instruction in BB
// reads CPSR before it is redefined, or when the end of BB is reached and
// CPSR is live into a successor. Otherwise the kill flag is added to
// SelectItr and true is returned.
11759 const TargetRegisterInfo* TRI) {
11760 // Scan forward through BB for a use/def of CPSR.
11761 MachineBasicBlock::iterator miI(std::next(SelectItr));
11762 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11763 const MachineInstr& mi = *miI;
// The read check comes first, so an instruction that both reads and defines
// CPSR counts as a use.
11764 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11765 return false;
11766 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11767 break; // Should have kill-flag - update below.
11768 }
11769
11770 // If we hit the end of the block, check whether CPSR is live into a
11771 // successor.
11772 if (miI == BB->end()) {
11773 for (MachineBasicBlock *Succ : BB->successors())
11774 if (Succ->isLiveIn(ARM::CPSR))
11775 return false;
11776 }
11777
11778 // We found a def, or hit the end of the basic block and CPSR wasn't live
11779 // out. SelectMI should have a kill flag on CPSR.
11780 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11781 return true;
11782}
11783
11784/// Adds logic in loop entry MBB to calculate loop iteration count and adds
11785/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
///
/// All instructions are appended to TpEntry. OpSizeReg holds the element
/// count n. t2WhileLoopStart is given TpExit as its block operand, and an
/// unconditional t2B to TpLoopBody follows it.
/// \returns the register defined by t2WhileLoopSetup (the total iteration
/// count).
11787 MachineBasicBlock *TpLoopBody,
11788 MachineBasicBlock *TpExit, Register OpSizeReg,
11789 const TargetInstrInfo *TII, DebugLoc Dl,
11790 MachineRegisterInfo &MRI) {
11791 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
11792 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11793 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11794 .addUse(OpSizeReg)
11795 .addImm(15)
11797 .addReg(0);
11798
11799 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11800 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11801 .addUse(AddDestReg, RegState::Kill)
11802 .addImm(4)
11804 .addReg(0);
11805
// The iteration count is created in the GPRlr register class.
11806 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11807 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11808 .addUse(LsrDestReg, RegState::Kill);
11809
11810 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11811 .addUse(TotalIterationsReg)
11812 .addMBB(TpExit);
11813
11814 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11815 .addMBB(TpLoopBody)
11817
11818 return TotalIterationsReg;
11819}
11820
11821/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2LoopDec and
11822/// t2LoopEnd. These are used by later passes to generate tail predicated
11823/// loops.
///
/// TpLoopBody receives every generated instruction. TpEntry is the block named
/// on the entry edge of each PHI, and TpExit is the target of the t2B appended
/// after t2LoopEnd. OpDestReg, ElementCountReg and TotalIterationsReg are the
/// entry values of the destination-pointer, predication-counter and
/// loop-counter PHIs. OpSrcReg is the entry value of the source-pointer PHI
/// when IsMemcpy is true; otherwise it is used directly as the value operand
/// of the predicated store.
11824static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11825 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11826 const TargetInstrInfo *TII, DebugLoc Dl,
11827 MachineRegisterInfo &MRI, Register OpSrcReg,
11828 Register OpDestReg, Register ElementCountReg,
11829 Register TotalIterationsReg, bool IsMemcpy) {
11830 // First insert the PHI nodes (4 for memcpy, 3 otherwise) for: current
11831 // pointer to Src (if memcpy), Dest array, loop iteration counter,
// predication counter. Each PHI takes its initial value from TpEntry and its
// updated value from TpLoopBody itself.
11832
11833 Register SrcPhiReg, CurrSrcReg;
11834 if (IsMemcpy) {
11835 // Current position in the src array
11836 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11837 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11838 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11839 .addUse(OpSrcReg)
11840 .addMBB(TpEntry)
11841 .addUse(CurrSrcReg)
11842 .addMBB(TpLoopBody);
11843 }
11844
11845 // Current position in the dest array
11846 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11847 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11848 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11849 .addUse(OpDestReg)
11850 .addMBB(TpEntry)
11851 .addUse(CurrDestReg)
11852 .addMBB(TpLoopBody);
11853
11854 // Current loop counter
11855 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11856 Register RemainingLoopIterationsReg =
11857 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11858 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11859 .addUse(TotalIterationsReg)
11860 .addMBB(TpEntry)
11861 .addUse(RemainingLoopIterationsReg)
11862 .addMBB(TpLoopBody);
11863
11864 // Predication counter
11865 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11866 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11867 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11868 .addUse(ElementCountReg)
11869 .addMBB(TpEntry)
11870 .addUse(RemainingElementsReg)
11871 .addMBB(TpLoopBody);
11872
11873 // Pass predication counter to VCTP
11874 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11875 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11876 .addUse(PredCounterPhiReg)
11878 .addReg(0)
11879 .addReg(0);
11880
// Decrease the remaining element count by 16 for the next iteration.
11881 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11882 .addUse(PredCounterPhiReg)
11883 .addImm(16)
11885 .addReg(0);
11886
11887 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11888 Register SrcValueReg;
11889 if (IsMemcpy) {
11890 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
// Post-incrementing load: defines both the advanced source pointer and the
// loaded vector.
11891 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11892 .addDef(CurrSrcReg)
11893 .addDef(SrcValueReg)
11894 .addReg(SrcPhiReg)
11895 .addImm(16)
11897 .addUse(VccrReg)
11898 .addReg(0);
11899 } else
// Without a load, the caller-supplied OpSrcReg is itself the stored value.
11900 SrcValueReg = OpSrcReg;
11901
// Post-incrementing store: defines the advanced destination pointer.
11902 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11903 .addDef(CurrDestReg)
11904 .addUse(SrcValueReg)
11905 .addReg(DestPhiReg)
11906 .addImm(16)
11908 .addUse(VccrReg)
11909 .addReg(0);
11910
11911 // Add the pseudoInstrs for decrementing the loop counter and marking the
11912 // end: t2LoopDec and t2LoopEnd
11913 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11914 .addUse(LoopCounterPhiReg)
11915 .addImm(1);
11916
11917 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11918 .addUse(RemainingLoopIterationsReg)
11919 .addMBB(TpLoopBody);
11920
11921 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11922 .addMBB(TpExit)
11924}
11925
11927 // KCFI is supported in all ARM/Thumb modes
// so this query answers true unconditionally.
// NOTE(review): the signature line of this function is absent from this
// listing; confirm its name and return type against the header.
11928 return true;
11929}
11930
11934 const TargetInstrInfo *TII) const {
// Inserts a KCFI_CHECK pseudo in front of the call at MBBI and returns the new
// instruction. The pseudo is given the call's target register and the call's
// CFI type as an immediate.
// NOTE(review): the opening line(s) of this signature are absent from this
// listing; MBB and MBBI are used below as the block and the call iterator.
11935 assert(MBBI->isCall() && MBBI->getCFIType() &&
11936 "Invalid call instruction for a KCFI check");
11937
// Locate the operand that holds the call target; its index depends on the
// opcode.
11938 MachineOperand *TargetOp = nullptr;
11939 switch (MBBI->getOpcode()) {
11940 // ARM mode opcodes
11941 case ARM::BLX:
11942 case ARM::BLX_pred:
11943 case ARM::BLX_noip:
11944 case ARM::BLX_pred_noip:
11945 case ARM::BX_CALL:
11946 TargetOp = &MBBI->getOperand(0);
11947 break;
11948 case ARM::TCRETURNri:
11949 case ARM::TCRETURNrinotr12:
11950 case ARM::TAILJMPr:
11951 case ARM::TAILJMPr4:
11952 TargetOp = &MBBI->getOperand(0);
11953 break;
11954 // Thumb mode opcodes (Thumb1 and Thumb2)
11955 // Note: Most Thumb call instructions have predicate operands before the
11956 // target register. Format: tBLXr pred, predreg, target_register, ...
11957 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
11958 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
11959 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
11960 TargetOp = &MBBI->getOperand(2);
11961 break;
11962 // Tail call instructions don't have predicates, target is operand 0
11963 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
11964 TargetOp = &MBBI->getOperand(0);
11965 break;
11966 default:
11967 llvm_unreachable("Unexpected CFI call opcode");
11968 }
11969
11970 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
// Clear the renamable flag on the target operand.
// NOTE(review): presumably so the checked register and the called register
// cannot diverge after the check is inserted - confirm.
11971 TargetOp->setIsRenamable(false);
11972
11973 // Select the appropriate KCFI_CHECK variant based on the instruction set
11974 unsigned KCFICheckOpcode;
11975 if (Subtarget->isThumb()) {
11976 if (Subtarget->isThumb2()) {
11977 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
11978 } else {
11979 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
11980 }
11981 } else {
11982 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
11983 }
11984
// The check is built at MBBI (i.e. before the call) with the call's debug
// location.
11985 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
11986 .addReg(TargetOp->getReg())
11987 .addImm(MBBI->getCFIType())
11988 .getInstr();
11989}
11990
11993 MachineBasicBlock *BB) const {
// Expands the pseudo-instruction MI, dispatching on its opcode. Every case
// returns a MachineBasicBlock*: cases that split BB return the block that
// received the instructions following MI, and the others return BB (or
// whatever the delegated Emit* helper returns). An opcode without a case is
// printed to errs() and treated as unreachable.
// NOTE(review): the opening line(s) of this signature are absent from this
// listing.
11994 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11995 DebugLoc dl = MI.getDebugLoc();
11996 bool isThumb2 = Subtarget->isThumb2();
11997 switch (MI.getOpcode()) {
11998 default: {
11999 MI.print(errs());
12000 llvm_unreachable("Unexpected instr type to insert");
12001 }
12002
12003 // Thumb1 post-indexed loads are really just single-register LDMs.
12004 case ARM::tLDR_postidx: {
12005 MachineOperand Def(MI.getOperand(1));
12006 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12007 .add(Def) // Rn_wb
12008 .add(MI.getOperand(2)) // Rn
12009 .add(MI.getOperand(3)) // PredImm
12010 .add(MI.getOperand(4)) // PredReg
12011 .add(MI.getOperand(0)) // Rt
12012 .cloneMemRefs(MI);
12013 MI.eraseFromParent();
12014 return BB;
12015 }
12016
12017 case ARM::MVE_MEMCPYLOOPINST:
12018 case ARM::MVE_MEMSETLOOPINST: {
12019
12020 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
12021 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12022 // the iteration count (= ceil(size_in_bytes/16)) in the TP entry block and
12023 // adds the relevant instructions in the TP loop Body for generation of a
12024 // WLSTP loop.
12025
12026 // Below is relevant portion of the CFG after the transformation.
12027 // The Machine Basic Blocks are shown along with branch conditions (in
12028 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12029 // portion of the CFG and may not necessarily be the entry/exit of the
12030 // function.
12031
12032 // (Relevant) CFG after transformation:
12033 //                TP entry MBB
12034 //                     |
12035 //          |-----------------|
12036 //       (n <= 0)          (n > 0)
12037 //          |                 |
12038 //          |         TP loop Body MBB<--|
12039 //          |                |           |
12040 //           \               |___________|
12041 //            \             /
12042 //              TP exit MBB
12043
12044 MachineFunction *MF = BB->getParent();
12045 MachineFunctionProperties &Properties = MF->getProperties();
12046 MachineRegisterInfo &MRI = MF->getRegInfo();
12047
// Pseudo operands: 0 = destination, 1 = source (memcpy) or value to store
// (memset), 2 = size.
12048 Register OpDestReg = MI.getOperand(0).getReg();
12049 Register OpSrcReg = MI.getOperand(1).getReg();
12050 Register OpSizeReg = MI.getOperand(2).getReg();
12051
12052 // Allocate the required MBBs and add to parent function.
12053 MachineBasicBlock *TpEntry = BB;
12054 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12055 MachineBasicBlock *TpExit;
12056
12057 MF->push_back(TpLoopBody);
12058
12059 // If any instructions are present in the current block after
12060 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12061 // move the instructions into the newly created exit block. If there are no
12062 // instructions add an explicit branch to the FallThrough block and then
12063 // split.
12064 //
12065 // The split is required for two reasons:
12066 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
12067 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12068 // need to be updated. splitAt() already handles this.
12069 TpExit = BB->splitAt(MI, false);
// splitAt() handing back BB itself is taken to mean nothing followed MI.
12070 if (TpExit == BB) {
12071 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12072 "block containing memcpy/memset Pseudo");
12073 TpExit = BB->getFallThrough();
12074 BuildMI(BB, dl, TII->get(ARM::t2B))
12075 .addMBB(TpExit)
12077 TpExit = BB->splitAt(MI, false);
12078 }
12079
12080 // Add logic for iteration count
12081 Register TotalIterationsReg =
12082 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12083
12084 // Add the vectorized (and predicated) loads/store instructions
12085 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12086 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12087 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12088
12089 // Required to avoid conflict with the MachineVerifier during testing.
12090 Properties.resetNoPHIs();
12091
12092 // Connect the blocks
12093 TpEntry->addSuccessor(TpLoopBody);
12094 TpLoopBody->addSuccessor(TpLoopBody);
12095 TpLoopBody->addSuccessor(TpExit);
12096
12097 // Reorder for a more natural layout
12098 TpLoopBody->moveAfter(TpEntry);
12099 TpExit->moveAfter(TpLoopBody);
12100
12101 // Finally, remove the memcpy/memset Pseudo Instruction
12102 MI.eraseFromParent();
12103
12104 // Return the exit block as it may contain other instructions requiring a
12105 // custom inserter
12106 return TpExit;
12107 }
12108
12109 // The Thumb2 pre-indexed stores have the same MI operands, they just
12110 // define them differently in the .td files from the isel patterns, so
12111 // they need pseudos.
12112 case ARM::t2STR_preidx:
12113 MI.setDesc(TII->get(ARM::t2STR_PRE));
12114 return BB;
12115 case ARM::t2STRB_preidx:
12116 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12117 return BB;
12118 case ARM::t2STRH_preidx:
12119 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12120 return BB;
12121
// ARM-mode immediate pre-indexed stores: rebuilt with the decoded offset as a
// plain immediate; operand 3 of the pseudo is not copied.
12122 case ARM::STRi_preidx:
12123 case ARM::STRBi_preidx: {
12124 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12125 : ARM::STRB_PRE_IMM;
12126 // Decode the offset.
12127 unsigned Offset = MI.getOperand(4).getImm();
12128 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12130 if (isSub)
12131 Offset = -Offset;
12132
12133 MachineMemOperand *MMO = *MI.memoperands_begin();
12134 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12135 .add(MI.getOperand(0)) // Rn_wb
12136 .add(MI.getOperand(1)) // Rt
12137 .add(MI.getOperand(2)) // Rn
12138 .addImm(Offset) // offset (skip GPR==zero_reg)
12139 .add(MI.getOperand(5)) // pred
12140 .add(MI.getOperand(6))
12141 .addMemOperand(MMO);
12142 MI.eraseFromParent();
12143 return BB;
12144 }
// Register-offset / halfword pre-indexed stores: same operand list, only the
// opcode changes.
12145 case ARM::STRr_preidx:
12146 case ARM::STRBr_preidx:
12147 case ARM::STRH_preidx: {
12148 unsigned NewOpc;
12149 switch (MI.getOpcode()) {
12150 default: llvm_unreachable("unexpected opcode!");
12151 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12152 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12153 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12154 }
12155 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12156 for (const MachineOperand &MO : MI.operands())
12157 MIB.add(MO);
12158 MI.eraseFromParent();
12159 return BB;
12160 }
12161
12162 case ARM::tMOVCCr_pseudo: {
12163 // To "insert" a SELECT_CC instruction, we actually have to insert the
12164 // diamond control-flow pattern. The incoming instruction knows the
12165 // destination vreg to set, the condition code register to branch on, the
12166 // true/false values to select between, and a branch opcode to use.
12167 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12169
12170 // thisMBB:
12171 // ...
12172 // TrueVal = ...
12173 // cmpTY ccX, r1, r2
12174 // bCC copy1MBB
12175 // fallthrough --> copy0MBB
12176 MachineBasicBlock *thisMBB = BB;
12177 MachineFunction *F = BB->getParent();
12178 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12179 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12180 F->insert(It, copy0MBB);
12181 F->insert(It, sinkMBB);
12182
12183 // Set the call frame size on entry to the new basic blocks.
12184 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12185 copy0MBB->setCallFrameSize(CallFrameSize);
12186 sinkMBB->setCallFrameSize(CallFrameSize);
12187
12188 // Check whether CPSR is live past the tMOVCCr_pseudo.
// If it is, both new blocks must list CPSR as live-in.
12189 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12190 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12191 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12192 copy0MBB->addLiveIn(ARM::CPSR);
12193 sinkMBB->addLiveIn(ARM::CPSR);
12194 }
12195
12196 // Transfer the remainder of BB and its successor edges to sinkMBB.
12197 sinkMBB->splice(sinkMBB->begin(), BB,
12198 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12200
12201 BB->addSuccessor(copy0MBB);
12202 BB->addSuccessor(sinkMBB);
12203
// Conditional branch straight to sinkMBB, using the pseudo's condition code
// (operand 3) and condition register (operand 4).
12204 BuildMI(BB, dl, TII->get(ARM::tBcc))
12205 .addMBB(sinkMBB)
12206 .addImm(MI.getOperand(3).getImm())
12207 .addReg(MI.getOperand(4).getReg());
12208
12209 // copy0MBB:
12210 // %FalseValue = ...
12211 // # fallthrough to sinkMBB
12212 BB = copy0MBB;
12213
12214 // Update machine-CFG edges
12215 BB->addSuccessor(sinkMBB);
12216
12217 // sinkMBB:
12218 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12219 // ...
12220 BB = sinkMBB;
12221 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12222 .addReg(MI.getOperand(1).getReg())
12223 .addMBB(copy0MBB)
12224 .addReg(MI.getOperand(2).getReg())
12225 .addMBB(thisMBB);
12226
12227 MI.eraseFromParent(); // The pseudo instruction is gone now.
12228 return BB;
12229 }
12230
12231 case ARM::BCCi64:
12232 case ARM::BCCZi64: {
12233 // If there is an unconditional branch to the other successor, remove it.
12234 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12235
12236 // Compare both parts that make up the double comparison separately for
12237 // equality.
12238 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12239
12240 Register LHS1 = MI.getOperand(1).getReg();
12241 Register LHS2 = MI.getOperand(2).getReg();
// In both arms the second compare is predicated on EQ, so it only executes
// when the first pair compared equal.
12242 if (RHSisZero) {
12243 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12244 .addReg(LHS1)
12245 .addImm(0)
12247 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12248 .addReg(LHS2).addImm(0)
12249 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12250 } else {
12251 Register RHS1 = MI.getOperand(3).getReg();
12252 Register RHS2 = MI.getOperand(4).getReg();
12253 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12254 .addReg(LHS1)
12255 .addReg(RHS1)
12257 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12258 .addReg(LHS2).addReg(RHS2)
12259 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12260 }
12261
// The branch emitted below is always an EQ branch; for an NE pseudo the two
// targets are swapped instead of changing the condition.
12262 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12263 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12264 if (MI.getOperand(0).getImm() == ARMCC::NE)
12265 std::swap(destMBB, exitMBB);
12266
12267 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12268 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12269 if (isThumb2)
12270 BuildMI(BB, dl, TII->get(ARM::t2B))
12271 .addMBB(exitMBB)
12273 else
12274 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12275
12276 MI.eraseFromParent(); // The pseudo instruction is gone now.
12277 return BB;
12278 }
12279
// The setjmp pseudos are left untouched here.
12280 case ARM::Int_eh_sjlj_setjmp:
12281 case ARM::Int_eh_sjlj_setjmp_nofp:
12282 case ARM::tInt_eh_sjlj_setjmp:
12283 case ARM::t2Int_eh_sjlj_setjmp:
12284 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12285 return BB;
12286
// The remaining pseudos are handed to dedicated helpers.
12287 case ARM::Int_eh_sjlj_setup_dispatch:
12288 EmitSjLjDispatchBlock(MI, BB);
12289 return BB;
12290 case ARM::COPY_STRUCT_BYVAL_I32:
12291 ++NumLoopByVals;
12292 return EmitStructByval(MI, BB);
12293 case ARM::WIN__CHKSTK:
12294 return EmitLowered__chkstk(MI, BB);
12295 case ARM::WIN__DBZCHK:
12296 return EmitLowered__dbzchk(MI, BB);
12297 }
12298}
12299
12300/// Attaches vregs to MEMCPY that it will use as scratch registers
12301/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12302/// instead of as a custom inserter because we need the use list from the SDNode.
///
/// Operands 0 and 1 of MI are additionally marked dead when the corresponding
/// result value (0 or 1) of Node has no uses. The number of scratch registers
/// is read from MI's immediate operand 4; they are created in tGPR on
/// Thumb1-only subtargets and in GPR otherwise.
12303static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12304 MachineInstr &MI, const SDNode *Node) {
12305 bool isThumb1 = Subtarget->isThumb1Only();
12306
12307 MachineFunction *MF = MI.getParent()->getParent();
12308 MachineRegisterInfo &MRI = MF->getRegInfo();
12309 MachineInstrBuilder MIB(*MF, MI);
12310
12311 // If the new dst/src is unused mark it as dead.
12312 if (!Node->hasAnyUseOfValue(0)) {
12313 MI.getOperand(0).setIsDead(true);
12314 }
12315 if (!Node->hasAnyUseOfValue(1)) {
12316 MI.getOperand(1).setIsDead(true);
12317 }
12318
12319 // The MEMCPY both defines and kills the scratch registers.
// NOTE(review): the statement that attaches TmpReg to MIB is absent from this
// listing; only the register creation is visible.
12320 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12321 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12322 : &ARM::GPRRegClass);
12324 }
12325}
12326
12328 SDNode *Node) const {
// Post-isel fix-up of MI using information from its originating Node.
// MEMCPY gets its scratch registers attached and nothing else. For every
// other instruction, flag-setting add/sub pseudos are renamed to their real
// opcode, and an implicit CPSR def is folded into the optional cc_out operand
// when that def is live (or always, on Thumb1-only subtargets).
// NOTE(review): the first line of this signature is absent from this listing.
12329 if (MI.getOpcode() == ARM::MEMCPY) {
12330 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12331 return;
12332 }
12333
12334 const MCInstrDesc *MCID = &MI.getDesc();
12335 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12336 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12337 // operand is still set to noreg. If needed, set the optional operand's
12338 // register to CPSR, and remove the redundant implicit def.
12339 //
12340 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12341
12342 // Rename pseudo opcodes.
// A zero NewOpc means MI is not one of the renamed pseudos.
12343 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12344 unsigned ccOutIdx;
12345 if (NewOpc) {
12346 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12347 MCID = &TII->get(NewOpc);
12348
12349 assert(MCID->getNumOperands() ==
12350 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12351 && "converted opcode should be the same except for cc_out"
12352 " (and, on Thumb1, pred)");
12353
12354 MI.setDesc(*MCID);
12355
12356 // Add the optional cc_out operand
12357 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12358
12359 // On Thumb1, move all input operands to the end, then add the predicate
12360 if (Subtarget->isThumb1Only()) {
// Repeatedly re-append operand 1 and remove the original, which rotates
// the inputs behind the cc_out operand just added.
12361 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12362 MI.addOperand(MI.getOperand(1));
12363 MI.removeOperand(1);
12364 }
12365
12366 // Restore the ties
12367 for (unsigned i = MI.getNumOperands(); i--;) {
12368 const MachineOperand& op = MI.getOperand(i);
12369 if (op.isReg() && op.isUse()) {
12370 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12371 if (DefIdx != -1)
12372 MI.tieOperands(DefIdx, i);
12373 }
12374 }
12375
12377 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
// After the rotation above, cc_out sits at index 1 on Thumb1.
12378 ccOutIdx = 1;
12379 } else
12380 ccOutIdx = MCID->getNumOperands() - 1;
12381 } else
12382 ccOutIdx = MCID->getNumOperands() - 1;
12383
12384 // Any ARM instruction that sets the 's' bit should specify an optional
12385 // "cc_out" operand in the last operand position.
12386 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12387 assert(!NewOpc && "Optional cc_out operand required");
12388 return;
12389 }
12390 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12391 // since we already have an optional CPSR def.
// Only operands beyond those declared in MCID are scanned.
12392 bool definesCPSR = false;
12393 bool deadCPSR = false;
12394 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12395 ++i) {
12396 const MachineOperand &MO = MI.getOperand(i);
12397 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12398 definesCPSR = true;
12399 if (MO.isDead())
12400 deadCPSR = true;
12401 MI.removeOperand(i);
12402 break;
12403 }
12404 }
12405 if (!definesCPSR) {
12406 assert(!NewOpc && "Optional cc_out operand required");
12407 return;
12408 }
12409 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12410 if (deadCPSR) {
12411 assert(!MI.getOperand(ccOutIdx).getReg() &&
12412 "expect uninitialized optional cc_out operand");
12413 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12414 if (!Subtarget->isThumb1Only())
12415 return;
12416 }
12417
12418 // If this instruction was defined with an optional CPSR def and its dag node
12419 // had a live implicit CPSR def, then activate the optional CPSR def.
12420 MachineOperand &MO = MI.getOperand(ccOutIdx);
12421 MO.setReg(ARM::CPSR);
12422 MO.setIsDef(true);
12423}
12424
12425//===----------------------------------------------------------------------===//
12426// ARM Optimization Hooks
12427//===----------------------------------------------------------------------===//
12428
12429// Helper function that checks if N is a null or all ones constant.
// AllOnes chooses which of the two is tested for; the callers in this file
// describe it as requiring all ones instead of null.
// NOTE(review): the body statement is absent from this listing - confirm the
// exact test against the original source.
12430static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12432}
12433
12434// Return true if N is conditionally 0 or all ones.
12435// Detects these expressions where cc is an i1 value:
12436//
12437// (select cc 0, y) [AllOnes=0]
12438// (select cc y, 0) [AllOnes=0]
12439// (zext cc) [AllOnes=0]
12440// (sext cc) [AllOnes=0/1]
12441// (select cc -1, y) [AllOnes=1]
12442// (select cc y, -1) [AllOnes=1]
12443//
12444// Invert is set when N is the null/all ones constant when CC is false.
12445// OtherOp is set to the alternative value of N.
//
// CC receives the condition operand (operand 0 of N). Note that CC is written
// even on some paths that end up returning false; Invert and OtherOp are only
// written on the paths that return true.
// NOTE(review): the first line of this signature is absent from this listing;
// callers pass (N, AllOnes, CC, Invert, OtherOp, DAG).
12447 SDValue &CC, bool &Invert,
12448 SDValue &OtherOp,
12449 SelectionDAG &DAG) {
12450 switch (N->getOpcode()) {
12451 default: return false;
12452 case ISD::SELECT: {
12453 CC = N->getOperand(0);
12454 SDValue N1 = N->getOperand(1);
12455 SDValue N2 = N->getOperand(2);
// Constant in the true arm: N equals the constant when CC is true.
12456 if (isZeroOrAllOnes(N1, AllOnes)) {
12457 Invert = false;
12458 OtherOp = N2;
12459 return true;
12460 }
// Constant in the false arm: N equals the constant when CC is false.
12461 if (isZeroOrAllOnes(N2, AllOnes)) {
12462 Invert = true;
12463 OtherOp = N1;
12464 return true;
12465 }
12466 return false;
12467 }
12468 case ISD::ZERO_EXTEND:
12469 // (zext cc) can never be the all ones value.
12470 if (AllOnes)
12471 return false;
12472 [[fallthrough]];
12473 case ISD::SIGN_EXTEND: {
12474 SDLoc dl(N);
12475 EVT VT = N->getValueType(0);
12476 CC = N->getOperand(0);
// Only an extension of an i1 SETCC result is accepted.
12477 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12478 return false;
// When matching 0, the extension is 0 for a false CC (Invert); when matching
// all ones, (sext cc) is all ones for a true CC.
12479 Invert = !AllOnes;
12480 if (AllOnes)
12481 // When looking for an AllOnes constant, N is an sext, and the 'other'
12482 // value is 0.
12483 OtherOp = DAG.getConstant(0, dl, VT);
12484 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12485 // When looking for a 0 constant, N can be zext or sext.
12486 OtherOp = DAG.getConstant(1, dl, VT);
12487 else
12488 OtherOp = DAG.getAllOnesConstant(dl, VT);
12489 return true;
12490 }
12491 }
12492}
12493
12494// Combine a constant select operand into its use:
12495//
12496// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12497// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12498// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12499// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12500// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12501//
12502// The transform is rejected if the select doesn't have a constant operand that
12503// is null, or all ones when AllOnes is set.
12504//
12505// Also recognize sext/zext from i1:
12506//
12507// (add (zext cc), x) -> (select cc (add x, 1), x)
12508// (add (sext cc), x) -> (select cc (add x, -1), x)
12509//
12510// These transformations eventually create predicated instructions.
12511//
12512// @param N The node to transform.
12513// @param Slct The N operand that is a select.
12514// @param OtherOp The other N operand (x above).
12515// @param DCI Context.
12516// @param AllOnes Require the select constant to be all ones instead of null.
12517// @returns The new node, or SDValue() on failure.
// NOTE(review): part of this signature is absent from this listing; only the
// trailing defaulted AllOnes parameter is visible.
12518static
12521 bool AllOnes = false) {
12522 SelectionDAG &DAG = DCI.DAG;
12523 EVT VT = N->getValueType(0);
12524 SDValue NonConstantVal;
12525 SDValue CCOp;
12526 bool SwapSelectOps;
12527 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12528 NonConstantVal, DAG))
12529 return SDValue();
12530
12531 // Slct is now known to be the desired identity constant when CC is true.
// With the identity constant selected, N reduces to OtherOp; otherwise N's
// operation is re-applied to OtherOp and the select's non-constant value.
12532 SDValue TrueVal = OtherOp;
12533 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12534 OtherOp, NonConstantVal);
12535 // Unless SwapSelectOps says CC should be false.
12536 if (SwapSelectOps)
12537 std::swap(TrueVal, FalseVal);
12538
12539 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12540 CCOp, TrueVal, FalseVal);
12541}
12542
12543// Attempt combineSelectAndUse on each operand of a commutative operator N.
// Operand 0 is tried as the select first, then operand 1; an operand is only
// tried when its node has a single use. Returns the first successful result,
// or SDValue() if neither attempt succeeds.
// NOTE(review): the signature lines after `static` are absent from this
// listing; AllOnes and DCI are used below as parameters.
12544static
12547 SDValue N0 = N->getOperand(0);
12548 SDValue N1 = N->getOperand(1);
12549 if (N0.getNode()->hasOneUse())
12550 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12551 return Result;
12552 if (N1.getNode()->hasOneUse())
12553 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12554 return Result;
12555 return SDValue();
12556}
12557
12559 // VUZP shuffle node.
// Returns true when N is an ARMISD::VUZP node, or an ARMISD::VTRN node whose
// first result type is v2i32; false otherwise.
// NOTE(review): the signature line is absent from this listing; callers pass
// an SDNode* and name this function IsVUZPShuffleNode.
12560 if (N->getOpcode() == ARMISD::VUZP)
12561 return true;
12562
12563 // "VUZP" on i32 is an alias for VTRN.
12564 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12565 return true;
12566
12567 return false;
12568}
12569
12572 const ARMSubtarget *Subtarget) {
// Rewrites a 64-bit vector ADD of the two results of one VUZP-style node into
// an arm_neon_vpadd intrinsic on that node's two inputs. Returns SDValue()
// when the pattern does not match.
// NOTE(review): the opening signature lines and the declaration of Ops are
// absent from this listing.
12573 // Look for ADD(VUZP.0, VUZP.1).
// Both operands must come from the same shuffle node but be different result
// values of it.
12574 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12575 N0 == N1)
12576 return SDValue();
12577
12578 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12579 if (!N->getValueType(0).is64BitVector())
12580 return SDValue();
12581
12582 // Generate vpadd.
12583 SelectionDAG &DAG = DCI.DAG;
12584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12585 SDLoc dl(N);
12586 SDNode *Unzip = N0.getNode();
12587 EVT VT = N->getValueType(0);
12588
// Intrinsic operand list: the intrinsic ID as a pointer-typed constant,
// followed by the two inputs of the shuffle.
12590 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12591 TLI.getPointerTy(DAG.getDataLayout())));
12592 Ops.push_back(Unzip->getOperand(0));
12593 Ops.push_back(Unzip->getOperand(1));
12594
12595 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12596}
12597
12600 const ARMSubtarget *Subtarget) {
// Rewrites ADD(ext(VUZP.0), ext(VUZP.1)), where both extensions are of the
// same kind, into a vpaddl intrinsic on the concatenation of the shuffle's
// two inputs. Returns SDValue() when the pattern does not match.
// NOTE(review): the opening signature lines, part of the type check and the
// declaration of Ops are absent from this listing.
12601 // Check for two extended operands.
12602 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12603 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12604 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12605 N1.getOpcode() == ISD::ZERO_EXTEND))
12606 return SDValue();
12607
12608 SDValue N00 = N0.getOperand(0);
12609 SDValue N10 = N1.getOperand(0);
12610
12611 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
// Same shuffle node, but two different result values of it.
12612 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12613 N00 == N10)
12614 return SDValue();
12615
12616 // We only recognize Q register paddl here; this can't be reached until
12617 // after type legalization.
12618 if (!N00.getValueType().is64BitVector() ||
12620 return SDValue();
12621
12622 // Generate vpaddl.
12623 SelectionDAG &DAG = DCI.DAG;
12624 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12625 SDLoc dl(N);
12626 EVT VT = N->getValueType(0);
12627
12629 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12630 unsigned Opcode;
12631 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12632 Opcode = Intrinsic::arm_neon_vpaddls;
12633 else
12634 Opcode = Intrinsic::arm_neon_vpaddlu;
12635 Ops.push_back(DAG.getConstant(Opcode, dl,
12636 TLI.getPointerTy(DAG.getDataLayout())));
// The intrinsic input is the two shuffle inputs concatenated: the narrow
// element type with twice as many lanes as the result.
12637 EVT ElemTy = N00.getValueType().getVectorElementType();
12638 unsigned NumElts = VT.getVectorNumElements();
12639 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12640 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12641 N00.getOperand(0), N00.getOperand(1));
12642 Ops.push_back(Concat);
12643
12644 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12645}
12646
12647// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12648// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12649// much easier to match.
//
// Matches an ADD of two BUILD_VECTORs whose i-th elements are extracts of
// lanes 2*i and 2*i+1 of one common vector, and replaces it with an
// arm_neon_vpaddls intrinsic on that vector followed by an any-extend or
// truncate to the ADD's type. Returns SDValue() when the pattern does not
// match.
// NOTE(review): several lines of this function (part of the signature, the
// extract-opcode checks, the index constants and the declaration of Ops) are
// absent from this listing.
12650static SDValue
12653 const ARMSubtarget *Subtarget) {
12654 // Only perform optimization if after legalize, and if NEON is available. We
12655 // also expect both operands to be BUILD_VECTORs.
12656 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12657 || N0.getOpcode() != ISD::BUILD_VECTOR
12658 || N1.getOpcode() != ISD::BUILD_VECTOR)
12659 return SDValue();
12660
12661 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12662 EVT VT = N->getValueType(0);
12663 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12664 return SDValue();
12665
12666 // Check that the vector operands are of the right form.
12667 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12668 // operands, where N is the size of the formed vector.
12669 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12670 // index such that we have a pair wise add pattern.
12671
12672 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12674 return SDValue();
12675 SDValue Vec = N0->getOperand(0)->getOperand(0);
12676 SDNode *V = Vec.getNode();
12677 unsigned nextIndex = 0;
12678
12679 // For each operand of the ADD (both are BUILD_VECTORs), check that every
12680 // one of its operands is an EXTRACT_VECTOR with the same vector and the
12681 // appropriate index.
12682 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12685
12686 SDValue ExtVec0 = N0->getOperand(i);
12687 SDValue ExtVec1 = N1->getOperand(i);
12688
12689 // First operand is the vector, verify it's the same.
12690 if (V != ExtVec0->getOperand(0).getNode() ||
12691 V != ExtVec1->getOperand(0).getNode())
12692 return SDValue();
12693
12694 // Second is the constant, verify it's correct.
12697
12698 // For the constant, we want to see all the even or all the odd.
// N0 must supply lane nextIndex and N1 lane nextIndex + 1.
12699 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12700 || C1->getZExtValue() != nextIndex+1)
12701 return SDValue();
12702
12703 // Increment index.
12704 nextIndex+=2;
12705 } else
12706 return SDValue();
12707 }
12708
12709 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12710 // we're using the entire input vector, otherwise there's a size/legality
12711 // mismatch somewhere.
12712 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12714 return SDValue();
12715
12716 // Create VPADDL node.
12717 SelectionDAG &DAG = DCI.DAG;
12718 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12719
12720 SDLoc dl(N);
12721
12722 // Build operand list.
// The signed variant of the intrinsic is used unconditionally here.
12724 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12725 TLI.getPointerTy(DAG.getDataLayout())));
12726
12727 // Input is the vector.
12728 Ops.push_back(Vec);
12729
12730 // Get widened type and narrowed type.
12731 MVT widenType;
12732 unsigned numElem = VT.getVectorNumElements();
12733
// The intrinsic result has the ADD's lane count with lanes twice as wide as
// the input vector's lanes.
12734 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12735 switch (inputLaneType.getSimpleVT().SimpleTy) {
12736 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12737 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12738 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12739 default:
12740 llvm_unreachable("Invalid vector element type for padd optimization.");
12741 }
12742
12743 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
// Any-extend when VT is wider than the widened type, otherwise truncate.
12744 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12745 return DAG.getNode(ExtOp, dl, VT, tmp);
12746}
12747
// Hand V back only when its opcode is ISD::UMUL_LOHI or ISD::SMUL_LOHI; any
// other opcode yields an empty SDValue, which callers compare against
// SDValue() to detect "no multiply found".
12749 if (V->getOpcode() == ISD::UMUL_LOHI ||
12750 V->getOpcode() == ISD::SMUL_LOHI)
12751 return V;
12752 return SDValue();
12753}
12754
// Attempts to merge a low/high add pair (AddcNode, AddeNode) that accumulates
// a MUL and its sign word (an SRA of that same MUL by 31) into a single
// ARMISD::SMLALBB / SMLALBT / SMLALTB / SMLALTT node.
//
// Returns an empty SDValue when the subtarget lacks the base DSP extension or
// when the operands do not have the expected shape. On success, all uses of
// value 0 of both adds are redirected to the new node's results and
// SDValue(AddcNode, 0) is returned to signal that the replacement is done.
12755static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12757 const ARMSubtarget *Subtarget) {
12758 if (!Subtarget->hasBaseDSP())
12759 return SDValue();
12760
12761 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12762 // accumulates the product into a 64-bit value. The 16-bit values will
12763 // be sign extended somehow or SRA'd into 32-bit values
12764 // (addc (adde (mul 16bit, 16bit), lo), hi)
// The MUL may sit in either operand slot of the low add; whichever operand is
// not the MUL becomes the low accumulator word (Lo).
12765 SDValue Mul = AddcNode->getOperand(0);
12766 SDValue Lo = AddcNode->getOperand(1);
12767 if (Mul.getOpcode() != ISD::MUL) {
12768 Lo = AddcNode->getOperand(0);
12769 Mul = AddcNode->getOperand(1);
12770 if (Mul.getOpcode() != ISD::MUL)
12771 return SDValue();
12772 }
12773
// Same search on the high add: one operand must be an SRA, the other is the
// high accumulator word (Hi).
12774 SDValue SRA = AddeNode->getOperand(0);
12775 SDValue Hi = AddeNode->getOperand(1);
12776 if (SRA.getOpcode() != ISD::SRA) {
12777 SRA = AddeNode->getOperand(1);
12778 Hi = AddeNode->getOperand(0);
12779 if (SRA.getOpcode() != ISD::SRA)
12780 return SDValue();
12781 }
// The shift amount must be the constant 31, i.e. the SRA produces the sign
// word of its input.
12782 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12783 if (Const->getZExtValue() != 31)
12784 return SDValue();
12785 } else
12786 return SDValue();
12787
// The value being sign-spread must be the very MUL that feeds the low add.
12788 if (SRA.getOperand(0) != Mul)
12789 return SDValue();
12790
12791 SelectionDAG &DAG = DCI.DAG;
12792 SDLoc dl(AddcNode);
12793 unsigned Opcode = 0;
12794 SDValue Op0;
12795 SDValue Op1;
12796
// Choose the B/T variant per multiplicand. When isS16 matches, the operand is
// used as-is; when isSRA16 matches, the operand's own operand 0 (the value
// being shifted) is used instead. NOTE(review): isS16/isSRA16 are defined
// outside this chunk; presumably "sign-extended 16-bit value" and "arithmetic
// shift right by 16" respectively - confirm against their definitions.
12797 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12798 Opcode = ARMISD::SMLALBB;
12799 Op0 = Mul.getOperand(0);
12800 Op1 = Mul.getOperand(1);
12801 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12802 Opcode = ARMISD::SMLALBT;
12803 Op0 = Mul.getOperand(0);
12804 Op1 = Mul.getOperand(1).getOperand(0);
12805 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12806 Opcode = ARMISD::SMLALTB;
12807 Op0 = Mul.getOperand(0).getOperand(0);
12808 Op1 = Mul.getOperand(1);
12809 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12810 Opcode = ARMISD::SMLALTT;
12811 Op0 = Mul->getOperand(0).getOperand(0);
12812 Op1 = Mul->getOperand(1).getOperand(0);
12813 }
12814
// Op0/Op1 are still empty when none of the four variants matched.
12815 if (!Op0 || !Op1)
12816 return SDValue();
12817
// The new node yields two i32 results: value 0 replaces the low add, value 1
// replaces the high add.
12818 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12819 Op0, Op1, Lo, Hi);
12820 // Replace the ADDs' nodes uses by the MLA node's values.
12821 SDValue HiMLALResult(SMLAL.getNode(), 1);
12822 SDValue LoMLALResult(SMLAL.getNode(), 0);
12823
12824 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12825 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12826
12827 // Return original node to notify the driver to stop replacing.
12828 SDValue resNode(AddcNode, 0);
12829 return resNode;
12830}
12831
12834 const ARMSubtarget *Subtarget) {
12835 // Look for multiply add opportunities.
12836 // The pattern is an ISD::[US]MUL_LOHI followed by two add nodes, where
12837 // each add node consumes a value from the ISD::[US]MUL_LOHI and there is
12838 // a carry link from the first add to the second add.
12839 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12840 // a S/UMLAL instruction.
12841 // UMUL_LOHI
12842 // / :lo \ :hi
12843 // V \ [no multiline comment]
12844 // loAdd -> ADDC |
12845 // \ :carry /
12846 // V V
12847 // ADDE <- hiAdd
12848 //
12849 // In the special case where only the higher part of a signed result is used
12850 // and the add to the low part of the result of ISD::SMUL_LOHI adds or subtracts
12851 // a constant with the exact value of 0x80000000, we recognize we are dealing
12852 // with a "rounded multiply and add" (or subtract) and transform it into
12853 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12854
12855 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12856 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12857 "Expect an ADDE or SUBE");
12858
12859 assert(AddeSubeNode->getNumOperands() == 3 &&
12860 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12861 "ADDE node has the wrong inputs");
12862
12863 // Check that we are chained to the right ADDC or SUBC node.
12864 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12865 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12866 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12867 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12868 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12869 return SDValue();
12870
12871 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12872 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12873
12874 // Check if the two operands are from the same mul_lohi node.
12875 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12876 return SDValue();
12877
12878 assert(AddcSubcNode->getNumValues() == 2 &&
12879 AddcSubcNode->getValueType(0) == MVT::i32 &&
12880 "Expect ADDC with two result values. First: i32");
12881
12882 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12883 // maybe a SMLAL which multiplies two 16-bit values.
12884 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12885 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12886 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12887 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12888 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12889 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12890
12891 // Check for the triangle shape.
12892 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12893 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12894
12895 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12896 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12897 return SDValue();
12898
12899 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12900 bool IsLeftOperandMUL = false;
12901 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12902 if (MULOp == SDValue())
12903 MULOp = findMUL_LOHI(AddeSubeOp1);
12904 else
12905 IsLeftOperandMUL = true;
12906 if (MULOp == SDValue())
12907 return SDValue();
12908
12909 // Figure out the right opcode.
12910 unsigned Opc = MULOp->getOpcode();
12911 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12912
12913 // Figure out the high and low input values to the MLAL node.
12914 SDValue *HiAddSub = nullptr;
12915 SDValue *LoMul = nullptr;
12916 SDValue *LowAddSub = nullptr;
12917
12918 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12919 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12920 return SDValue();
12921
12922 if (IsLeftOperandMUL)
12923 HiAddSub = &AddeSubeOp1;
12924 else
12925 HiAddSub = &AddeSubeOp0;
12926
12927 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12928 // whose low result is fed to the ADDC/SUBC we are checking.
12929
12930 if (AddcSubcOp0 == MULOp.getValue(0)) {
12931 LoMul = &AddcSubcOp0;
12932 LowAddSub = &AddcSubcOp1;
12933 }
12934 if (AddcSubcOp1 == MULOp.getValue(0)) {
12935 LoMul = &AddcSubcOp1;
12936 LowAddSub = &AddcSubcOp0;
12937 }
12938
12939 if (!LoMul)
12940 return SDValue();
12941
12942 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12943 // the replacement below will create a cycle.
12944 if (AddcSubcNode == HiAddSub->getNode() ||
12945 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12946 return SDValue();
12947
12948 // Create the merged node.
12949 SelectionDAG &DAG = DCI.DAG;
12950
12951 // Start building operand list.
12953 Ops.push_back(LoMul->getOperand(0));
12954 Ops.push_back(LoMul->getOperand(1));
12955
12956 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12957 // the case, we must be doing signed multiplication and only use the higher
12958 // part of the result of the MLAL, furthermore the LowAddSub must be a constant
12959 // addition or subtraction with the value of 0x80000000.
12960 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12961 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12962 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12963 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12964 0x80000000) {
12965 Ops.push_back(*HiAddSub);
12966 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12967 FinalOpc = ARMISD::SMMLSR;
12968 } else {
12969 FinalOpc = ARMISD::SMMLAR;
12970 }
12971 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12972 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12973
12974 return SDValue(AddeSubeNode, 0);
12975 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12976 // SMMLS is generated during instruction selection and the rest of this
12977 // function can not handle the case where AddcSubcNode is a SUBC.
12978 return SDValue();
12979
12980 // Finish building the operand list for {U/S}MLAL
12981 Ops.push_back(*LowAddSub);
12982 Ops.push_back(*HiAddSub);
12983
12984 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12985 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12986
12987 // Replace the ADDs' nodes uses by the MLA node's values.
12988 SDValue HiMLALResult(MLALNode.getNode(), 1);
12989 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12990
12991 SDValue LoMLALResult(MLALNode.getNode(), 0);
12992 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12993
12994 // Return original node to notify the driver to stop replacing.
12995 return SDValue(AddeSubeNode, 0);
12996}
12997
13000 const ARMSubtarget *Subtarget) {
13001 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13002 // While trying to combine for the other MLAL nodes, first search for the
13003 // chance to use UMAAL. Check if Addc uses a node which has already
13004 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13005 // as the addend, and it's handled in PerformUMLALCombine.
//
// Result: an empty SDValue when nothing was changed, SDValue(AddeNode, 0)
// after uses have been rewritten to a new UMAAL, or whatever
// AddCombineTo64bitMLAL returns on the two fall-back paths below.
13006
// UMAAL formation is gated on v6 + DSP; otherwise only the MLAL combine is
// attempted.
13007 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13008 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13009
13010 // Check that operand 2 of the ADDE (its carry input) comes from an ADDC.
13011 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13012 if (AddcNode->getOpcode() != ARMISD::ADDC)
13013 return SDValue();
13014
13015 // Find the already-converted UMLAL feeding the ADDC; if neither ADDC
// operand is a UMLAL, fall back to the plain MLAL combine. AddHi receives
// the ADDC operand that is not the UMLAL and later becomes the extra addend
// of the UMAAL.
13016 SDNode *UmlalNode = nullptr;
13017 SDValue AddHi;
13018 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13019 UmlalNode = AddcNode->getOperand(0).getNode();
13020 AddHi = AddcNode->getOperand(1);
13021 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13022 UmlalNode = AddcNode->getOperand(1).getNode();
13023 AddHi = AddcNode->getOperand(0);
13024 } else {
13025 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13026 }
13027
13028 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13029 // the ADDC as well as Zero.
// The UMLAL's own operand 3 must be zero as well, which leaves that slot
// free for AddHi in the merged node.
13030 if (!isNullConstant(UmlalNode->getOperand(3)))
13031 return SDValue();
13032
// Accept the ADDE operands {0, UMLAL} in either order.
13033 if ((isNullConstant(AddeNode->getOperand(0)) &&
13034 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13035 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13036 isNullConstant(AddeNode->getOperand(1)))) {
13037 SelectionDAG &DAG = DCI.DAG;
// Reuse the UMLAL's first three operands and put AddHi in place of the
// zero operand checked above.
13038 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13039 UmlalNode->getOperand(2), AddHi };
13040 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13041 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13042
13043 // Replace the ADDs' nodes uses by the UMAAL node's values.
// Value 1 of the UMAAL stands in for the ADDE, value 0 for the ADDC.
13044 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13045 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13046
13047 // Return original node to notify the driver to stop replacing.
13048 return SDValue(AddeNode, 0);
13049 }
13050 return SDValue();
13051}
13052
13054 const ARMSubtarget *Subtarget) {
// Rewrites a node N whose operands 2 and 3 are an ADDC and an
// ADDE(0, 0, <that ADDC>) into ARMISD::UMAAL, feeding the two ADDC inputs
// directly as the extra addends. Yields an empty SDValue when the subtarget
// lacks v6 or DSP, or when the operands do not have that shape.
13055 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13056 return SDValue();
13057
13058 // Check that we have a pair of ADDC and ADDE as operands.
13059 // Both addends of the ADDE must be zero.
13060 SDNode* AddcNode = N->getOperand(2).getNode();
13061 SDNode* AddeNode = N->getOperand(3).getNode();
// The last clause ties the ADDE's operand 2 to this particular ADDC, so the
// two nodes really form one add-with-carry pair.
13062 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13063 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13064 isNullConstant(AddeNode->getOperand(0)) &&
13065 isNullConstant(AddeNode->getOperand(1)) &&
13066 (AddeNode->getOperand(2).getNode() == AddcNode))
// Operands 0 and 1 of N are kept unchanged; the ADDC/ADDE pair is replaced
// by the ADDC's two inputs.
13067 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13068 DAG.getVTList(MVT::i32, MVT::i32),
13069 {N->getOperand(0), N->getOperand(1),
13070 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13071 else
13072 return SDValue();
13073}
13074
13077 const ARMSubtarget *Subtarget) {
13078 SelectionDAG &DAG(DCI.DAG);
13079
13080 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13081 // (SUBC (ADDE 0, 0, C), 1) -> C
13082 SDValue LHS = N->getOperand(0);
13083 SDValue RHS = N->getOperand(1);
13084 if (LHS->getOpcode() == ARMISD::ADDE &&
13085 isNullConstant(LHS->getOperand(0)) &&
13086 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13087 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13088 }
13089 }
13090
13091 if (Subtarget->isThumb1Only()) {
13092 SDValue RHS = N->getOperand(1);
13094 int32_t imm = C->getSExtValue();
13095 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13096 SDLoc DL(N);
13097 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13098 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13099 : ARMISD::ADDC;
13100 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13101 }
13102 }
13103 }
13104
13105 return SDValue();
13106}
13107
13110 const ARMSubtarget *Subtarget) {
13111 if (Subtarget->isThumb1Only()) {
13112 SelectionDAG &DAG = DCI.DAG;
13113 SDValue RHS = N->getOperand(1);
13115 int64_t imm = C->getSExtValue();
13116 if (imm < 0) {
13117 SDLoc DL(N);
13118
13119 // The with-carry-in form matches bitwise not instead of the negation.
13120 // Effectively, the inverse interpretation of the carry flag already
13121 // accounts for part of the negation.
13122 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13123
13124 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13125 : ARMISD::ADDE;
13126 return DAG.getNode(Opcode, DL, N->getVTList(),
13127 N->getOperand(0), RHS, N->getOperand(2));
13128 }
13129 }
13130 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13131 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13132 }
13133 return SDValue();
13134}
13135
13138 const ARMSubtarget *Subtarget) {
13139 if (!Subtarget->hasMVEIntegerOps())
13140 return SDValue();
13141
13142 SDLoc dl(N);
13143 SDValue SetCC;
13144 SDValue LHS;
13145 SDValue RHS;
13146 ISD::CondCode CC;
13147 SDValue TrueVal;
13148 SDValue FalseVal;
13149
13150 if (N->getOpcode() == ISD::SELECT &&
13151 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13152 SetCC = N->getOperand(0);
13153 LHS = SetCC->getOperand(0);
13154 RHS = SetCC->getOperand(1);
13155 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13156 TrueVal = N->getOperand(1);
13157 FalseVal = N->getOperand(2);
13158 } else if (N->getOpcode() == ISD::SELECT_CC) {
13159 LHS = N->getOperand(0);
13160 RHS = N->getOperand(1);
13161 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13162 TrueVal = N->getOperand(2);
13163 FalseVal = N->getOperand(3);
13164 } else {
13165 return SDValue();
13166 }
13167
13168 unsigned int Opcode = 0;
13169 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13170 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13171 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13172 Opcode = ARMISD::VMINVu;
13173 if (CC == ISD::SETUGT)
13174 std::swap(TrueVal, FalseVal);
13175 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13176 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13177 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13178 Opcode = ARMISD::VMINVs;
13179 if (CC == ISD::SETGT)
13180 std::swap(TrueVal, FalseVal);
13181 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13182 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13183 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13184 Opcode = ARMISD::VMAXVu;
13185 if (CC == ISD::SETULT)
13186 std::swap(TrueVal, FalseVal);
13187 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13188 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13189 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13190 Opcode = ARMISD::VMAXVs;
13191 if (CC == ISD::SETLT)
13192 std::swap(TrueVal, FalseVal);
13193 } else
13194 return SDValue();
13195
13196 // Normalise to the right hand side being the vector reduction
13197 switch (TrueVal->getOpcode()) {
13202 std::swap(LHS, RHS);
13203 std::swap(TrueVal, FalseVal);
13204 break;
13205 }
13206
13207 EVT VectorType = FalseVal->getOperand(0).getValueType();
13208
13209 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13210 VectorType != MVT::v4i32)
13211 return SDValue();
13212
13213 EVT VectorScalarType = VectorType.getVectorElementType();
13214
13215 // The values being selected must also be the ones being compared
13216 if (TrueVal != LHS || FalseVal != RHS)
13217 return SDValue();
13218
13219 EVT LeftType = LHS->getValueType(0);
13220 EVT RightType = RHS->getValueType(0);
13221
13222 // The types must match the reduced type too
13223 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13224 return SDValue();
13225
13226 // Legalise the scalar to an i32
13227 if (VectorScalarType != MVT::i32)
13228 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13229
13230 // Generate the reduction as an i32 for legalisation purposes
13231 auto Reduction =
13232 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13233
13234 // The result isn't actually an i32 so truncate it back to its original type
13235 if (VectorScalarType != MVT::i32)
13236 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13237
13238 return Reduction;
13239}
13240
13241// A special combine for the vqdmulh family of instructions. This is one of the
13242 // potential set of patterns that could match this instruction. The base pattern
13243// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13244// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13245// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13246// the max is unnecessary.
13248 EVT VT = N->getValueType(0);
13249 SDValue Shft;
13250 ConstantSDNode *Clamp;
13251
13252 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13253 return SDValue();
13254
13255 if (N->getOpcode() == ISD::SMIN) {
13256 Shft = N->getOperand(0);
13257 Clamp = isConstOrConstSplat(N->getOperand(1));
13258 } else if (N->getOpcode() == ISD::VSELECT) {
13259 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13260 SDValue Cmp = N->getOperand(0);
13261 if (Cmp.getOpcode() != ISD::SETCC ||
13262 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13263 Cmp.getOperand(0) != N->getOperand(1) ||
13264 Cmp.getOperand(1) != N->getOperand(2))
13265 return SDValue();
13266 Shft = N->getOperand(1);
13267 Clamp = isConstOrConstSplat(N->getOperand(2));
13268 } else
13269 return SDValue();
13270
13271 if (!Clamp)
13272 return SDValue();
13273
13274 MVT ScalarType;
13275 int ShftAmt = 0;
13276 switch (Clamp->getSExtValue()) {
13277 case (1 << 7) - 1:
13278 ScalarType = MVT::i8;
13279 ShftAmt = 7;
13280 break;
13281 case (1 << 15) - 1:
13282 ScalarType = MVT::i16;
13283 ShftAmt = 15;
13284 break;
13285 case (1ULL << 31) - 1:
13286 ScalarType = MVT::i32;
13287 ShftAmt = 31;
13288 break;
13289 default:
13290 return SDValue();
13291 }
13292
13293 if (Shft.getOpcode() != ISD::SRA)
13294 return SDValue();
13296 if (!N1 || N1->getSExtValue() != ShftAmt)
13297 return SDValue();
13298
13299 SDValue Mul = Shft.getOperand(0);
13300 if (Mul.getOpcode() != ISD::MUL)
13301 return SDValue();
13302
13303 SDValue Ext0 = Mul.getOperand(0);
13304 SDValue Ext1 = Mul.getOperand(1);
13305 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13306 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13307 return SDValue();
13308 EVT VecVT = Ext0.getOperand(0).getValueType();
13309 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13310 return SDValue();
13311 if (Ext1.getOperand(0).getValueType() != VecVT ||
13312 VecVT.getScalarType() != ScalarType ||
13313 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13314 return SDValue();
13315
13316 SDLoc DL(Mul);
13317 unsigned LegalLanes = 128 / (ShftAmt + 1);
13318 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13319 // For types smaller than legal vectors extend to be legal and only use needed
13320 // lanes.
13321 if (VecVT.getSizeInBits() < 128) {
13322 EVT ExtVecVT =
13324 VecVT.getVectorNumElements());
13325 SDValue Inp0 =
13326 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13327 SDValue Inp1 =
13328 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13329 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13330 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13331 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13332 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13333 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13334 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13335 }
13336
13337 // For larger types, split into legal sized chunks.
13338 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13339 unsigned NumParts = VecVT.getSizeInBits() / 128;
13341 for (unsigned I = 0; I < NumParts; ++I) {
13342 SDValue Inp0 =
13343 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13344 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13345 SDValue Inp1 =
13346 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13347 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13348 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13349 Parts.push_back(VQDMULH);
13350 }
13351 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13352 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13353}
13354
13357 const ARMSubtarget *Subtarget) {
// Combine for a select-like node N (operand 0 = condition, operands 1 and 2 =
// the two values). Three rewrites are tried in order: folding a constant
// predicate, the VQDMULH pattern, and removing a negated condition by
// swapping the selected values. Only active with MVE integer ops.
13358 if (!Subtarget->hasMVEIntegerOps())
13359 return SDValue();
13360
13361 // Constant fold vselect 0, A, B -> B
13362 // and vselect 0xffff, A, B -> A
// Only a condition that is a PREDICATE_CAST of a constant is folded; any
// constant other than 0 or 0xffff falls through to the combines below.
13363 if (N->getOperand(0).getOpcode() == ARMISD::PREDICATE_CAST &&
13364 isa<ConstantSDNode>(N->getOperand(0).getOperand(0))) {
13365 unsigned C = N->getOperand(0).getConstantOperandVal(0);
13366 if (C == 0)
13367 return N->getOperand(2);
13368 if (C == 0xffff)
13369 return N->getOperand(1);
13370 }
13371
// Give the VQDMULH matcher a chance before looking at the condition.
13372 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13373 return V;
13374
13375 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13376 //
13377 // We need to re-implement this optimization here as the implementation in the
13378 // Target-Independent DAGCombiner does not handle the kind of constant we make
13379 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13380 // good reason, allowing truncation there would break other targets).
13381 //
13382 // Currently, this is only done for MVE, as it's the only target that benefits
13383 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13384 if (N->getOperand(0).getOpcode() != ISD::XOR)
13385 return SDValue();
13386 SDValue XOR = N->getOperand(0);
13387
13388 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13389 // It is important to check with truncation allowed as the BUILD_VECTORs we
13390 // generate in those situations will truncate their operands.
13391 ConstantSDNode *Const =
13392 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13393 /*AllowTruncation*/ true);
13394 if (!Const || !Const->isOne())
13395 return SDValue();
13396
13397 // Rewrite into vselect(cond, rhs, lhs).
// Cond is the un-negated input of the XOR; note that RHS and LHS are passed
// in swapped order to the new node.
13398 SDValue Cond = XOR->getOperand(0);
13399 SDValue LHS = N->getOperand(1);
13400 SDValue RHS = N->getOperand(2);
13401 EVT Type = N->getValueType(0);
13402 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13403}
13404
13405// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
13408 const ARMSubtarget *Subtarget) {
13409 SDValue Op0 = N->getOperand(0);
13410 SDValue Op1 = N->getOperand(1);
13411 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13412 EVT VT = N->getValueType(0);
13413
13414 if (!Subtarget->hasMVEIntegerOps() ||
13416 return SDValue();
13417
13418 if (CC == ISD::SETUGE) {
13419 std::swap(Op0, Op1);
13420 CC = ISD::SETULT;
13421 }
13422
13423 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13425 return SDValue();
13426
13427 // Check first operand is BuildVector of 0,1,2,...
13428 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13429 if (!Op0.getOperand(I).isUndef() &&
13431 Op0.getConstantOperandVal(I) == I))
13432 return SDValue();
13433 }
13434
13435 // The second is a Splat of Op1S
13436 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13437 if (!Op1S)
13438 return SDValue();
13439
13440 unsigned Opc;
13441 switch (VT.getVectorNumElements()) {
13442 case 2:
13443 Opc = Intrinsic::arm_mve_vctp64;
13444 break;
13445 case 4:
13446 Opc = Intrinsic::arm_mve_vctp32;
13447 break;
13448 case 8:
13449 Opc = Intrinsic::arm_mve_vctp16;
13450 break;
13451 case 16:
13452 Opc = Intrinsic::arm_mve_vctp8;
13453 break;
13454 default:
13455 return SDValue();
13456 }
13457
13458 SDLoc DL(N);
13459 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13460 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13461 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13462}
13463
13464/// PerformADDECombine - Target-specific dag combine transform from
13465/// ARMISD::ADDC, ARMISD::ADDE, and ISD::[US]MUL_LOHI to MLAL or
13466/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
///
/// On Thumb1-only subtargets the node is handed to PerformAddeSubeCombine
/// instead. On other subtargets nothing is done before legalization; after
/// it, the work is delegated to AddCombineTo64bitUMAAL. The result is
/// whatever the delegate returns, or an empty SDValue.
13469 const ARMSubtarget *Subtarget) {
13470 // Only ARM and Thumb2 support UMLAL/SMLAL.
13471 if (Subtarget->isThumb1Only())
13472 return PerformAddeSubeCombine(N, DCI, Subtarget);
13473
13474 // Only perform the checks after legalize when the pattern is available.
13475 if (DCI.isBeforeLegalize()) return SDValue();
13476
13477 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13478}
13479
13480/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13481/// operands N0 and N1. This is a helper for PerformADDCombine that is
13482/// called with the default operands, and if that fails, with commuted
13483/// operands.
///
/// The individual combines are tried in a fixed order and the first one that
/// produces a value wins; an empty SDValue means none of them applied.
13486 const ARMSubtarget *Subtarget){
13487 // Attempt to create vpadd for this add.
13488 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13489 return Result;
13490
13491 // Attempt to create vpaddl for this add.
// Two source shapes are recognised: one via AddCombineVUZPToVPADDL and one
// built from BUILD_VECTOR operands.
13492 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13493 return Result;
13494 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13495 Subtarget))
13496 return Result;
13497
13498 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
// Only attempted when N0 has a single use.
13499 if (N0.getNode()->hasOneUse())
13500 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13501 return Result;
13502 return SDValue();
13503}
13504
13506 EVT VT = N->getValueType(0);
13507 SDValue N0 = N->getOperand(0);
13508 SDValue N1 = N->getOperand(1);
13509 SDLoc dl(N);
13510
13511 auto IsVecReduce = [](SDValue Op) {
13512 switch (Op.getOpcode()) {
13513 case ISD::VECREDUCE_ADD:
13514 case ARMISD::VADDVs:
13515 case ARMISD::VADDVu:
13516 case ARMISD::VMLAVs:
13517 case ARMISD::VMLAVu:
13518 return true;
13519 }
13520 return false;
13521 };
13522
13523 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13524 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13525 // add(add(X, vecreduce(Y)), vecreduce(Z))
13526 // to make better use of vaddva style instructions.
13527 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13528 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13529 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13530 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13531 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13532 }
13533 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13534 // add(add(add(A, C), reduce(B)), reduce(D))
13535 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13536 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13537 unsigned N0RedOp = 0;
13538 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13539 N0RedOp = 1;
13540 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13541 return SDValue();
13542 }
13543
13544 unsigned N1RedOp = 0;
13545 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13546 N1RedOp = 1;
13547 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13548 return SDValue();
13549
13550 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13551 N1.getOperand(1 - N1RedOp));
13552 SDValue Add1 =
13553 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13554 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13555 }
13556 return SDValue();
13557 };
13558 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13559 return R;
13560 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13561 return R;
13562
13563 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13564 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13565 // by ascending load offsets. This can help cores prefetch if the order of
13566 // loads is more predictable.
13567 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13568 // Check if two reductions are known to load data where one is before/after
13569 // another. Return negative if N0 loads data before N1, positive if N1 is
13570 // before N0 and 0 otherwise if nothing is known.
13571 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13572 // Look through to the first operand of a MUL, for the VMLA case.
13573 // Currently only looks at the first operand, in the hope they are equal.
13574 if (N0.getOpcode() == ISD::MUL)
13575 N0 = N0.getOperand(0);
13576 if (N1.getOpcode() == ISD::MUL)
13577 N1 = N1.getOperand(0);
13578
13579 // Return true if the two operands are loads to the same object and the
13580 // offset of the first is known to be less than the offset of the second.
13581 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13582 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13583 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13584 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13585 Load1->isIndexed())
13586 return 0;
13587
13588 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13589 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13590
13591 if (!BaseLocDecomp0.getBase() ||
13592 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13593 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13594 return 0;
13595 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13596 return -1;
13597 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13598 return 1;
13599 return 0;
13600 };
13601
13602 SDValue X;
13603 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13604 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13605 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13606 N0.getOperand(1).getOperand(0));
13607 if (IsBefore < 0) {
13608 X = N0.getOperand(0);
13609 N0 = N0.getOperand(1);
13610 } else if (IsBefore > 0) {
13611 X = N0.getOperand(1);
13612 N0 = N0.getOperand(0);
13613 } else
13614 return SDValue();
13615 } else if (IsVecReduce(N0.getOperand(0))) {
13616 X = N0.getOperand(1);
13617 N0 = N0.getOperand(0);
13618 } else if (IsVecReduce(N0.getOperand(1))) {
13619 X = N0.getOperand(0);
13620 N0 = N0.getOperand(1);
13621 } else
13622 return SDValue();
13623 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13624 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13625 // Note this is backward to how you would expect. We create
13626 // add(reduce(load + 16), reduce(load + 0)) so that the
13627 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13628 // the X as VADDV(load + 0)
13629 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13630 } else
13631 return SDValue();
13632
13633 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13634 return SDValue();
13635
13636 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13637 return SDValue();
13638
13639 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13640 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13641 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13642 };
13643 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13644 return R;
13645 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13646 return R;
13647 return SDValue();
13648}
13649
// Combine for an i64 ISD::ADD on targets with MVE integer ops: folds the add
// into a long vector reduction by rewriting a (non-accumulating) reduction as
// its accumulating form with the other add operand as the accumulator.
// Returns an empty SDValue when nothing matches.
// NOTE(review): the first line of this declaration (listing line 13650) is
// absent from this listing. The call site in this file passes
// (N, DCI.DAG, Subtarget); confirm the exact signature against upstream.
13651 const ARMSubtarget *Subtarget) {
13652 if (!Subtarget->hasMVEIntegerOps())
13653 return SDValue();
13654
// NOTE(review): listing line 13655 is absent; the `return R;` below is
// presumably guarded by an `if (SDValue R = ...)` on that line - confirm.
13656 return R;
13657
13658 EVT VT = N->getValueType(0);
13659 SDValue N0 = N->getOperand(0);
13660 SDValue N1 = N->getOperand(1);
13661 SDLoc dl(N);
13662
13663 if (VT != MVT::i64)
13664 return SDValue();
13665
13666 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
13667 // will look like:
13668 // t1: i32,i32 = ARMISD::VADDLVs x
13669 // t2: i64 = build_pair t1, t1:1
13670 // t3: i64 = add t2, y
13671 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13672 // the add to be simplified separately.
13673 // We also need to check for sext / zext and commutative adds.
// MakeVecReduce(Opcode, OpcodeA, NA, NB): NB must be a BUILD_PAIR of both
// results of one node whose opcode is Opcode (plain) or OpcodeA (accumulating).
// On a match, returns a BUILD_PAIR of a new OpcodeA node that accumulates NA.
13674 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13675 SDValue NB) {
13676 if (NB->getOpcode() != ISD::BUILD_PAIR)
13677 return SDValue();
13678 SDValue VecRed = NB->getOperand(0);
// Require the pair to be exactly (result 0, result 1) of the same node.
13679 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13680 VecRed.getResNo() != 0 ||
13681 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13682 return SDValue();
13683
13684 if (VecRed->getOpcode() == OpcodeA) {
13685 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13686 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13687 VecRed.getOperand(0), VecRed.getOperand(1));
13688 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13689 }
13690
// NOTE(review): listing line 13691 (the declaration of `Ops`) is absent here.
// Ops[0]/Ops[1] receive the two i32 halves of the i64 accumulator NA.
13692 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13693
// For an accumulating source node skip its first two operands (the old
// accumulator halves, already folded into NA above); copy the rest.
13694 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13695 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13696 Ops.push_back(VecRed->getOperand(I));
13697 SDValue Red =
13698 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13699 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13700 SDValue(Red.getNode(), 1));
13701 };
13702
// Try every (plain, accumulating) opcode pair with both operand orders, since
// the add is commutative. The first match wins.
13703 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13704 return M;
13705 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13706 return M;
13707 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13708 return M;
13709 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13710 return M;
13711 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13712 return M;
13713 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13714 return M;
13715 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13716 return M;
13717 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13718 return M;
13719 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13720 return M;
13721 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13722 return M;
13723 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13724 return M;
13725 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13726 return M;
13727 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13728 return M;
13729 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13730 return M;
13731 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13732 return M;
13733 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13734 return M;
13735 return SDValue();
13736}
13737
// Returns whether the commute-with-shift transform is allowed for the shift
// node N (SHL, SRA or SRL) at combine level Level.
// NOTE(review): listing line 13739 (the function name and first parameter) is
// absent from this listing; confirm the declaration against upstream.
13738bool
13740 CombineLevel Level) const {
13741 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13742 N->getOpcode() == ISD::SRL) &&
13743 "Expected shift op");
13744
// Never commute when the shifted value has other users.
13745 SDValue ShiftLHS = N->getOperand(0);
13746 if (!ShiftLHS->hasOneUse())
13747 return false;
13748
// Likewise when the shifted value is a sign-extend whose source has other
// users.
13749 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13750 !ShiftLHS.getOperand(0)->hasOneUse())
13751 return false;
13752
13753 if (Level == BeforeLegalizeTypes)
13754 return true;
13755
// From here on only SHL is restricted; SRA/SRL are always allowed.
13756 if (N->getOpcode() != ISD::SHL)
13757 return true;
13758
13759 if (Subtarget->isThumb1Only()) {
13760 // Avoid making expensive immediates by commuting shifts. (This logic
13761 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13762 // for free.)
// NOTE(review): this check is redundant - non-SHL opcodes already returned
// true just above, so the branch below can never be taken.
13763 if (N->getOpcode() != ISD::SHL)
13764 return true;
13765 SDValue N1 = N->getOperand(0);
13766 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13767 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13768 return true;
// Refuse when the binop constant is below 256 (unsigned), or, for ADD, is a
// negative value greater than -256: keep those immediates unshifted.
13769 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13770 if (Const->getAPIntValue().ult(256))
13771 return false;
13772 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13773 Const->getAPIntValue().sgt(-256))
13774 return false;
13775 }
13776 return true;
13777 }
13778
13779 // Turn off commute-with-shift transform after legalization, so it doesn't
13780 // conflict with PerformSHLSimplify. (We could try to detect when
13781 // PerformSHLSimplify would trigger more precisely, but it isn't
13782 // really necessary.)
13783 return false;
13784}
13785
// Returns whether an XOR whose first operand is a SHL/SRL should be commuted
// with that shift. True only when both the XOR constant and the shift amount
// are constants and the XOR constant is a shifted mask covering exactly the
// bits the shift can leave set.
// NOTE(review): listing line 13786 (return type and function name) is absent
// from this listing; confirm the declaration against upstream.
13787 const SDNode *N) const {
13788 assert(N->getOpcode() == ISD::XOR &&
13789 (N->getOperand(0).getOpcode() == ISD::SHL ||
13790 N->getOperand(0).getOpcode() == ISD::SRL) &&
13791 "Expected XOR(SHIFT) pattern");
13792
13793 // Only commute if the entire NOT mask is a hidden shifted mask.
13794 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13795 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13796 if (XorC && ShiftC) {
13797 unsigned MaskIdx, MaskLen;
13798 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13799 unsigned ShiftAmt = ShiftC->getZExtValue();
13800 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
// SHL: the mask must start at bit ShiftAmt and run to the top bit.
13801 if (N->getOperand(0).getOpcode() == ISD::SHL)
13802 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
// SRL: the mask must start at bit 0 and cover BitWidth - ShiftAmt bits.
13803 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13804 }
13805 }
13806
13807 return false;
13808}
13809
// Decision hook for a shift-of-shift pair (SHL of SRL, or SRL of SHL).
// Returns true unless the subtarget is Thumb1-only and the value type is at
// most 32 bits wide.
// NOTE(review): listing line 13810 (return type and function name) is absent
// from this listing; confirm the declaration against upstream.
13811 const SDNode *N) const {
13812 assert(((N->getOpcode() == ISD::SHL &&
13813 N->getOperand(0).getOpcode() == ISD::SRL) ||
13814 (N->getOpcode() == ISD::SRL &&
13815 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13816 "Expected shift-shift mask");
13817
13818 if (!Subtarget->isThumb1Only())
13819 return true;
13820
13821 EVT VT = N->getValueType(0);
13822 if (VT.getScalarSizeInBits() > 32)
13823 return true;
13824
13825 return false;
13826}
13827
// Returns true only when the subtarget has MVE integer ops, VT is a legal
// type, and SelectOpcode is ISD::VSELECT. BinOpcode, X and Y are not
// consulted.
// NOTE(review): listing line 13828 (return type and function name) is absent
// from this listing; confirm the declaration against upstream.
13829 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13830 SDValue Y) const {
13831 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13832 SelectOpcode == ISD::VSELECT;
13833}
13834
// NOTE(review): the whole declaration (listing line 13835) is absent from
// this listing; the body only reads a value `VT` and the member `Subtarget`.
// Presumably this is a TargetLowering preference hook taking an EVT - confirm
// against upstream.
// Without NEON and without MVE integer ops: true, except on Thumb1-only where
// the scalar size must be at most 32 bits. Otherwise: true only for scalar
// integer types.
13836 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) {
13837 if (Subtarget->isThumb1Only())
13838 return VT.getScalarSizeInBits() <= 32;
13839 return true;
13840 }
13841 return VT.isScalarInteger();
13842}
13843
// Returns whether operation Op on result type VT should be used for a
// conversion from floating-point type FPVT. Requires Op to be legal or custom
// for VT and FPVT to be a simple type; the answer then depends on which
// floating-point features the subtarget has for FPVT.
// NOTE(review): listing line 13844 (return type, function name and the
// parameters declaring `Op` and `FPVT`) is absent from this listing; confirm
// the declaration against upstream.
13845 EVT VT) const {
13846 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13847 return false;
13848
13849 switch (FPVT.getSimpleVT().SimpleTy) {
13850 case MVT::f16:
13851 return Subtarget->hasVFP2Base();
13852 case MVT::f32:
13853 return Subtarget->hasVFP2Base();
13854 case MVT::f64:
13855 return Subtarget->hasFP64();
13856 case MVT::v4f32:
13857 case MVT::v8f16:
13858 return Subtarget->hasMVEFloatOps();
13859 default:
13860 return false;
13861 }
13862}
13863
// Undoes the generic fold (binop (shl x, c2), c1 << c2) back into
// (shl (binop x, c1), c2) for ADD/OR/XOR/AND when every user of N could absorb
// the shift itself and both constants are cheap immediates. Returns the new
// SHL node, or an empty SDValue when the rewrite does not apply.
// NOTE(review): listing lines 13864-13865 (the start of the declaration) are
// absent from this listing. Call sites in this file pass (N, DCI, Subtarget);
// confirm the exact signature against upstream.
13866 const ARMSubtarget *ST) {
13867 // Allow the generic combiner to identify potential bswaps.
13868 if (DCI.isBeforeLegalize())
13869 return SDValue();
13870
13871 // DAG combiner will fold:
13872 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13873 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13874 // Other code patterns that can also be modified have the following form:
13875 // b + ((a << 1) | 510)
13876 // b + ((a << 1) & 510)
13877 // b + ((a << 1) ^ 510)
13878 // b + ((a << 1) + 510)
13879
13880 // Many instructions can perform the shift for free, but it requires both
13881 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13882 // instruction will be needed. So, unfold back to the original pattern if:
13883 // - c1 and c2 are small enough that they don't require mov imms.
13884 // - the user(s) of the node can perform an shl
13885
13886 // No shifted operands for 16-bit instructions.
13887 if (ST->isThumb() && ST->isThumb1Only())
13888 return SDValue();
13889
13890 // Check that all the users could perform the shl themselves.
13891 for (auto *U : N->users()) {
13892 switch(U->getOpcode()) {
13893 default:
13894 return SDValue();
13895 case ISD::SUB:
13896 case ISD::ADD:
13897 case ISD::AND:
13898 case ISD::OR:
13899 case ISD::XOR:
13900 case ISD::SETCC:
13901 case ARMISD::CMP:
13902 // Check that the user isn't already using a constant because there
13903 // aren't any instructions that support an immediate operand and a
13904 // shifted operand.
13905 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13906 isa<ConstantSDNode>(U->getOperand(1)))
13907 return SDValue();
13908
13909 // Check that it's not already using a shift.
13910 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13911 U->getOperand(1).getOpcode() == ISD::SHL)
13912 return SDValue();
13913 break;
13914 }
13915 }
13916
// N itself must be one of the four supported binops with a SHL as operand 0.
13917 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13918 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13919 return SDValue();
13920
13921 if (N->getOperand(0).getOpcode() != ISD::SHL)
13922 return SDValue();
13923
13924 SDValue SHL = N->getOperand(0);
13925
// Both the binop constant (c1 << c2) and the shift amount (c2) must be
// constants.
13926 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13927 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13928 if (!C1ShlC2 || !C2)
13929 return SDValue();
13930
13931 APInt C2Int = C2->getAPIntValue();
13932 APInt C1Int = C1ShlC2->getAPIntValue();
13933 unsigned C2Width = C2Int.getBitWidth();
// Reject shift amounts that are not smaller than the constant's bit width.
13934 if (C2Int.uge(C2Width))
13935 return SDValue();
13936 uint64_t C2Value = C2Int.getZExtValue();
13937
13938 // Check that performing a lshr will not lose any information.
// Mask keeps the top (C2Width - C2Value) bits, so any set bit among the low
// C2Value bits of C1Int fails this test.
13939 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13940 if ((C1Int & Mask) != C1Int)
13941 return SDValue();
13942
13943 // Shift the first constant.
13944 C1Int.lshrInPlace(C2Int);
13945
13946 // The immediates are encoded as an 8-bit value that can be rotated.
// LargeImm: true when the span between the highest and lowest set bits is
// wider than 8 bits.
13947 auto LargeImm = [](const APInt &Imm) {
13948 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13949 return Imm.getBitWidth() - Zeros > 8;
13950 };
13951
13952 if (LargeImm(C1Int) || LargeImm(C2Int))
13953 return SDValue();
13954
13955 SelectionDAG &DAG = DCI.DAG;
13956 SDLoc dl(N);
13957 SDValue X = SHL.getOperand(0);
13958 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13959 DAG.getConstant(C1Int, dl, MVT::i32));
13960 // Shift left to compensate for the lshr of C1Int.
13961 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13962
13963 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13964 SHL.dump(); N->dump());
13965 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13966 return Res;
13967}
13968
13969
13970/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13971///
/// Tries, in order: PerformSHLSimplify, PerformADDVecReduce, then
/// PerformADDCombineWithOperands with the operands in their original order and
/// finally with them swapped. Returns the first non-empty result.
// NOTE(review): listing lines 13972-13973 (the start of the declaration) are
// absent from this listing; confirm the signature against upstream.
13974 const ARMSubtarget *Subtarget) {
13975 SDValue N0 = N->getOperand(0);
13976 SDValue N1 = N->getOperand(1);
13977
13978 // Only works one way, because it needs an immediate operand.
13979 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13980 return Result;
13981
13982 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13983 return Result;
13984
13985 // First try with the default operand order.
13986 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13987 return Result;
13988
13989 // If that didn't work, try again with the operands commuted.
13990 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13991}
13992
13993// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
13994// providing -X is as cheap as X (currently, just a constant).
// Only applies to i32 results and to a CSINC that has a single use.
// NOTE(review): listing line 13995 (the declaration) is absent from this
// listing. The call site in this file passes (N, DCI.DAG); confirm the exact
// signature against upstream.
13996 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
13997 return SDValue();
13998 SDValue CSINC = N->getOperand(1);
13999 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14000 return SDValue();
14001
// NOTE(review): listing line 14002 (the definition of `X`) is absent here;
// presumably it tests whether CSINC's first operand is a constant - confirm.
14003 if (!X)
14004 return SDValue();
14005
// Build csinv with (0 - X) as its first operand; the other three CSINC
// operands are passed through unchanged.
14006 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14007 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14008 CSINC.getOperand(0)),
14009 CSINC.getOperand(1), CSINC.getOperand(2),
14010 CSINC.getOperand(3));
14011}
14012
// Estimates the instruction-count change from negating Op: 0 when negation is
// free, -1 when it removes an instruction, 1 when a new sub is required.
// NOTE(review): listing line 14013 (the declaration) is absent from this
// listing. The caller in this file passes an SDValue and sums the results;
// confirm the exact signature against upstream.
14014 // Free to negate.
// NOTE(review): listing line 14015 (the condition guarding this return) is
// absent here; confirm which operands count as free to negate.
14016 return 0;
14017
14018 // Will save one instruction.
// Op is already (sub 0, y), so negating it just yields y.
14019 if (Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)))
14020 return -1;
14021
14022 // Can freely negate by converting sra <-> srl.
// Applies only to a single-use shift by a constant (bit width - 1).
14023 if (Op.getOpcode() == ISD::SRA || Op.getOpcode() == ISD::SRL) {
14024 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
14025 if (Op.hasOneUse() && ShiftAmt &&
14026 ShiftAmt->getZExtValue() == Op.getValueType().getScalarSizeInBits() - 1)
14027 return 0;
14028 }
14029
14030 // Will have to create sub.
14031 return 1;
14032}
14033
14034// Try to fold
14035//
14036// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
14037//
14038// The folding helps cmov to be matched with csneg without generating
14039// redundant neg instruction.
// Requires N to be (sub 0, cmov) with a single-use CMOV, and is applied only
// when the summed negation cost of both CMOV inputs is not positive.
// NOTE(review): listing line 14040 (the declaration) is absent from this
// listing. The call site in this file passes (N, DCI.DAG); confirm the exact
// signature against upstream.
14041 assert(N->getOpcode() == ISD::SUB);
14042 if (!isNullConstant(N->getOperand(0)))
14043 return SDValue();
14044
14045 SDValue CMov = N->getOperand(1);
14046 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
14047 return SDValue();
14048
14049 SDValue N0 = CMov.getOperand(0);
14050 SDValue N1 = CMov.getOperand(1);
14051
14052 // Only perform the fold if we actually save something.
14053 if (getNegationCost(N0) + getNegationCost(N1) > 0)
14054 return SDValue();
14055
14056 SDLoc DL(N);
14057 EVT VT = CMov.getValueType();
14058
// Negate both inputs and rebuild the CMOV with its remaining two operands
// unchanged.
14059 SDValue N0N = DAG.getNegative(N0, DL, VT);
14060 SDValue N1N = DAG.getNegative(N1, DL, VT);
14061 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
14062 CMov.getOperand(3));
14063}
14064
14065/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14066///
/// Tries the select fold, PerformSubCSINCCombine and performNegCMovCombine in
/// that order; then, for vector types with MVE integer ops, folds a negated
/// VDUP into a VDUP of a negated scalar.
// NOTE(review): listing lines 14067-14068 (the start of the declaration) are
// absent from this listing; confirm the signature against upstream.
14069 const ARMSubtarget *Subtarget) {
14070 SDValue N0 = N->getOperand(0);
14071 SDValue N1 = N->getOperand(1);
14072
14073 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14074 if (N1.getNode()->hasOneUse())
14075 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14076 return Result;
14077
14078 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14079 return R;
14080
14081 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
14082 return Val;
14083
// The remaining fold is MVE-only and applies to vector results only.
14084 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14085 return SDValue();
14086
14087 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14088 // so that we can readily pattern match more mve instructions which can use
14089 // a scalar operand.
14090 SDValue VDup = N->getOperand(1);
14091 if (VDup->getOpcode() != ARMISD::VDUP)
14092 return SDValue();
14093
// Look through one bitcast on the zero-vector operand.
14094 SDValue VMov = N->getOperand(0);
14095 if (VMov->getOpcode() == ISD::BITCAST)
14096 VMov = VMov->getOperand(0);
14097
14098 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14099 return SDValue();
14100
14101 SDLoc dl(N);
14102 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14103 DCI.DAG.getConstant(0, dl, MVT::i32),
14104 VDup->getOperand(0));
14105 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14106}
14107
14108/// PerformVMULCombine
14109/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14110/// special multiplier accumulator forwarding.
14111/// vmul d3, d0, d2
14112/// vmla d3, d1, d2
14113/// is faster than
14114/// vadd d3, d0, d1
14115/// vmul d3, d3, d2
14116// However, for (A + B) * (A + B),
14117// vadd d2, d0, d1
14118// vmul d3, d0, d2
14119// vmla d3, d1, d2
14120// is slower than
14121// vadd d2, d0, d1
14122// vmul d3, d2, d2
// NOTE(review): listing lines 14123-14124 (the start of the declaration) are
// absent from this listing. The call site in this file passes
// (N, DCI, Subtarget); confirm the exact signature against upstream.
14125 const ARMSubtarget *Subtarget) {
// Only worthwhile on subtargets that report VMLx forwarding.
14126 if (!Subtarget->hasVMLxForwarding())
14127 return SDValue();
14128
14129 SelectionDAG &DAG = DCI.DAG;
14130 SDValue N0 = N->getOperand(0);
14131 SDValue N1 = N->getOperand(1);
// Find the add/sub operand (integer or FP); if it is the second operand,
// swap so that N0 is always the add/sub and N1 the other factor.
14132 unsigned Opcode = N0.getOpcode();
14133 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14134 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14135 Opcode = N1.getOpcode();
14136 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14137 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14138 return SDValue();
14139 std::swap(N0, N1);
14140 }
14141
// (A + B) * (A + B): distributing would be slower (see header comment).
14142 if (N0 == N1)
14143 return SDValue();
14144
14145 EVT VT = N->getValueType(0);
14146 SDLoc DL(N);
14147 SDValue N00 = N0->getOperand(0);
14148 SDValue N01 = N0->getOperand(1);
// NOTE(review): both products are built with ISD::MUL even when Opcode is
// FADD/FSUB - confirm this is intended for the floating-point case.
14149 return DAG.getNode(Opcode, DL, VT,
14150 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14151 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14152}
14153
// Recognises a v2i64 multiply whose operands are both sign-extended or both
// zero-extended from 32 bits and replaces it with ARMISD::VMULLs / VMULLu on
// the v4i32 reinterpretation of the unextended inputs. Returns an empty
// SDValue otherwise.
// NOTE(review): listing line 14154 (the start of the declaration) is absent
// from this listing. The call site in this file passes (N, DAG, Subtarget);
// confirm the exact signature against upstream.
14155 const ARMSubtarget *Subtarget) {
14156 EVT VT = N->getValueType(0);
14157 if (VT != MVT::v2i64)
14158 return SDValue();
14159
14160 SDValue N0 = N->getOperand(0);
14161 SDValue N1 = N->getOperand(1);
14162
// IsSignExt: returns the source of a SIGN_EXTEND_INREG from 32-bit elements,
// or an empty SDValue.
14163 auto IsSignExt = [&](SDValue Op) {
14164 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14165 return SDValue();
14166 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14167 if (VT.getScalarSizeInBits() == 32)
14168 return Op->getOperand(0);
14169 return SDValue();
14170 };
// IsZeroExt: returns the value being masked when Op is an AND with the
// (-1, 0, -1, 0) v4i32 mask (optionally behind bitcasts), else empty.
14171 auto IsZeroExt = [&](SDValue Op) {
14172 // Zero extends are a little more awkward. At the point we are matching
14173 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14174 // That might be before or after a bitcast depending on how the and is
14175 // placed. Because this has to look through bitcasts, it is currently only
14176 // supported on LE.
14177 if (!Subtarget->isLittle())
14178 return SDValue();
14179
14180 SDValue And = Op;
14181 if (And->getOpcode() == ISD::BITCAST)
14182 And = And->getOperand(0);
14183 if (And->getOpcode() != ISD::AND)
14184 return SDValue();
14185 SDValue Mask = And->getOperand(1);
14186 if (Mask->getOpcode() == ISD::BITCAST)
14187 Mask = Mask->getOperand(0);
14188
14189 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14190 Mask.getValueType() != MVT::v4i32)
14191 return SDValue();
14192 if (isAllOnesConstant(Mask->getOperand(0)) &&
14193 isNullConstant(Mask->getOperand(1)) &&
14194 isAllOnesConstant(Mask->getOperand(2)) &&
14195 isNullConstant(Mask->getOperand(3)))
14196 return And->getOperand(0);
14197 return SDValue();
14198 };
14199
14200 SDLoc dl(N);
// Both operands sign-extended -> signed long multiply.
14201 if (SDValue Op0 = IsSignExt(N0)) {
14202 if (SDValue Op1 = IsSignExt(N1)) {
14203 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14204 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14205 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14206 }
14207 }
// Both operands zero-extended -> unsigned long multiply.
14208 if (SDValue Op0 = IsZeroExt(N0)) {
14209 if (SDValue Op1 = IsZeroExt(N1)) {
14210 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14211 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14212 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14213 }
14214 }
14215
14216 return SDValue();
14217}
14218
// Combine for ISD::MUL. Dispatches v2i64 (with MVE) to PerformMVEVMULLCombine
// and 64/128-bit vectors to PerformVMULCombine. For an i32 multiply by a
// constant of the form +/-(2^N +/- 1) << S it builds an equivalent shift and
// add/sub sequence. The scalar result is installed via DCI.CombineTo and an
// empty SDValue is returned.
// NOTE(review): listing lines 14219-14220 (the start of the declaration) are
// absent from this listing; confirm the signature against upstream.
14221 const ARMSubtarget *Subtarget) {
14222 SelectionDAG &DAG = DCI.DAG;
14223
14224 EVT VT = N->getValueType(0);
14225 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14226 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14227
14228 if (Subtarget->isThumb1Only())
14229 return SDValue();
14230
14231 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14232 return SDValue();
14233
14234 if (VT.is64BitVector() || VT.is128BitVector())
14235 return PerformVMULCombine(N, DCI, Subtarget);
14236 if (VT != MVT::i32)
14237 return SDValue();
14238
14239 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14240 if (!C)
14241 return SDValue();
14242
// Peel the constant's trailing zero bits off as a final left shift. The
// count is masked to 0..31 so the shift amount stays below 32.
14243 int64_t MulAmt = C->getSExtValue();
14244 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14245
14246 ShiftAmt = ShiftAmt & (32 - 1);
14247 SDValue V = N->getOperand(0);
14248 SDLoc DL(N);
14249
14250 SDValue Res;
14251 MulAmt >>= ShiftAmt;
14252
14253 if (MulAmt >= 0) {
14254 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14255 // (mul x, 2^N + 1) => (add (shl x, N), x)
14256 Res = DAG.getNode(ISD::ADD, DL, VT,
14257 V,
14258 DAG.getNode(ISD::SHL, DL, VT,
14259 V,
14260 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14261 MVT::i32)));
14262 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14263 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14264 Res = DAG.getNode(ISD::SUB, DL, VT,
14265 DAG.getNode(ISD::SHL, DL, VT,
14266 V,
14267 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14268 MVT::i32)),
14269 V);
14270 } else
14271 return SDValue();
14272 } else {
// Negative multiplier: work with its magnitude.
14273 uint64_t MulAmtAbs = -MulAmt;
14274 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14275 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14276 Res = DAG.getNode(ISD::SUB, DL, VT,
14277 V,
14278 DAG.getNode(ISD::SHL, DL, VT,
14279 V,
14280 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14281 MVT::i32)));
14282 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14283 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14284 Res = DAG.getNode(ISD::ADD, DL, VT,
14285 V,
14286 DAG.getNode(ISD::SHL, DL, VT,
14287 V,
14288 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14289 MVT::i32)));
14290 Res = DAG.getNode(ISD::SUB, DL, VT,
14291 DAG.getConstant(0, DL, MVT::i32), Res);
14292 } else
14293 return SDValue();
14294 }
14295
// Re-apply the trailing-zero shift that was peeled off above.
14296 if (ShiftAmt != 0)
14297 Res = DAG.getNode(ISD::SHL, DL, VT,
14298 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14299
14300 // Do not add new nodes to DAG combiner worklist.
14301 DCI.CombineTo(N, Res, false);
14302 return SDValue();
14303}
14304
// Rewrites an i32 (and (shl/srl x, c2), c1) with constant c1 and c2 into a
// pair of shifts, or into (shl (and x, c1 >> c2), c2), so that c1 need not be
// materialized. Runs only after legalization and only when the shift has a
// single use. Returns an empty SDValue when no pattern applies.
// NOTE(review): listing lines 14305-14306 (the start of the declaration) are
// absent from this listing. The call site in this file passes
// (N, DCI, Subtarget) on Thumb1-only subtargets; confirm the exact signature
// against upstream.
14307 const ARMSubtarget *Subtarget) {
14308 // Allow DAGCombine to pattern-match before we touch the canonical form.
14309 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14310 return SDValue();
14311
14312 if (N->getValueType(0) != MVT::i32)
14313 return SDValue();
14314
14315 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14316 if (!N1C)
14317 return SDValue();
14318
14319 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14320 // Don't transform uxtb/uxth.
14321 if (C1 == 255 || C1 == 65535)
14322 return SDValue();
14323
14324 SDNode *N0 = N->getOperand(0).getNode();
14325 if (!N0->hasOneUse())
14326 return SDValue();
14327
14328 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14329 return SDValue();
14330
14331 bool LeftShift = N0->getOpcode() == ISD::SHL;
14332
// NOTE(review): listing line 14333 (the definition of `N01C`) is absent here;
// presumably it is the shift amount of N0 as a ConstantSDNode - confirm.
14334 if (!N01C)
14335 return SDValue();
14336
// Only shift amounts in 1..31 are handled.
14337 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14338 if (!C2 || C2 >= 32)
14339 return SDValue();
14340
14341 // Clear irrelevant bits in the mask.
14342 if (LeftShift)
14343 C1 &= (-1U << C2);
14344 else
14345 C1 &= (-1U >> C2);
14346
14347 SelectionDAG &DAG = DCI.DAG;
14348 SDLoc DL(N);
14349
14350 // We have a pattern of the form "(and (shl x, c2) c1)" or
14351 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14352 // transform to a pair of shifts, to save materializing c1.
14353
14354 // First pattern: right shift, then mask off leading bits.
14355 // FIXME: Use demanded bits?
14356 if (!LeftShift && isMask_32(C1)) {
14357 uint32_t C3 = llvm::countl_zero(C1);
14358 if (C2 < C3) {
14359 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14360 DAG.getConstant(C3 - C2, DL, MVT::i32));
14361 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14362 DAG.getConstant(C3, DL, MVT::i32));
14363 }
14364 }
14365
14366 // First pattern, reversed: left shift, then mask off trailing bits.
// NOTE(review): if the masked C1 is 0 here, C3 is 32 and the shifts below
// would be by 32; presumably an AND with such a mask is folded away before
// reaching this point - confirm.
14367 if (LeftShift && isMask_32(~C1)) {
14368 uint32_t C3 = llvm::countr_zero(C1);
14369 if (C2 < C3) {
14370 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14371 DAG.getConstant(C3 - C2, DL, MVT::i32));
14372 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14373 DAG.getConstant(C3, DL, MVT::i32));
14374 }
14375 }
14376
14377 // Second pattern: left shift, then mask off leading bits.
14378 // FIXME: Use demanded bits?
14379 if (LeftShift && isShiftedMask_32(C1)) {
14380 uint32_t Trailing = llvm::countr_zero(C1);
14381 uint32_t C3 = llvm::countl_zero(C1);
14382 if (Trailing == C2 && C2 + C3 < 32) {
14383 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14384 DAG.getConstant(C2 + C3, DL, MVT::i32));
14385 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14386 DAG.getConstant(C3, DL, MVT::i32));
14387 }
14388 }
14389
14390 // Second pattern, reversed: right shift, then mask off trailing bits.
14391 // FIXME: Handle other patterns of known/demanded bits.
14392 if (!LeftShift && isShiftedMask_32(C1)) {
14393 uint32_t Leading = llvm::countl_zero(C1);
14394 uint32_t C3 = llvm::countr_zero(C1);
14395 if (Leading == C2 && C2 + C3 < 32) {
14396 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14397 DAG.getConstant(C2 + C3, DL, MVT::i32));
14398 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14399 DAG.getConstant(C3, DL, MVT::i32));
14400 }
14401 }
14402
14403 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14404 // if "c1 >> c2" is a cheaper immediate than "c1"
14405 if (LeftShift &&
14406 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14407
14408 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14409 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14410 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14411 DAG.getConstant(C2, DL, MVT::i32));
14412 }
14413
14414 return SDValue();
14415}
14416
// Combine for ISD::AND. First tries to turn an AND with a constant splat into
// an immediate-form VBIC (NEON or MVE). Off Thumb1 it then tries the
// select fold and PerformSHLSimplify; on Thumb1-only it tries CombineANDShift.
// Returns an empty SDValue when nothing applies.
// NOTE(review): listing lines 14417-14418 (the start of the declaration) are
// absent from this listing; confirm the signature against upstream.
14419 const ARMSubtarget *Subtarget) {
14420 // Attempt to use immediate-form VBIC
14421 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14422 SDLoc dl(N);
14423 EVT VT = N->getValueType(0);
14424 SelectionDAG &DAG = DCI.DAG;
14425
// Bail out for illegal types and for the i1-element vector types listed.
14426 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14427 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14428 return SDValue();
14429
14430 APInt SplatBits, SplatUndef;
14431 unsigned SplatBitSize;
14432 bool HasAnyUndefs;
14433 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14434 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14435 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14436 SplatBitSize == 64) {
14437 EVT VbicVT;
// VBIC clears the bits set in its immediate, so encode the complement of
// the AND mask.
14438 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14439 SplatUndef.getZExtValue(), SplatBitSize,
14440 DAG, dl, VbicVT, VT, OtherModImm);
14441 if (Val.getNode()) {
// Cast the input to the type chosen for the immediate, apply VBICIMM, and
// cast the result back to VT.
14442 SDValue Input =
14443 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14444 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14445 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14446 }
14447 }
14448 }
14449
14450 if (!Subtarget->isThumb1Only()) {
14451 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14452 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14453 return Result;
14454
14455 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14456 return Result;
14457 }
14458
14459 if (Subtarget->isThumb1Only())
14460 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14461 return Result;
14462
14463 return SDValue();
14464}
14465
14466// Try combining OR nodes to SMULWB, SMULWT.
// Matches (or (srl lo, 16), (shl hi, 16)) where lo/hi are the two results of
// one SMUL_LOHI with a 16-bit signed operand. On success all uses of OR are
// replaced with the new node and SDValue(OR, 0) is returned.
// NOTE(review): listing lines 14467-14468 (the start of the declaration) are
// absent from this listing; the body uses `OR`, `DCI` and `Subtarget` -
// confirm the signature against upstream.
14469 const ARMSubtarget *Subtarget) {
// Needs v6 ops, and in Thumb mode also Thumb2 and DSP.
14470 if (!Subtarget->hasV6Ops() ||
14471 (Subtarget->isThumb() &&
14472 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14473 return SDValue();
14474
14475 SDValue SRL = OR->getOperand(0);
14476 SDValue SHL = OR->getOperand(1);
14477
// The OR is commutative: retry with the operands swapped.
14478 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14479 SRL = OR->getOperand(1);
14480 SHL = OR->getOperand(0);
14481 }
14482 if (!isSRL16(SRL) || !isSHL16(SHL))
14483 return SDValue();
14484
14485 // The first operands to the shifts need to be the two results from the
14486 // same smul_lohi node.
14487 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14488 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14489 return SDValue();
14490
// The SRL must consume result 0 and the SHL result 1 of that node.
14491 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14492 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14493 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14494 return SDValue();
14495
14496 // Now we have:
14497 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14498 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
14499 // For SMULWB the 16-bit value will be sign extended somehow.
14500 // For SMULWT only the SRA is required.
14501 // Check both sides of SMUL_LOHI
14502 SDValue OpS16 = SMULLOHI->getOperand(0);
14503 SDValue OpS32 = SMULLOHI->getOperand(1);
14504
14505 SelectionDAG &DAG = DCI.DAG;
// If operand 0 is not the 16-bit side, assume it is operand 1 and swap.
14506 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14507 OpS16 = OpS32;
14508 OpS32 = SMULLOHI->getOperand(0);
14509 }
14510
14511 SDLoc dl(OR);
14512 unsigned Opcode = 0;
14513 if (isS16(OpS16, DAG))
14514 Opcode = ARMISD::SMULWB;
14515 else if (isSRA16(OpS16)) {
14516 Opcode = ARMISD::SMULWT;
// Strip the SRA so the new node reads the unshifted value.
14517 OpS16 = OpS16->getOperand(0);
14518 }
14519 else
14520 return SDValue();
14521
14522 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14523 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14524 return SDValue(OR, 0);
14525}
14526
14529 const ARMSubtarget *Subtarget) {
14530 // BFI is only available on V6T2+
14531 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14532 return SDValue();
14533
14534 EVT VT = N->getValueType(0);
14535 SDValue N0 = N->getOperand(0);
14536 SDValue N1 = N->getOperand(1);
14537 SelectionDAG &DAG = DCI.DAG;
14538 SDLoc DL(N);
14539 // 1) or (and A, mask), val => ARMbfi A, val, mask
14540 // iff (val & mask) == val
14541 //
14542 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14543 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14544 // && mask == ~mask2
14545 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14546 // && ~mask == mask2
14547 // (i.e., copy a bitfield value into another bitfield of the same width)
14548
14549 if (VT != MVT::i32)
14550 return SDValue();
14551
14552 SDValue N00 = N0.getOperand(0);
14553
14554 // The value and the mask need to be constants so we can verify this is
14555 // actually a bitfield set. If the mask is 0xffff, we can do better
14556 // via a movt instruction, so don't use BFI in that case.
14557 SDValue MaskOp = N0.getOperand(1);
14559 if (!MaskC)
14560 return SDValue();
14561 unsigned Mask = MaskC->getZExtValue();
14562 if (Mask == 0xffff)
14563 return SDValue();
14564 SDValue Res;
14565 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14567 if (N1C) {
14568 unsigned Val = N1C->getZExtValue();
14569 if ((Val & ~Mask) != Val)
14570 return SDValue();
14571
14572 if (ARM::isBitFieldInvertedMask(Mask)) {
14573 Val >>= llvm::countr_zero(~Mask);
14574
14575 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14576 DAG.getConstant(Val, DL, MVT::i32),
14577 DAG.getConstant(Mask, DL, MVT::i32));
14578
14579 DCI.CombineTo(N, Res, false);
14580 // Return value from the original node to inform the combiner than N is
14581 // now dead.
14582 return SDValue(N, 0);
14583 }
14584 } else if (N1.getOpcode() == ISD::AND) {
14585 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14587 if (!N11C)
14588 return SDValue();
14589 unsigned Mask2 = N11C->getZExtValue();
14590
14591 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14592 // as is to match.
14593 if (ARM::isBitFieldInvertedMask(Mask) &&
14594 (Mask == ~Mask2)) {
14595 // The pack halfword instruction works better for masks that fit it,
14596 // so use that when it's available.
14597 if (Subtarget->hasDSP() &&
14598 (Mask == 0xffff || Mask == 0xffff0000))
14599 return SDValue();
14600 // 2a
14601 unsigned amt = llvm::countr_zero(Mask2);
14602 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14603 DAG.getConstant(amt, DL, MVT::i32));
14604 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14605 DAG.getConstant(Mask, DL, MVT::i32));
14606 DCI.CombineTo(N, Res, false);
14607 // Return value from the original node to inform the combiner than N is
14608 // now dead.
14609 return SDValue(N, 0);
14610 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14611 (~Mask == Mask2)) {
14612 // The pack halfword instruction works better for masks that fit it,
14613 // so use that when it's available.
14614 if (Subtarget->hasDSP() &&
14615 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14616 return SDValue();
14617 // 2b
14618 unsigned lsb = llvm::countr_zero(Mask);
14619 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14620 DAG.getConstant(lsb, DL, MVT::i32));
14621 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14622 DAG.getConstant(Mask2, DL, MVT::i32));
14623 DCI.CombineTo(N, Res, false);
14624 // Return value from the original node to inform the combiner than N is
14625 // now dead.
14626 return SDValue(N, 0);
14627 }
14628 }
14629
14630 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14631 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14633 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14634 // where lsb(mask) == #shamt and masked bits of B are known zero.
14635 SDValue ShAmt = N00.getOperand(1);
14636 unsigned ShAmtC = ShAmt->getAsZExtVal();
14637 unsigned LSB = llvm::countr_zero(Mask);
14638 if (ShAmtC != LSB)
14639 return SDValue();
14640
14641 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14642 DAG.getConstant(~Mask, DL, MVT::i32));
14643
14644 DCI.CombineTo(N, Res, false);
14645 // Return value from the original node to inform the combiner than N is
14646 // now dead.
14647 return SDValue(N, 0);
14648 }
14649
14650 return SDValue();
14651}
14652
14653static bool isValidMVECond(unsigned CC, bool IsFloat) {
14654 switch (CC) {
14655 case ARMCC::EQ:
14656 case ARMCC::NE:
14657 case ARMCC::LE:
14658 case ARMCC::GT:
14659 case ARMCC::GE:
14660 case ARMCC::LT:
14661 return true;
14662 case ARMCC::HS:
14663 case ARMCC::HI:
14664 return !IsFloat;
14665 default:
14666 return false;
14667 };
14668}
14669
14671 if (N->getOpcode() == ARMISD::VCMP)
14672 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14673 else if (N->getOpcode() == ARMISD::VCMPZ)
14674 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14675 else
14676 llvm_unreachable("Not a VCMP/VCMPZ!");
14677}
14678
14681 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14682}
14683
14685 const ARMSubtarget *Subtarget) {
14686 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14687 // together with predicates
14688 EVT VT = N->getValueType(0);
14689 SDLoc DL(N);
14690 SDValue N0 = N->getOperand(0);
14691 SDValue N1 = N->getOperand(1);
14692
14693 auto IsFreelyInvertable = [&](SDValue V) {
14694 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14695 return CanInvertMVEVCMP(V);
14696 return false;
14697 };
14698
14699 // At least one operand must be freely invertable.
14700 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14701 return SDValue();
14702
14703 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14704 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14705 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14706 return DAG.getLogicalNOT(DL, And, VT);
14707}
14708
14709// Try to form a NEON shift-{right, left}-and-insert (VSRI/VSLI) from:
14710// (or (and X, splat (i32 C1)), (srl Y, splat (i32 C2))) -> VSRI X, Y, #C2
14711// (or (and X, splat (i32 C1)), (shl Y, splat (i32 C2))) -> VSLI X, Y, #C2
14712 // where C1 is a mask that preserves the bits not written by the shift/insert:
14713 // the high C2 bits for VSRI, the low C2 bits (`(1 << C2) - 1`) for VSLI.
14715 SDValue ShiftOp, EVT VT,
14716 SDLoc dl) {
14717 // Match (and X, Mask)
14718 if (AndOp.getOpcode() != ISD::AND)
14719 return SDValue();
14720
14721 SDValue X = AndOp.getOperand(0);
14722 SDValue Mask = AndOp.getOperand(1);
14723
14724 ConstantSDNode *MaskC = isConstOrConstSplat(Mask, false, true);
14725 if (!MaskC)
14726 return SDValue();
14727 APInt MaskBits =
14728 MaskC->getAPIntValue().trunc(Mask.getScalarValueSizeInBits());
14729
14730 // Match shift (srl/shl Y, CntVec)
14731 int64_t Cnt = 0;
14732 bool IsShiftRight = false;
14733 SDValue Y;
14734
14735 if (ShiftOp.getOpcode() == ARMISD::VSHRuIMM) {
14736 IsShiftRight = true;
14737 Y = ShiftOp.getOperand(0);
14738 Cnt = ShiftOp.getConstantOperandVal(1);
14739 } else if (ShiftOp.getOpcode() == ARMISD::VSHLIMM) {
14740 Y = ShiftOp.getOperand(0);
14741 Cnt = ShiftOp.getConstantOperandVal(1);
14742 } else {
14743 return SDValue();
14744 }
14745
14746 unsigned ElemBits = VT.getScalarSizeInBits();
14747 APInt RequiredMask = IsShiftRight
14748 ? APInt::getHighBitsSet(ElemBits, (unsigned)Cnt)
14749 : APInt::getLowBitsSet(ElemBits, (unsigned)Cnt);
14750 if (MaskBits != RequiredMask)
14751 return SDValue();
14752
14753 unsigned Opc = IsShiftRight ? ARMISD::VSRIIMM : ARMISD::VSLIIMM;
14754 return DAG.getNode(Opc, dl, VT, X, Y, DAG.getConstant(Cnt, dl, MVT::i32));
14755}
14756
14757/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14759 const ARMSubtarget *Subtarget) {
14760 // Attempt to use immediate-form VORR
14761 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14762 SDLoc dl(N);
14763 EVT VT = N->getValueType(0);
14764 SelectionDAG &DAG = DCI.DAG;
14765
14766 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14767 return SDValue();
14768
14769 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14770 VT == MVT::v8i1 || VT == MVT::v16i1))
14771 return PerformORCombine_i1(N, DAG, Subtarget);
14772
14773 APInt SplatBits, SplatUndef;
14774 unsigned SplatBitSize;
14775 bool HasAnyUndefs;
14776 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14777 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14778 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14779 SplatBitSize == 64) {
14780 EVT VorrVT;
14781 SDValue Val =
14782 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14783 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14784 if (Val.getNode()) {
14785 SDValue Input =
14786 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14787 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14788 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14789 }
14790 }
14791 }
14792
14793 if (!Subtarget->isThumb1Only()) {
14794 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14795 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14796 return Result;
14797 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14798 return Result;
14799 }
14800
14801 SDValue N0 = N->getOperand(0);
14802 SDValue N1 = N->getOperand(1);
14803
14804 // (or (and X, C1), (srl Y, C2)) -> VSRI X, Y, #C2
14805 // (or (and X, C1), (shl Y, C2)) -> VSLI X, Y, #C2
14806 if (VT.isVector() &&
14807 ((Subtarget->hasNEON() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) ||
14808 (Subtarget->hasMVEIntegerOps() &&
14809 (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32)))) {
14810 if (SDValue ShiftInsert =
14811 PerformORCombineToShiftInsert(DAG, N0, N1, VT, dl))
14812 return ShiftInsert;
14813
14814 if (SDValue ShiftInsert =
14815 PerformORCombineToShiftInsert(DAG, N1, N0, VT, dl))
14816 return ShiftInsert;
14817 }
14818
14819 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14820 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14822
14823 // The code below optimizes (or (and X, Y), Z).
14824 // The AND operand needs to have a single user to make these optimizations
14825 // profitable.
14826 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14827 return SDValue();
14828
14829 APInt SplatUndef;
14830 unsigned SplatBitSize;
14831 bool HasAnyUndefs;
14832
14833 APInt SplatBits0, SplatBits1;
14836 // Ensure that the second operand of both ands are constants
14837 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14838 HasAnyUndefs) && !HasAnyUndefs) {
14839 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14840 HasAnyUndefs) && !HasAnyUndefs) {
14841 // Ensure that the bit width of the constants are the same and that
14842 // the splat arguments are logical inverses as per the pattern we
14843 // are trying to simplify.
14844 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14845 SplatBits0 == ~SplatBits1) {
14846 // Canonicalize the vector type to make instruction selection
14847 // simpler.
14848 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14849 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14850 N0->getOperand(1),
14851 N0->getOperand(0),
14852 N1->getOperand(0));
14853 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14854 }
14855 }
14856 }
14857 }
14858
14859 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14860 // reasonable.
14861 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14862 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14863 return Res;
14864 }
14865
14866 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14867 return Result;
14868
14869 // (or x, (csinc 0, 0, cc)) -> (csinc x, 0, cc)
14870 // providing that the x is 0 or 1.
14871 SDValue CSINC = N1;
14872 SDValue Other = N0;
14873 if (CSINC.getOpcode() != ARMISD::CSINC)
14874 std::swap(CSINC, Other);
14875 if (CSINC.getOpcode() == ARMISD::CSINC &&
14876 isNullConstant(CSINC.getOperand(0)) &&
14877 isNullConstant(CSINC.getOperand(1)) &&
14879 return DAG.getNode(ARMISD::CSINC, dl, VT, Other, CSINC.getOperand(1),
14880 CSINC.getOperand(2), CSINC.getOperand(3));
14881
14882 return SDValue();
14883}
14884
14887 const ARMSubtarget *Subtarget) {
14888 EVT VT = N->getValueType(0);
14889 SelectionDAG &DAG = DCI.DAG;
14890
14891 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14892 return SDValue();
14893
14894 if (!Subtarget->isThumb1Only()) {
14895 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14896 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14897 return Result;
14898
14899 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14900 return Result;
14901 }
14902
14903 if (Subtarget->hasMVEIntegerOps()) {
14904 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14905 SDValue N0 = N->getOperand(0);
14906 SDValue N1 = N->getOperand(1);
14907 const TargetLowering *TLI = Subtarget->getTargetLowering();
14908 if (TLI->isConstTrueVal(N1) &&
14909 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14910 if (CanInvertMVEVCMP(N0)) {
14911 SDLoc DL(N0);
14913
14915 Ops.push_back(N0->getOperand(0));
14916 if (N0->getOpcode() == ARMISD::VCMP)
14917 Ops.push_back(N0->getOperand(1));
14918 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14919 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14920 }
14921 }
14922 }
14923
14924 return SDValue();
14925}
14926
14927// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14928// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14929// their position in "to" (Rd).
14930static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14931 assert(N->getOpcode() == ARMISD::BFI);
14932
14933 SDValue From = N->getOperand(1);
14934 ToMask = ~N->getConstantOperandAPInt(2);
14935 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14936
14937 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14938 // #C in the base of the SHR.
14939 if (From->getOpcode() == ISD::SRL &&
14940 isa<ConstantSDNode>(From->getOperand(1))) {
14941 APInt Shift = From->getConstantOperandAPInt(1);
14942 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14943 FromMask <<= Shift.getLimitedValue(31);
14944 From = From->getOperand(0);
14945 }
14946
14947 return From;
14948}
14949
14950// If A and B contain one contiguous set of bits, does A | B == A . B?
14951//
14952// Neither A nor B must be zero.
14953static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14954 unsigned LastActiveBitInA = A.countr_zero();
14955 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14956 return LastActiveBitInA - 1 == FirstActiveBitInB;
14957}
14958
14960 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14961 APInt ToMask, FromMask;
14962 SDValue From = ParseBFI(N, ToMask, FromMask);
14963 SDValue To = N->getOperand(0);
14964
14965 SDValue V = To;
14966 if (V.getOpcode() != ARMISD::BFI)
14967 return SDValue();
14968
14969 APInt NewToMask, NewFromMask;
14970 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14971 if (NewFrom != From)
14972 return SDValue();
14973
14974 // Do the written bits conflict with any we've seen so far?
14975 if ((NewToMask & ToMask).getBoolValue())
14976 // Conflicting bits.
14977 return SDValue();
14978
14979 // Are the new bits contiguous when combined with the old bits?
14980 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14981 BitsProperlyConcatenate(FromMask, NewFromMask))
14982 return V;
14983 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14984 BitsProperlyConcatenate(NewFromMask, FromMask))
14985 return V;
14986
14987 return SDValue();
14988}
14989
14991 SDValue N0 = N->getOperand(0);
14992 SDValue N1 = N->getOperand(1);
14993
14994 if (N1.getOpcode() == ISD::AND) {
14995 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14996 // the bits being cleared by the AND are not demanded by the BFI.
14998 if (!N11C)
14999 return SDValue();
15000 unsigned InvMask = N->getConstantOperandVal(2);
15001 unsigned LSB = llvm::countr_zero(~InvMask);
15002 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
15003 assert(Width <
15004 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
15005 "undefined behavior");
15006 unsigned Mask = (1u << Width) - 1;
15007 unsigned Mask2 = N11C->getZExtValue();
15008 if ((Mask & (~Mask2)) == 0)
15009 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
15010 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
15011 return SDValue();
15012 }
15013
15014 // Look for another BFI to combine with.
15015 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
15016 // We've found a BFI.
15017 APInt ToMask1, FromMask1;
15018 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
15019
15020 APInt ToMask2, FromMask2;
15021 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
15022 assert(From1 == From2);
15023 (void)From2;
15024
15025 // Create a new BFI, combining the two together.
15026 APInt NewFromMask = FromMask1 | FromMask2;
15027 APInt NewToMask = ToMask1 | ToMask2;
15028
15029 EVT VT = N->getValueType(0);
15030 SDLoc dl(N);
15031
15032 if (NewFromMask[0] == 0)
15033 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
15034 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
15035 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
15036 DAG.getConstant(~NewToMask, dl, VT));
15037 }
15038
15039 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
15040 // that lower bit insertions are performed first, providing that M1 and M2
15041 // do not overlap. This can allow multiple BFI instructions to be combined
15042 // together by the other folds above.
15043 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
15044 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
15045 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
15046
15047 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
15048 ToMask1.countl_zero() < ToMask2.countl_zero())
15049 return SDValue();
15050
15051 EVT VT = N->getValueType(0);
15052 SDLoc dl(N);
15053 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
15054 N->getOperand(1), N->getOperand(2));
15055 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
15056 N0.getOperand(2));
15057 }
15058
15059 return SDValue();
15060}
15061
15062// Check that N is CMPZ(CSINC(0, 0, CC, X)),
15063// or CMPZ(CMOV(1, 0, CC, X))
15064// return X if valid.
15066 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
15067 return SDValue();
15068 SDValue CSInc = Cmp->getOperand(0);
15069
15070 // Ignore any `And 1` nodes that may not yet have been removed. We are
15071 // looking for a value that produces 1/0, so these have no effect on the
15072 // code.
15073 while (CSInc.getOpcode() == ISD::AND &&
15074 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
15075 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
15076 CSInc = CSInc.getOperand(0);
15077
15078 if (CSInc.getOpcode() == ARMISD::CSINC &&
15079 isNullConstant(CSInc.getOperand(0)) &&
15080 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15082 return CSInc.getOperand(3);
15083 }
15084 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15085 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15087 return CSInc.getOperand(3);
15088 }
15089 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15090 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15093 return CSInc.getOperand(3);
15094 }
15095 return SDValue();
15096}
15097
15099 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15100 // t92: flags = ARMISD::CMPZ t74, 0
15101 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15102 // t96: flags = ARMISD::CMPZ t93, 0
15103 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15105 if (SDValue C = IsCMPZCSINC(N, Cond))
15106 if (Cond == ARMCC::EQ)
15107 return C;
15108 return SDValue();
15109}
15110
15112 // Fold away an unnecessary CMPZ/CSINC
15113 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15114 // if C1==EQ -> CSXYZ A, B, C2, D
15115 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15117 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15118 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15119 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15120 N->getOperand(1),
15121 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15122 if (N->getConstantOperandVal(2) == ARMCC::NE)
15123 return DAG.getNode(
15124 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15125 N->getOperand(1),
15127 }
15128 return SDValue();
15129}
15130
15131/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15132/// ARMISD::VMOVRRD.
15135 const ARMSubtarget *Subtarget) {
15136 // vmovrrd(vmovdrr x, y) -> x,y
15137 SDValue InDouble = N->getOperand(0);
15138 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15139 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15140
15141 // vmovrrd(load f64) -> (load i32), (load i32)
15142 SDNode *InNode = InDouble.getNode();
15143 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15144 InNode->getValueType(0) == MVT::f64 &&
15145 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15146 !cast<LoadSDNode>(InNode)->isVolatile()) {
15147 // TODO: Should this be done for non-FrameIndex operands?
15148 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15149
15150 SelectionDAG &DAG = DCI.DAG;
15151 SDLoc DL(LD);
15152 SDValue BasePtr = LD->getBasePtr();
15153 SDValue NewLD1 =
15154 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15155 LD->getAlign(), LD->getMemOperand()->getFlags());
15156
15157 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15158 DAG.getConstant(4, DL, MVT::i32));
15159
15160 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15161 LD->getPointerInfo().getWithOffset(4),
15162 commonAlignment(LD->getAlign(), 4),
15163 LD->getMemOperand()->getFlags());
15164
15165 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15166 if (DCI.DAG.getDataLayout().isBigEndian())
15167 std::swap (NewLD1, NewLD2);
15168 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15169 return Result;
15170 }
15171
15172 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15173 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15174 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15175 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15176 SDValue BV = InDouble.getOperand(0);
15177 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15178 // change lane order under big endian.
15179 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15180 while (
15181 (BV.getOpcode() == ISD::BITCAST ||
15182 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15183 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15184 BVSwap = BV.getOpcode() == ISD::BITCAST;
15185 BV = BV.getOperand(0);
15186 }
15187 if (BV.getValueType() != MVT::v4i32)
15188 return SDValue();
15189
15190 // Handle buildvectors, pulling out the correct lane depending on
15191 // endianness.
15192 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15193 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15194 SDValue Op0 = BV.getOperand(Offset);
15195 SDValue Op1 = BV.getOperand(Offset + 1);
15196 if (!Subtarget->isLittle() && BVSwap)
15197 std::swap(Op0, Op1);
15198
15199 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15200 }
15201
15202 // A chain of insert_vectors, grabbing the correct value of the chain of
15203 // inserts.
15204 SDValue Op0, Op1;
15205 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15206 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15207 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15208 Op0 = BV.getOperand(1);
15209 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15210 Op1 = BV.getOperand(1);
15211 }
15212 BV = BV.getOperand(0);
15213 }
15214 if (!Subtarget->isLittle() && BVSwap)
15215 std::swap(Op0, Op1);
15216 if (Op0 && Op1)
15217 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15218 }
15219
15220 return SDValue();
15221}
15222
15223/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15224/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15226 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15227 SDValue Op0 = N->getOperand(0);
15228 SDValue Op1 = N->getOperand(1);
15229 if (Op0.getOpcode() == ISD::BITCAST)
15230 Op0 = Op0.getOperand(0);
15231 if (Op1.getOpcode() == ISD::BITCAST)
15232 Op1 = Op1.getOperand(0);
15233 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15234 Op0.getNode() == Op1.getNode() &&
15235 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15236 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15237 N->getValueType(0), Op0.getOperand(0));
15238 return SDValue();
15239}
15240
15243 SDValue Op0 = N->getOperand(0);
15244
15245 // VMOVhr (VMOVrh (X)) -> X
15246 if (Op0->getOpcode() == ARMISD::VMOVrh)
15247 return Op0->getOperand(0);
15248
15249 // FullFP16: half values are passed in S-registers, and we don't
15250 // need any of the bitcast and moves:
15251 //
15252 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15253 // t5: i32 = bitcast t2
15254 // t18: f16 = ARMISD::VMOVhr t5
15255 // =>
15256 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15257 if (Op0->getOpcode() == ISD::BITCAST) {
15258 SDValue Copy = Op0->getOperand(0);
15259 if (Copy.getValueType() == MVT::f32 &&
15260 Copy->getOpcode() == ISD::CopyFromReg) {
15261 bool HasGlue = Copy->getNumOperands() == 3;
15262 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15263 HasGlue ? Copy->getOperand(2) : SDValue()};
15264 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15265 SDValue NewCopy =
15267 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15268 ArrayRef(Ops, HasGlue ? 3 : 2));
15269
15270 // Update Users, Chains, and Potential Glue.
15271 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15272 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15273 if (HasGlue)
15274 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15275 NewCopy.getValue(2));
15276
15277 return NewCopy;
15278 }
15279 }
15280
15281 // fold (VMOVhr (load x)) -> (load (f16*)x)
15282 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15283 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15284 LN0->getMemoryVT() == MVT::i16) {
15285 SDValue Load =
15286 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15287 LN0->getBasePtr(), LN0->getMemOperand());
15288 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15289 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15290 return Load;
15291 }
15292 }
15293
15294 // Only the bottom 16 bits of the source register are used.
15295 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15296 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15297 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15298 return SDValue(N, 0);
15299
15300 return SDValue();
15301}
15302
15304 SDValue N0 = N->getOperand(0);
15305 EVT VT = N->getValueType(0);
15306
15307 // fold (VMOVrh (fpconst x)) -> const x
15309 APFloat V = C->getValueAPF();
15310 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15311 }
15312
15313 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15314 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15315 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15316
15317 SDValue Load =
15318 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15319 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15320 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15321 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15322 return Load;
15323 }
15324
15325 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15326 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15328 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15329 N0->getOperand(1));
15330
15331 return SDValue();
15332}
15333
15334/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15335/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15336/// i64 vector to have f64 elements, since the value can then be loaded
15337/// directly into a VFP register.
15339 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15340 for (unsigned i = 0; i < NumElts; ++i) {
15341 SDNode *Elt = N->getOperand(i).getNode();
15342 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15343 return true;
15344 }
15345 return false;
15346}
15347
15348/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15349/// ISD::BUILD_VECTOR.
15352 const ARMSubtarget *Subtarget) {
15353 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15354 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15355 // into a pair of GPRs, which is fine when the value is used as a scalar,
15356 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15357 SelectionDAG &DAG = DCI.DAG;
15358 if (N->getNumOperands() == 2)
15359 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15360 return RV;
15361
15362 // Load i64 elements as f64 values so that type legalization does not split
15363 // them up into i32 values.
15364 EVT VT = N->getValueType(0);
15365 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15366 return SDValue();
15367 SDLoc dl(N);
15369 unsigned NumElts = VT.getVectorNumElements();
15370 for (unsigned i = 0; i < NumElts; ++i) {
15371 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15372 Ops.push_back(V);
15373 // Make the DAGCombiner fold the bitcast.
15374 DCI.AddToWorklist(V.getNode());
15375 }
15376 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15377 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15378 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15379}
15380
15381 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
///
/// When a build vector of 32-bit float elements has a single user that is a
/// bitcast to a non-floating-point type, and more than half of its relevant
/// operands are bitcasts from i32, the node is rebuilt as a chain of
/// INSERT_VECTOR_ELT nodes on an i32 vector and bitcast back to the original
/// type. Returns an empty SDValue when the node is left unchanged.
15382 static SDValue
15384 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15385 // At that time, we may have inserted bitcasts from integer to float.
15386 // If these bitcasts have survived DAGCombine, change the lowering of this
15387 // BUILD_VECTOR into something more vector friendly, i.e., that does not
15388 // force the use of floating point types.
15389
15390 // Make sure we can change the type of the vector.
15391 // This is possible iff:
15392 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15393 // 1.1. Vector is used only once.
15394 // 1.2. Use is a bit convert to an integer type.
15395 // 2. The size of its operands is 32 bits (64 bits are not legal).
15396 EVT VT = N->getValueType(0);
15397 EVT EltVT = VT.getVectorElementType();
15398
15399 // Check 1.1. and 2.
15400 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15401 return SDValue();
15402
15403 // By construction, the input type must be float.
15404 assert(EltVT == MVT::f32 && "Unexpected type!");
15405
15406 // Check 1.2.
// hasOneUse() was verified above, so the first user is the only user.
15407 SDNode *Use = *N->user_begin();
15408 if (Use->getOpcode() != ISD::BITCAST ||
15409 Use->getValueType(0).isFloatingPoint())
15410 return SDValue();
15411
15412 // Check profitability.
15413 // Model is, if more than half of the relevant operands are bitcast from
15414 // i32, turn the build_vector into a sequence of insert_vector_elt.
15415 // Relevant operands are everything that is not statically
15416 // (i.e., at compile time) bitcasted.
15417 unsigned NumOfBitCastedElts = 0;
15418 unsigned NumElts = VT.getVectorNumElements();
15419 unsigned NumOfRelevantElts = NumElts;
15420 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15421 SDValue Elt = N->getOperand(Idx);
15422 if (Elt->getOpcode() == ISD::BITCAST) {
15423 // Assume only bit cast to i32 will go away.
15424 if (Elt->getOperand(0).getValueType() == MVT::i32)
15425 ++NumOfBitCastedElts;
15426 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15427 // Constants are statically casted, thus do not count them as
15428 // relevant operands.
15429 --NumOfRelevantElts;
15430 }
15431
15432 // Check if more than half of the elements require a non-free bitcast.
// Exactly half is not enough: the comparison below is "<=".
15433 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15434 return SDValue();
15435
15436 SelectionDAG &DAG = DCI.DAG;
15437 // Create the new vector type.
15438 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15439 // Check if the type is legal.
15440 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15441 if (!TLI.isTypeLegal(VecVT))
15442 return SDValue();
15443
15444 // Combine:
15445 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15446 // => BITCAST INSERT_VECTOR_ELT
15447 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15448 // (BITCAST EN), N.
15449 SDValue Vec = DAG.getUNDEF(VecVT);
15450 SDLoc dl(N);
15451 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15452 SDValue V = N->getOperand(Idx);
// Vec starts out undef, so undef lanes need no insert.
15453 if (V.isUndef())
15454 continue;
15455 if (V.getOpcode() == ISD::BITCAST &&
15456 V->getOperand(0).getValueType() == MVT::i32)
15457 // Fold obvious case.
15458 V = V.getOperand(0);
15459 else {
15460 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15461 // Make the DAGCombiner fold the bitcasts.
15462 DCI.AddToWorklist(V.getNode());
15463 }
15464 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15465 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15466 }
// Cast back to the original (float) vector type so the result type matches N.
15467 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15468 // Make the DAGCombiner fold the bitcasts.
15469 DCI.AddToWorklist(Vec.getNode());
15470 return Vec;
15471}
15472
/// Simplifies a predicate cast. The patterns below assume N is an
/// ARMISD::PREDICATE_CAST (TODO confirm against the caller):
///  - nested casts collapse to a single cast, or to the inner source when its
///    type already equals the result type;
///  - a bitwise-not of an i32 source is hoisted out as an XOR of predicates;
///  - an i32 source is simplified knowing only its low 16 bits are demanded.
/// Returns an empty SDValue when none of these apply.
15473 static SDValue
15475 EVT VT = N->getValueType(0);
15476 SDValue Op = N->getOperand(0);
15477 SDLoc dl(N);
15478
15479 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15480 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15481 // If the valuetypes are the same, we can remove the cast entirely.
15482 if (Op->getOperand(0).getValueType() == VT)
15483 return Op->getOperand(0);
15484 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15485 }
15486
15487 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15488 // more VPNOT which might get folded as else predicates.
15489 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15490 SDValue X =
15491 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
// 65535 == 0xFFFF: all ones in the 16 bits that the cast reads.
15492 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15493 DCI.DAG.getConstant(65535, dl, MVT::i32));
15494 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15495 }
15496
15497 // Only the bottom 16 bits of the source register are used.
15498 if (Op.getValueType() == MVT::i32) {
15499 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15500 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15501 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15502 return SDValue(N, 0);
15503 }
15504 return SDValue();
15505}
15506
15508 const ARMSubtarget *ST) {
// Simplifies a vector register cast; the patterns below assume N is an
// ARMISD::VECTOR_REG_CAST (TODO confirm against the caller). On little-endian
// subtargets the node is always replaced by a plain BITCAST. Otherwise only
// identity casts, casts of undef and nested casts are folded, and an empty
// SDValue is returned when none of those match.
15509 EVT VT = N->getValueType(0);
15510 SDValue Op = N->getOperand(0);
15511 SDLoc dl(N);
15512
15513 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15514 if (ST->isLittle())
15515 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15516
15517 // VT VECTOR_REG_CAST (VT Op) -> Op
15518 if (Op.getValueType() == VT)
15519 return Op;
15520 // VECTOR_REG_CAST undef -> undef
15521 if (Op.isUndef())
15522 return DAG.getUNDEF(VT);
15523
15524 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15525 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15526 // If the valuetypes are the same, we can remove the cast entirely.
15527 if (Op->getOperand(0).getValueType() == VT)
15528 return Op->getOperand(0);
15529 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15530 }
15531
15532 return SDValue();
15533}
15534
15536 const ARMSubtarget *Subtarget) {
// Canonicalizes a vector compare whose operands are N's operands 0 and 1 and
// whose condition code is the constant operand 2. Only runs when the
// subtarget has MVE integer ops. Comparisons against a zero vector become
// ARMISD::VCMPZ, and a VDUP on the left-hand side is moved to the right.
// Returns an empty SDValue when nothing changes.
15537 if (!Subtarget->hasMVEIntegerOps())
15538 return SDValue();
15539
15540 EVT VT = N->getValueType(0);
15541 SDValue Op0 = N->getOperand(0);
15542 SDValue Op1 = N->getOperand(1);
15543 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15544 SDLoc dl(N);
15545
15546 // vcmp X, 0, cc -> vcmpz X, cc
15547 if (isZeroVector(Op1))
15548 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15549
// The remaining folds exchange the operands, so they are only done when the
// swapped condition is itself a valid MVE condition for this type.
15550 unsigned SwappedCond = getSwappedCondition(Cond);
15551 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15552 // vcmp 0, X, cc -> vcmpz X, swapped(cc)
15553 if (isZeroVector(Op0))
15554 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15555 DAG.getConstant(SwappedCond, dl, MVT::i32));
15556 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), swapped(cc)
15557 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15558 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15559 DAG.getConstant(SwappedCond, dl, MVT::i32));
15560 }
15561
15562 return SDValue();
15563}
15564
15565 /// PerformInsertEltCombine - Target-specific dag combine xforms for
15566 /// ISD::INSERT_VECTOR_ELT.
///
/// Only the insertion of a normal, non-volatile load into a vector of i64
/// elements is rewritten: the insert is redone on a vector of f64 elements
/// and the result is bitcast back to the original type. Any other node is
/// left alone and an empty SDValue is returned.
15569 // Bitcast an i64 load inserted into a vector to f64.
15570 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15571 EVT VT = N->getValueType(0);
15572 SDNode *Elt = N->getOperand(1).getNode();
15573 if (VT.getVectorElementType() != MVT::i64 ||
15574 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15575 return SDValue();
15576
15577 SelectionDAG &DAG = DCI.DAG;
15578 SDLoc dl(N);
15579 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
// Reinterpret both the destination vector and the inserted scalar as f64.
15581 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15582 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15583 // Make the DAGCombiner fold the bitcasts.
15584 DCI.AddToWorklist(Vec.getNode());
15585 DCI.AddToWorklist(V.getNode());
// The lane index (operand 2) is reused unchanged.
15586 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15587 Vec, V, N->getOperand(2));
15588 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15589}
15590
15591 // Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15592 // directly or bitcast to an integer if the original is a float vector.
15593 // extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15594 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
// N is the node producing the even lane. The node producing lane n+1 is
// replaced through DCI.CombineTo, and the VMOVRRD is returned as the
// replacement for N. Returns an empty SDValue when the pattern does not
// match.
15595 static SDValue
15597 EVT VT = N->getValueType(0);
15598 SDLoc dl(N);
15599
// Only i32 results are handled, only after DAG legalization, and only when
// f64 is a legal type.
15600 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15601 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15602 return SDValue();
15603
// Look through a bitcast from f32 to find the underlying extract.
15604 SDValue Ext = SDValue(N, 0);
15605 if (Ext.getOpcode() == ISD::BITCAST &&
15606 Ext.getOperand(0).getValueType() == MVT::f32)
15607 Ext = Ext.getOperand(0);
// The extract must read an even lane (lane % 2 == 0).
15608 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15610 Ext.getConstantOperandVal(1) % 2 != 0)
15611 return SDValue();
// Leave the extract alone when its sole user is an int-to-fp conversion.
15612 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15613 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15614 return SDValue();
15615
15616 SDValue Op0 = Ext.getOperand(0);
15617 EVT VecVT = Op0.getValueType();
15618 unsigned ResNo = Op0.getResNo();
15619 unsigned Lane = Ext.getConstantOperandVal(1);
// Only 4-element source vectors are handled.
15620 if (VecVT.getVectorNumElements() != 4)
15621 return SDValue();
15622
15623 // Find another extract, of Lane + 1
// It must read the same result number of the same source node.
15624 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15625 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15626 isa<ConstantSDNode>(V->getOperand(1)) &&
15627 V->getConstantOperandVal(1) == Lane + 1 &&
15628 V->getOperand(0).getResNo() == ResNo;
15629 });
15630 if (OtherIt == Op0->users().end())
15631 return SDValue();
15632
15633 // For float extracts, we need to be converting to an i32 for both vector
15634 // lanes.
15635 SDValue OtherExt(*OtherIt, 0);
15636 if (OtherExt.getValueType() != MVT::i32) {
15637 if (!OtherExt->hasOneUse() ||
15638 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15639 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15640 return SDValue();
// From here on, replace the i32 bitcast rather than the float extract.
15641 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15642 }
15643
15644 // Convert the type to a f64 and extract with a VMOVRRD.
15645 SDValue F64 = DCI.DAG.getNode(
15646 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15647 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15648 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15649 SDValue VMOVRRD =
15650 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15651
// Result 1 of the VMOVRRD replaces the Lane + 1 value; result 0 is returned
// as the replacement for N.
15652 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15653 return VMOVRRD;
15654}
15655
15658 const ARMSubtarget *ST) {
// Tries a sequence of folds on an element extract whose source vector is
// operand 0 and whose lane index is operand 1. Each fold returns as soon as
// it matches; an empty SDValue is returned when none does.
15659 SDValue Op0 = N->getOperand(0);
15660 EVT VT = N->getValueType(0);
15661 SDLoc dl(N);
15662
15663 // extract (vdup x) -> x
15664 if (Op0->getOpcode() == ARMISD::VDUP) {
15665 SDValue X = Op0->getOperand(0);
// When the scalar type differs from the extracted type, use the matching
// move or bitcast instead of returning x directly.
15666 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15667 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15668 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15669 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15670 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15671 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15672
// Otherwise peel bitcasts off x until a value of the extracted type is found.
15673 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15674 X = X->getOperand(0);
15675 if (X.getValueType() == VT)
15676 return X;
15677 }
15678
15679 // extract ARM_BUILD_VECTOR -> x
// Requires a constant lane index that is within the operand count.
15680 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15681 isa<ConstantSDNode>(N->getOperand(1)) &&
15682 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15683 return Op0.getOperand(N->getConstantOperandVal(1));
15684 }
15685
15686 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15687 if (Op0.getValueType() == MVT::v4i32 &&
15688 isa<ConstantSDNode>(N->getOperand(1)) &&
15689 Op0.getOpcode() == ISD::BITCAST &&
15691 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15692 SDValue BV = Op0.getOperand(0);
15693 unsigned Offset = N->getConstantOperandVal(1);
// i32 lanes 0-1 come from the first f64 operand, lanes 2-3 from the second.
15694 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15695 if (MOV.getOpcode() == ARMISD::VMOVDRR)
// Which VMOVDRR operand a lane maps to is flipped when not little-endian.
15696 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15697 }
15698
15699 // extract x, n; extract x, n+1 -> VMOVRRD x
15700 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15701 return R;
15702
15703 // extract (MVETrunc(x)) -> extract x
15704 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15705 unsigned Idx = N->getConstantOperandVal(1);
15706 unsigned Vec =
15708 unsigned SubIdx =
// Vec selects the MVETRUNC operand and SubIdx the lane within it.
15710 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15711 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15712 }
15713
15714 // extract(bitcast(BUILD_VECTOR(extract(bitcast(a)), ..))) -> extract(a)
15715 if (ST->isLittle() && Op0.getOpcode() == ISD::BITCAST &&
15717 isa<ConstantSDNode>(N->getOperand(1)) &&
15720 unsigned Lane = N->getConstantOperandVal(1);
15721 EVT ExtVT = Op0.getValueType();
15722 EVT BVVT = Op0.getOperand(0).getValueType();
// Map the extracted lane onto the build-vector operand that contains it.
15723 unsigned BVLane =
15724 (Lane * BVVT.getVectorNumElements()) / ExtVT.getVectorNumElements();
15725 assert(BVLane < Op0.getOperand(0).getNumOperands());
15726 SDValue Ext = Op0.getOperand(0).getOperand(BVLane);
15727 if (Ext.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15728 Ext.getOperand(0).getOpcode() == ISD::BITCAST &&
15730 Ext.getOperand(0).getOperand(0).getValueType() == ExtVT) {
15731 unsigned InnerLane = Ext.getConstantOperandVal(1);
// BVSubLane is the position of Lane inside that build-vector operand.
15732 unsigned BVSubLane = Lane - (BVLane * ExtVT.getVectorNumElements()) /
15733 BVVT.getVectorNumElements();
// FinalLane is the corresponding lane of the innermost vector a.
15734 unsigned FinalLane = (InnerLane * ExtVT.getVectorNumElements()) /
15735 BVVT.getVectorNumElements() +
15736 BVSubLane;
15737 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT,
15738 Ext.getOperand(0).getOperand(0),
15739 DCI.DAG.getConstant(FinalLane, dl, MVT::i32));
15740 }
15741 }
15742
15743 return SDValue();
15744}
15745
// Folds a sign-extend-in-register of an unsigned lane read into a signed
// lane read. The extension type is taken from N's operand 1. Returns an
// empty SDValue when the pattern does not match.
15747 SDValue Op = N->getOperand(0);
15748 EVT VT = N->getValueType(0);
15749
15750 // sext_inreg(VGETLANEu) -> VGETLANEs
// Only valid when the extension type equals the scalar type of the vector
// being read.
15751 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15752 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15753 Op.getOperand(0).getValueType().getScalarType())
15754 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15755 Op.getOperand(1));
15756
15757 return SDValue();
15758}
15759
/// Rewrites the insertion of a half-width subvector at the low or high half
/// of a legal fixed-length vector as a CONCAT_VECTORS of the subvector and
/// an EXTRACT_SUBVECTOR of the other half. Operand 0 is the destination
/// vector, operand 1 the subvector and operand 2 the constant insert index.
/// Returns an empty SDValue when the pattern does not apply.
15760 static SDValue
15762 SDValue Vec = N->getOperand(0);
15763 SDValue SubVec = N->getOperand(1);
15764 uint64_t IdxVal = N->getConstantOperandVal(2);
15765 EVT VecVT = Vec.getValueType();
15766 EVT SubVT = SubVec.getValueType();
15767
15768 // Only do this for legal fixed vector types.
15769 if (!VecVT.isFixedLengthVector() ||
15770 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15772 return SDValue();
15773
15774 // Ignore widening patterns.
15775 if (IdxVal == 0 && Vec.isUndef())
15776 return SDValue();
15777
15778 // Subvector must be half the width and an "aligned" insertion.
// "Aligned" means the index is 0 (low half) or NumSubElts (high half).
15779 unsigned NumSubElts = SubVT.getVectorNumElements();
15780 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15781 (IdxVal != 0 && IdxVal != NumSubElts))
15782 return SDValue();
15783
15784 // Fold insert_subvector -> concat_vectors
15785 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15786 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15787 SDLoc DL(N);
15788 SDValue Lo, Hi;
15789 if (IdxVal == 0) {
15790 Lo = SubVec;
15791 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15792 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15793 } else {
15794 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15795 DCI.DAG.getVectorIdxConstant(0, DL));
15796 Hi = SubVec;
15797 }
15798 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15799}
15800
15801 // shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
// Applies only when the shuffle's first operand is an ARMISD::MVETRUNC and
// its second operand is undef. Returns an empty SDValue when the mask is
// not a VMOVN truncation mask in either operand order.
15803 SelectionDAG &DAG) {
15804 SDValue Trunc = N->getOperand(0);
15805 EVT VT = Trunc.getValueType();
15806 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15807 return SDValue();
15808
15809 SDLoc DL(Trunc);
// Both MVETRUNC inputs are register-cast to the result type before the VMOVN.
15810 if (isVMOVNTruncMask(N->getMask(), VT, false))
15811 return DAG.getNode(
15812 ARMISD::VMOVN, DL, VT,
15813 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15814 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15815 DAG.getConstant(1, DL, MVT::i32));
// Same fold with the two MVETRUNC inputs exchanged.
15816 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15817 return DAG.getNode(
15818 ARMISD::VMOVN, DL, VT,
15819 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15820 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15821 DAG.getConstant(1, DL, MVT::i32));
15822 return SDValue();
15823}
15824
15825 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15826 /// ISD::VECTOR_SHUFFLE.
15829 return R;
15830
15831 // The LLVM shufflevector instruction does not require the shuffle mask
15832 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15833 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15834 // operands do not match the mask length, they are extended by concatenating
15835 // them with undef vectors. That is probably the right thing for other
15836 // targets, but for NEON it is better to concatenate two double-register
15837 // size vector operands into a single quad-register size vector. Do that
15838 // transformation here:
15839 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15840 // shuffle(concat(v1, v2), undef)
15841 SDValue Op0 = N->getOperand(0);
15842 SDValue Op1 = N->getOperand(1);
// Both shuffle inputs must be two-operand concats ...
15843 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15844 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15845 Op0.getNumOperands() != 2 ||
15846 Op1.getNumOperands() != 2)
15847 return SDValue();
// ... whose second (upper) halves are undef.
15848 SDValue Concat0Op1 = Op0.getOperand(1);
15849 SDValue Concat1Op1 = Op1.getOperand(1);
15850 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15851 return SDValue();
15852 // Skip the transformation if any of the types are illegal.
15853 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15854 EVT VT = N->getValueType(0);
15855 if (!TLI.isTypeLegal(VT) ||
15856 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15857 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15858 return SDValue();
15859
15860 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15861 Op0.getOperand(0), Op1.getOperand(0));
15862 // Translate the shuffle mask.
// Indices into v1 (below HalfElts) are kept as they are. Indices into v2
// (NumElts .. NumElts + HalfElts - 1) are moved to the upper half of the new
// concat. Every other index referred to an undef half and becomes -1.
15863 SmallVector<int, 16> NewMask;
15864 unsigned NumElts = VT.getVectorNumElements();
15865 unsigned HalfElts = NumElts/2;
15867 for (unsigned n = 0; n < NumElts; ++n) {
15868 int MaskElt = SVN->getMaskElt(n);
15869 int NewElt = -1;
15870 if (MaskElt < (int)HalfElts)
15871 NewElt = MaskElt;
15872 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15873 NewElt = HalfElts + MaskElt - NumElts;
15874 NewMask.push_back(NewElt);
15875 }
15876 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15877 DAG.getUNDEF(VT), NewMask);
15878}
15879
15880 /// Load/store instruction that can be merged with a base address
15881 /// update
/// Index of the address operand within the load/store node's operand list
15886 unsigned AddrOpIdx;
15887};
15888
15890 /// Instruction that updates a pointer
15892 /// Pointer increment operand
15894 /// Pointer increment value if it is a constant, or 0 otherwise. It is
/// compared against the byte size of the memory access when deciding
/// whether the update can be folded.
15895 unsigned ConstInc;
15896};
15897
// Returns true when neither N nor User is found to be a predecessor of the
// other, i.e. folding them into one node would not create a cycle.
15899 // Check that the add is independent of the load/store.
15900 // Otherwise, folding it would create a cycle. Search through Addr
15901 // as well, since the User may not be a direct user of Addr and
15902 // only share a base pointer.
15905 Worklist.push_back(N);
15906 Worklist.push_back(User);
// MaxSteps is passed to both predecessor searches below.
15907 const unsigned MaxSteps = 1024;
15908 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15909 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15910 return false;
15911 return true;
15912}
15913
15915 struct BaseUpdateUser &User,
15916 bool SimpleConstIncOnly,
// Attempts to replace the load/store/intrinsic Target.N and the pointer
// update User.N with a single updating ARMISD::*_UPD node. Returns true when
// both nodes were replaced through DCI.CombineTo, and false when the pair
// was rejected. With SimpleConstIncOnly set, only an increment equal to the
// number of bytes accessed is accepted.
15918 SelectionDAG &DAG = DCI.DAG;
15919 SDNode *N = Target.N;
15920 MemSDNode *MemN = cast<MemSDNode>(N);
15921 SDLoc dl(N);
15922
15923 // Find the new opcode for the updating load/store.
15924 bool isLoadOp = true;
15925 bool isLaneOp = false;
15926 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15927 // as an operand.
15928 bool hasAlignment = true;
15929 unsigned NewOpc = 0;
15930 unsigned NumVecs = 0;
15931 if (Target.isIntrinsic) {
15932 unsigned IntNo = N->getConstantOperandVal(1);
15933 switch (IntNo) {
15934 default:
15935 llvm_unreachable("unexpected intrinsic for Neon base update");
15936 case Intrinsic::arm_neon_vld1:
15937 NewOpc = ARMISD::VLD1_UPD;
15938 NumVecs = 1;
15939 break;
15940 case Intrinsic::arm_neon_vld2:
15941 NewOpc = ARMISD::VLD2_UPD;
15942 NumVecs = 2;
15943 break;
15944 case Intrinsic::arm_neon_vld3:
15945 NewOpc = ARMISD::VLD3_UPD;
15946 NumVecs = 3;
15947 break;
15948 case Intrinsic::arm_neon_vld4:
15949 NewOpc = ARMISD::VLD4_UPD;
15950 NumVecs = 4;
15951 break;
15952 case Intrinsic::arm_neon_vld1x2:
15953 NewOpc = ARMISD::VLD1x2_UPD;
15954 NumVecs = 2;
15955 hasAlignment = false;
15956 break;
15957 case Intrinsic::arm_neon_vld1x3:
15958 NewOpc = ARMISD::VLD1x3_UPD;
15959 NumVecs = 3;
15960 hasAlignment = false;
15961 break;
15962 case Intrinsic::arm_neon_vld1x4:
15963 NewOpc = ARMISD::VLD1x4_UPD;
15964 NumVecs = 4;
15965 hasAlignment = false;
15966 break;
15967 case Intrinsic::arm_neon_vld2dup:
15968 NewOpc = ARMISD::VLD2DUP_UPD;
15969 NumVecs = 2;
15970 break;
15971 case Intrinsic::arm_neon_vld3dup:
15972 NewOpc = ARMISD::VLD3DUP_UPD;
15973 NumVecs = 3;
15974 break;
15975 case Intrinsic::arm_neon_vld4dup:
15976 NewOpc = ARMISD::VLD4DUP_UPD;
15977 NumVecs = 4;
15978 break;
15979 case Intrinsic::arm_neon_vld2lane:
15980 NewOpc = ARMISD::VLD2LN_UPD;
15981 NumVecs = 2;
15982 isLaneOp = true;
15983 break;
15984 case Intrinsic::arm_neon_vld3lane:
15985 NewOpc = ARMISD::VLD3LN_UPD;
15986 NumVecs = 3;
15987 isLaneOp = true;
15988 break;
15989 case Intrinsic::arm_neon_vld4lane:
15990 NewOpc = ARMISD::VLD4LN_UPD;
15991 NumVecs = 4;
15992 isLaneOp = true;
15993 break;
15994 case Intrinsic::arm_neon_vst1:
15995 NewOpc = ARMISD::VST1_UPD;
15996 NumVecs = 1;
15997 isLoadOp = false;
15998 break;
15999 case Intrinsic::arm_neon_vst2:
16000 NewOpc = ARMISD::VST2_UPD;
16001 NumVecs = 2;
16002 isLoadOp = false;
16003 break;
16004 case Intrinsic::arm_neon_vst3:
16005 NewOpc = ARMISD::VST3_UPD;
16006 NumVecs = 3;
16007 isLoadOp = false;
16008 break;
16009 case Intrinsic::arm_neon_vst4:
16010 NewOpc = ARMISD::VST4_UPD;
16011 NumVecs = 4;
16012 isLoadOp = false;
16013 break;
16014 case Intrinsic::arm_neon_vst2lane:
16015 NewOpc = ARMISD::VST2LN_UPD;
16016 NumVecs = 2;
16017 isLoadOp = false;
16018 isLaneOp = true;
16019 break;
16020 case Intrinsic::arm_neon_vst3lane:
16021 NewOpc = ARMISD::VST3LN_UPD;
16022 NumVecs = 3;
16023 isLoadOp = false;
16024 isLaneOp = true;
16025 break;
16026 case Intrinsic::arm_neon_vst4lane:
16027 NewOpc = ARMISD::VST4LN_UPD;
16028 NumVecs = 4;
16029 isLoadOp = false;
16030 isLaneOp = true;
16031 break;
16032 case Intrinsic::arm_neon_vst1x2:
16033 NewOpc = ARMISD::VST1x2_UPD;
16034 NumVecs = 2;
16035 isLoadOp = false;
16036 hasAlignment = false;
16037 break;
16038 case Intrinsic::arm_neon_vst1x3:
16039 NewOpc = ARMISD::VST1x3_UPD;
16040 NumVecs = 3;
16041 isLoadOp = false;
16042 hasAlignment = false;
16043 break;
16044 case Intrinsic::arm_neon_vst1x4:
16045 NewOpc = ARMISD::VST1x4_UPD;
16046 NumVecs = 4;
16047 isLoadOp = false;
16048 hasAlignment = false;
16049 break;
16050 }
16051 } else {
// Non-intrinsic nodes: isLaneOp defaults to true here and is cleared again
// for generic ISD::LOAD / ISD::STORE.
16052 isLaneOp = true;
16053 switch (N->getOpcode()) {
16054 default:
16055 llvm_unreachable("unexpected opcode for Neon base update");
16056 case ARMISD::VLD1DUP:
16057 NewOpc = ARMISD::VLD1DUP_UPD;
16058 NumVecs = 1;
16059 break;
16060 case ARMISD::VLD2DUP:
16061 NewOpc = ARMISD::VLD2DUP_UPD;
16062 NumVecs = 2;
16063 break;
16064 case ARMISD::VLD3DUP:
16065 NewOpc = ARMISD::VLD3DUP_UPD;
16066 NumVecs = 3;
16067 break;
16068 case ARMISD::VLD4DUP:
16069 NewOpc = ARMISD::VLD4DUP_UPD;
16070 NumVecs = 4;
16071 break;
16072 case ISD::LOAD:
16073 NewOpc = ARMISD::VLD1_UPD;
16074 NumVecs = 1;
16075 isLaneOp = false;
16076 break;
16077 case ISD::STORE:
16078 NewOpc = ARMISD::VST1_UPD;
16079 NumVecs = 1;
16080 isLaneOp = false;
16081 isLoadOp = false;
16082 break;
16083 }
16084 }
16085
16086 // Find the size of memory referenced by the load/store.
16087 EVT VecTy;
16088 if (isLoadOp) {
16089 VecTy = N->getValueType(0);
16090 } else if (Target.isIntrinsic) {
// For store intrinsics the first stored vector follows the address operand.
16091 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
16092 } else {
16093 assert(Target.isStore &&
16094 "Node has to be a load, a store, or an intrinsic!");
16095 VecTy = N->getOperand(1).getValueType();
16096 }
16097
16098 bool isVLDDUPOp =
16099 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
16100 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
16101
16102 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
// Lane and dup operations access a single element per vector.
16103 if (isLaneOp || isVLDDUPOp)
16104 NumBytes /= VecTy.getVectorNumElements();
16105
16106 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
16107 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
16108 // separate instructions that make it harder to use a non-constant update.
16109 return false;
16110 }
16111
16112 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
16113 return false;
16114
16115 if (!isValidBaseUpdate(N, User.N))
16116 return false;
16117
16118 // OK, we found an ADD we can fold into the base update.
16119 // Now, create a _UPD node, taking care of not breaking alignment.
16120
16121 EVT AlignedVecTy = VecTy;
16122 Align Alignment = MemN->getAlign();
16123
16124 // If this is a less-than-standard-aligned load/store, change the type to
16125 // match the standard alignment.
16126 // The alignment is overlooked when selecting _UPD variants; and it's
16127 // easier to introduce bitcasts here than fix that.
16128 // There are 3 ways to get to this base-update combine:
16129 // - intrinsics: they are assumed to be properly aligned (to the standard
16130 // alignment of the memory type), so we don't need to do anything.
16131 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16132 // intrinsics, so, likewise, there's nothing to do.
16133 // - generic load/store instructions: the alignment is specified as an
16134 // explicit operand, rather than implicitly as the standard alignment
16135 // of the memory type (like the intrinsics). We need to change the
16136 // memory type to match the explicit alignment. That way, we don't
16137 // generate non-standard-aligned ARMISD::VLDx nodes.
16138 if (isa<LSBaseSDNode>(N)) {
16139 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
// Use an integer element whose size equals the actual alignment, keeping
// the total byte size unchanged.
16140 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16141 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16142 assert(!isLaneOp && "Unexpected generic load/store lane.");
16143 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16144 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16145 }
16146 // Don't set an explicit alignment on regular load/stores that we want
16147 // to transform to VLD/VST 1_UPD nodes.
16148 // This matches the behavior of regular load/stores, which only get an
16149 // explicit alignment if the MMO alignment is larger than the standard
16150 // alignment of the memory type.
16151 // Intrinsics, however, always get an explicit alignment, set to the
16152 // alignment of the MMO.
16153 Alignment = Align(1);
16154 }
16155
16156 // Create the new updating load/store node.
16157 // First, create an SDVTList for the new updating node's results.
// Result layout: the loaded vectors (none for stores), then the i32 updated
// address, then the chain. NumVecs is at most 4, so 6 entries suffice.
16158 EVT Tys[6];
16159 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16160 unsigned n;
16161 for (n = 0; n < NumResultVecs; ++n)
16162 Tys[n] = AlignedVecTy;
16163 Tys[n++] = MVT::i32;
16164 Tys[n] = MVT::Other;
16165 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16166
16167 // Then, gather the new node's operands.
16169 Ops.push_back(N->getOperand(0)); // incoming chain
16170 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16171 Ops.push_back(User.Inc);
16172
16173 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16174 // Try to match the intrinsic's signature
16175 Ops.push_back(StN->getValue());
16176 } else {
16177 // Loads (and of course intrinsics) match the intrinsics' signature,
16178 // so just add all but the alignment operand.
16179 unsigned LastOperand =
16180 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16181 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16182 Ops.push_back(N->getOperand(i));
16183 }
16184
16185 // For all node types, the alignment operand is always the last one.
16186 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16187
16188 // If this is a non-standard-aligned STORE, the penultimate operand is the
16189 // stored value. Bitcast it to the aligned type.
16190 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16191 SDValue &StVal = Ops[Ops.size() - 2];
16192 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16193 }
16194
// Lane operations use a single element as the memory type.
16195 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16196 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16197 MemN->getMemOperand());
16198
16199 // Update the uses.
16200 SmallVector<SDValue, 5> NewResults;
16201 for (unsigned i = 0; i < NumResultVecs; ++i)
16202 NewResults.push_back(SDValue(UpdN.getNode(), i));
16203
16204 // If this is a non-standard-aligned LOAD, the first result is the loaded
16205 // value. Bitcast it to the expected result type.
16206 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16207 SDValue &LdVal = NewResults[0];
16208 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16209 }
16210
16211 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16212 DCI.CombineTo(N, NewResults);
// The pointer update is replaced by the new node's i32 address result.
16213 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16214
16215 return true;
16216}
16217
16218 // If (opcode ptr inc) is an ADD-like instruction, return the
16219 // increment value. Otherwise return 0.
// ISD::ADD and ARMISD::VLD1_UPD always count as ADD-like. ISD::OR counts only
// when Ptr and Inc have no common bits set. 0 is also returned whenever the
// CInc pointer tested below is null.
16220 static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16221 SDValue Inc, const SelectionDAG &DAG) {
16223 if (!CInc)
16224 return 0;
16225
16226 switch (Opcode) {
16227 case ARMISD::VLD1_UPD:
16228 case ISD::ADD:
16229 return CInc->getZExtValue();
16230 case ISD::OR: {
16231 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16232 // (OR ptr inc) is the same as (ADD ptr inc)
16233 return CInc->getZExtValue();
16234 }
16235 return 0;
16236 }
16237 default:
16238 return 0;
16239 }
16240}
16241
// Splits N into a base pointer (*Ptr) and a constant increment (*CInc) when
// N is an ADD/OR whose operand 1 is a constant, or an ARMISD::VLD1_UPD whose
// operand 2 is a constant. Returns true and writes both outputs on a match.
// Returns false and leaves the outputs untouched otherwise.
16243 switch (N->getOpcode()) {
16244 case ISD::ADD:
16245 case ISD::OR: {
16246 if (isa<ConstantSDNode>(N->getOperand(1))) {
16247 *Ptr = N->getOperand(0);
16248 *CInc = N->getOperand(1);
16249 return true;
16250 }
16251 return false;
16252 }
16253 case ARMISD::VLD1_UPD: {
// Here operand 1 is taken as the pointer and operand 2 as the increment.
16254 if (isa<ConstantSDNode>(N->getOperand(2))) {
16255 *Ptr = N->getOperand(1);
16256 *CInc = N->getOperand(2);
16257 return true;
16258 }
16259 return false;
16260 }
16261 default:
16262 return false;
16263 }
16264}
16265
16266 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16267 /// NEON load/store intrinsics, and generic vector load/stores, to merge
16268 /// base address updates.
16269 /// For generic load/stores, the memory type is assumed to be a vector.
16270 /// The caller is assumed to have checked legality.
///
/// Every path returns an empty SDValue; a successful fold is reported only
/// through the boolean result of TryCombineBaseUpdate.
// The address is operand 2 for intrinsics and stores, operand 1 otherwise.
16273 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16274 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16275 const bool isStore = N->getOpcode() == ISD::STORE;
16276 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16277 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16278
16279 // Limit the number of possible base-updates we look at to prevent degenerate
16280 // cases.
16281 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16282
16283 SDValue Addr = N->getOperand(AddrOpIdx);
16284
16286
16287 // Search for a use of the address operand that is an increment.
16288 for (SDUse &Use : Addr->uses()) {
16289 SDNode *User = Use.getUser();
// Only two-operand users of this exact result of Addr are considered.
16290 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16291 continue;
16292
// Inc is whichever operand of the user is not the address.
16293 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16294 unsigned ConstInc =
16295 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16296
// Keep constant increments, and any ISD::ADD even when ConstInc is 0.
16297 if (ConstInc || User->getOpcode() == ISD::ADD) {
16298 BaseUpdates.push_back({User, Inc, ConstInc});
16299 if (BaseUpdates.size() >= MaxBaseUpdates)
16300 break;
16301 }
16302 }
16303
16304 // If the address is a constant pointer increment itself, find
16305 // another constant increment that has the same base operand
16306 SDValue Base;
16307 SDValue CInc;
16308 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16309 unsigned Offset =
16310 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16311 if (Offset) {
16312 for (SDUse &Use : Base->uses()) {
16313
16314 SDNode *User = Use.getUser();
16315 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16316 User->getNumOperands() != 2)
16317 continue;
16318
16319 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16320 unsigned UserOffset =
16321 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16322
// Only users that lie strictly beyond Addr's own offset are candidates.
16323 if (!UserOffset || UserOffset <= Offset)
16324 continue;
16325
// Record the increment relative to Addr rather than to Base.
16326 unsigned NewConstInc = UserOffset - Offset;
16327 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16328 BaseUpdates.push_back({User, NewInc, NewConstInc});
16329 if (BaseUpdates.size() >= MaxBaseUpdates)
16330 break;
16331 }
16332 }
16333 }
16334
16335 // Try to fold the load/store with an update that matches memory
16336 // access size. This should work well for sequential loads.
16337 unsigned NumValidUpd = BaseUpdates.size();
16338 for (unsigned I = 0; I < NumValidUpd; I++) {
16339 BaseUpdateUser &User = BaseUpdates[I];
16340 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16341 return SDValue();
16342 }
16343
16344 // Try to fold with other users. Non-constant updates are considered
16345 // first, and constant updates are sorted to not break a sequence of
16346 // strided accesses (if there is any).
// Entries with ConstInc == 0 sort first under the ascending comparison below.
16347 llvm::stable_sort(BaseUpdates,
16348 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16349 return LHS.ConstInc < RHS.ConstInc;
16350 });
16351 for (BaseUpdateUser &User : BaseUpdates) {
16352 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16353 return SDValue();
16354 }
16355 return SDValue();
16356}
16357
// Bail out with an empty SDValue when running before legalization or when
// invoked by the legalizer.
16360 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16361 return SDValue();
16362
// Otherwise defer entirely to the shared CombineBaseUpdate routine.
16363 return CombineBaseUpdate(N, DCI);
16364}
16365
// Try to fold a constant pointer increment (an ISD::ADD user of the address)
// into an MVE vld2q/vld4q/vst2q/vst4q intrinsic, producing a single updating
// ARMISD::VLDn_UPD / VSTn_UPD node. Replacements are made through
// DCI.CombineTo; every path of this function returns an empty SDValue.
16368 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16369 return SDValue();
16370
16371 SelectionDAG &DAG = DCI.DAG;
// Operand 2 is the address; operand 1 (read below) is the intrinsic ID.
16372 SDValue Addr = N->getOperand(2);
16373 MemSDNode *MemN = cast<MemSDNode>(N);
16374 SDLoc dl(N);
16375
16376 // For the stores, where there are multiple intrinsics we only actually want
16377 // to post-inc the last of them.
16378 unsigned IntNo = N->getConstantOperandVal(1);
16379 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16380 return SDValue();
16381 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16382 return SDValue();
16383
16384 // Search for a use of the address operand that is an increment.
16385 for (SDUse &Use : Addr->uses()) {
16386 SDNode *User = Use.getUser();
16387 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16388 continue;
16389
16390 // Check that the add is independent of the load/store. Otherwise, folding
16391 // it would create a cycle. We can avoid searching through Addr as it's a
16392 // predecessor to both.
16395 Visited.insert(Addr.getNode());
16396 Worklist.push_back(N);
16397 Worklist.push_back(User);
// Bound the predecessor search so pathological DAGs stay cheap.
16398 const unsigned MaxSteps = 1024;
16399 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16400 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16401 continue;
16402
16403 // Find the new opcode for the updating load/store.
16404 bool isLoadOp = true;
16405 unsigned NewOpc = 0;
16406 unsigned NumVecs = 0;
16407 switch (IntNo) {
16408 default:
16409 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16410 case Intrinsic::arm_mve_vld2q:
16411 NewOpc = ARMISD::VLD2_UPD;
16412 NumVecs = 2;
16413 break;
16414 case Intrinsic::arm_mve_vld4q:
16415 NewOpc = ARMISD::VLD4_UPD;
16416 NumVecs = 4;
16417 break;
16418 case Intrinsic::arm_mve_vst2q:
16419 NewOpc = ARMISD::VST2_UPD;
16420 NumVecs = 2;
16421 isLoadOp = false;
16422 break;
16423 case Intrinsic::arm_mve_vst4q:
16424 NewOpc = ARMISD::VST4_UPD;
16425 NumVecs = 4;
16426 isLoadOp = false;
16427 break;
16428 }
16429
16430 // Find the size of memory referenced by the load/store.
// Loads take the vector type from result 0, stores from operand 3.
16431 EVT VecTy;
16432 if (isLoadOp) {
16433 VecTy = N->getValueType(0);
16434 } else {
16435 VecTy = N->getOperand(3).getValueType();
16436 }
16437
// Total bytes covered by NumVecs vectors of type VecTy.
16438 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16439
16440 // If the increment is a constant, it must match the memory ref size.
// Inc is whichever ADD operand is not the address itself.
16441 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16443 if (!CInc || CInc->getZExtValue() != NumBytes)
16444 continue;
16445
16446 // Create the new updating load/store node.
16447 // First, create an SDVTList for the new updating node's results.
// Layout: NumResultVecs vector results (loads only), then an i32 result that
// replaces the ADD below, then the chain.
16448 EVT Tys[6];
16449 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16450 unsigned n;
16451 for (n = 0; n < NumResultVecs; ++n)
16452 Tys[n] = VecTy;
16453 Tys[n++] = MVT::i32;
16454 Tys[n] = MVT::Other;
16455 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16456
16457 // Then, gather the new node's operands.
16459 Ops.push_back(N->getOperand(0)); // incoming chain
16460 Ops.push_back(N->getOperand(2)); // ptr
16461 Ops.push_back(Inc);
16462
// Forward every remaining operand of the intrinsic (index 3 onwards).
16463 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16464 Ops.push_back(N->getOperand(i));
16465
16466 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16467 MemN->getMemOperand());
16468
16469 // Update the uses.
16470 SmallVector<SDValue, 5> NewResults;
16471 for (unsigned i = 0; i < NumResultVecs; ++i)
16472 NewResults.push_back(SDValue(UpdN.getNode(), i));
16473
16474 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16475 DCI.CombineTo(N, NewResults);
// The ADD is replaced by the updating node's i32 result.
16476 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16477
// N has been replaced, so stop after the first increment that was folded.
16478 break;
16479 }
16480
16481 return SDValue();
16482}
16483
16484/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16485/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16486/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16487/// return true.
/// Returns false, before creating any node, when N's type is not a 64-bit
/// vector, when the source is not a vld2lane/vld3lane/vld4lane intrinsic, or
/// when some non-chain use is not a VDUPLANE of the loaded lane.
16489 SelectionDAG &DAG = DCI.DAG;
16490 EVT VT = N->getValueType(0);
16491 // vldN-dup instructions only support 64-bit vectors for N > 1.
16492 if (!VT.is64BitVector())
16493 return false;
16494
16495 // Check if the VDUPLANE operand is a vldN-lane intrinsic.
16496 SDNode *VLD = N->getOperand(0).getNode();
16497 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16498 return false;
16499 unsigned NumVecs = 0;
16500 unsigned NewOpc = 0;
16501 unsigned IntNo = VLD->getConstantOperandVal(1);
16502 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16503 NumVecs = 2;
16504 NewOpc = ARMISD::VLD2DUP;
16505 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16506 NumVecs = 3;
16507 NewOpc = ARMISD::VLD3DUP;
16508 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16509 NumVecs = 4;
16510 NewOpc = ARMISD::VLD4DUP;
16511 } else {
16512 return false;
16513 }
16514
16515 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16516 // numbers match the load.
// The lane number is read from operand NumVecs + 3 of the intrinsic.
16517 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16518 for (SDUse &Use : VLD->uses()) {
16519 // Ignore uses of the chain result.
16520 if (Use.getResNo() == NumVecs)
16521 continue;
16522 SDNode *User = Use.getUser();
16523 if (User->getOpcode() != ARMISD::VDUPLANE ||
16524 VLDLaneNo != User->getConstantOperandVal(1))
16525 return false;
16526 }
16527
16528 // Create the vldN-dup node.
// Result list: NumVecs vectors of type VT followed by the chain.
16529 EVT Tys[5];
16530 unsigned n;
16531 for (n = 0; n < NumVecs; ++n)
16532 Tys[n] = VT;
16533 Tys[n] = MVT::Other;
16534 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
// Only the chain (operand 0) and operand 2 of the intrinsic are carried over.
16535 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16537 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16538 Ops, VLDMemInt->getMemoryVT(),
16539 VLDMemInt->getMemOperand());
16540
16541 // Update the uses.
// Each VDUPLANE user is replaced by the same-numbered result of the new node.
16542 for (SDUse &Use : VLD->uses()) {
16543 unsigned ResNo = Use.getResNo();
16544 // Ignore uses of the chain result.
16545 if (ResNo == NumVecs)
16546 continue;
16547 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16548 }
16549
16550 // Now the vldN-lane intrinsic is dead except for its chain result.
16551 // Update uses of the chain.
16552 std::vector<SDValue> VLDDupResults;
16553 for (unsigned n = 0; n < NumVecs; ++n)
16554 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16555 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16556 DCI.CombineTo(VLD, VLDDupResults);
16557
16558 return true;
16559}
16560
16561/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16562/// ARMISD::VDUPLANE.
/// Three rewrites are tried in order: with MVE integer ops the node becomes
/// VDUP(EXTRACT_VECTOR_ELT); otherwise CombineVLDDUP is attempted; finally a
/// VDUPLANE of a (possibly bitcast) VMOVIMM/VMVNIMM is replaced by a bitcast
/// of that immediate. Returns an empty SDValue when none applies.
16565 const ARMSubtarget *Subtarget) {
16566 SDValue Op = N->getOperand(0);
16567 EVT VT = N->getValueType(0);
16568
16569 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16570 if (Subtarget->hasMVEIntegerOps()) {
16571 EVT ExtractVT = VT.getVectorElementType();
16572 // We need to ensure we are creating a legal type.
16573 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16574 ExtractVT = MVT::i32;
16575 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16576 N->getOperand(0), N->getOperand(1));
16577 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16578 }
16579
16580 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16581 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
// CombineVLDDUP has already performed the replacement, so hand N back.
16582 if (CombineVLDDUP(N, DCI))
16583 return SDValue(N, 0);
16584
16585 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16586 // redundant. Ignore bit_converts for now; element sizes are checked below.
16587 while (Op.getOpcode() == ISD::BITCAST)
16588 Op = Op.getOperand(0);
16589 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16590 return SDValue();
16591
16592 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16593 unsigned EltSize = Op.getScalarValueSizeInBits();
16594 // The canonical VMOV for a zero vector uses a 32-bit element size.
// A decoded value of zero is therefore treated as having 8-bit elements so
// that it passes the size check below for every VT.
16595 unsigned Imm = Op.getConstantOperandVal(0);
16596 unsigned EltBits;
16597 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16598 EltSize = 8;
16599 if (EltSize > VT.getScalarSizeInBits())
16600 return SDValue();
16601
16602 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16603}
16604
16605/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
/// With MVE integer ops, an f32 or f16 scalar source is first converted to an
/// i32 value (BITCAST or ARMISD::VMOVrh). With NEON, a VDUP of a single-use,
/// unindexed load whose memory type equals the vector element type is turned
/// into a memory-intrinsic node that also takes over the load's chain.
/// Returns an empty SDValue if nothing applies.
16607 const ARMSubtarget *Subtarget) {
16608 SDValue Op = N->getOperand(0);
16609 SDLoc dl(N);
16610
16611 if (Subtarget->hasMVEIntegerOps()) {
16612 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16613 // need to come from a GPR.
16614 if (Op.getValueType() == MVT::f32)
16615 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16616 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16617 else if (Op.getValueType() == MVT::f16)
16618 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16619 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16620 }
16621
// The remaining transform is only attempted when NEON is available.
16622 if (!Subtarget->hasNEON())
16623 return SDValue();
16624
16625 // Match VDUP(LOAD) -> VLD1DUP.
16626 // We match this pattern here rather than waiting for isel because the
16627 // transform is only legal for unindexed loads.
16628 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16629 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16630 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
// Operands: the load's chain, its address, and its alignment as an i32
// constant.
16631 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16632 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16633 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16634 SDValue VLDDup =
16636 LD->getMemoryVT(), LD->getMemOperand());
// Redirect users of the old load's chain to the new node's chain result.
16637 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16638 return VLDDup;
16639 }
16640
16641 return SDValue();
16642}
16643
16646 const ARMSubtarget *Subtarget) {
// Load combine entry point: hands qualifying vector loads to
// CombineBaseUpdate and otherwise returns an empty SDValue. The visible
// conditions require NEON, a normal load and a vector result type.
16647 EVT VT = N->getValueType(0);
16648
16649 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16650 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16652 return CombineBaseUpdate(N, DCI);
16653
16654 return SDValue();
16655}
16656
16657// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16658// pack all of the elements in one place. Next, store to memory in fewer
16659// chunks.
// Returns a TokenFactor joining the replacement stores, or an empty SDValue
// when St is not a truncating vector store or the required types are not
// suitable (sizes not powers of two, illegal shuffle or store type).
16661 SelectionDAG &DAG) {
16662 SDValue StVal = St->getValue();
16663 EVT VT = StVal.getValueType();
16664 if (!St->isTruncatingStore() || !VT.isVector())
16665 return SDValue();
16666 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16667 EVT StVT = St->getMemoryVT();
16668 unsigned NumElems = VT.getVectorNumElements();
16669 assert(StVT != VT && "Cannot truncate to the same type");
16670 unsigned FromEltSz = VT.getScalarSizeInBits();
16671 unsigned ToEltSz = StVT.getScalarSizeInBits();
16672
16673 // From, To sizes and ElemCount must be pow of two
// A product is a power of two only if every factor is, so one test covers
// all three values.
16674 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16675 return SDValue();
16676
16677 // We are going to use the original vector elt for storing.
16678 // Accumulated smaller vector elements must be a multiple of the store size.
16679 if (0 != (NumElems * FromEltSz) % ToEltSz)
16680 return SDValue();
16681
16682 unsigned SizeRatio = FromEltSz / ToEltSz;
16683 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16684
16685 // Create a type on which we perform the shuffle.
16686 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16687 NumElems * SizeRatio);
16688 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16689
16690 SDLoc DL(St);
16691 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
// Build a mask that moves one narrow lane per source element to the front
// (the last sub-lane on big-endian, the first otherwise); the remaining mask
// entries stay -1.
16692 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16693 for (unsigned i = 0; i < NumElems; ++i)
16694 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16695 : i * SizeRatio;
16696
16697 // Can't shuffle using an illegal type.
16698 if (!TLI.isTypeLegal(WideVecVT))
16699 return SDValue();
16700
16701 SDValue Shuff = DAG.getVectorShuffle(
16702 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16703 // At this point all of the data is stored at the bottom of the
16704 // register. We now need to save it to mem.
16705
16706 // Find the largest store unit
// The loop keeps the last legal integer type that is no wider than the
// packed data (NumElems * ToEltSz bits).
16707 MVT StoreType = MVT::i8;
16708 for (MVT Tp : MVT::integer_valuetypes()) {
16709 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16710 StoreType = Tp;
16711 }
16712 // Didn't find a legal store type.
16713 if (!TLI.isTypeLegal(StoreType))
16714 return SDValue();
16715
16716 // Bitcast the original vector into a vector of store-size units
16717 EVT StoreVecVT =
16718 EVT::getVectorVT(*DAG.getContext(), StoreType,
16719 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16720 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16721 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
// Byte distance between consecutive stores.
16723 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16724 TLI.getPointerTy(DAG.getDataLayout()));
16725 SDValue BasePtr = St->getBasePtr();
16726
16727 // Perform one or more big stores into memory.
// NOTE(review): every store below reuses St's pointer info and alignment
// unchanged even though BasePtr advances by Increment each iteration --
// confirm this is intended.
16728 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16729 for (unsigned I = 0; I < E; I++) {
16730 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16731 ShuffWide, DAG.getIntPtrConstant(I, DL));
16732 SDValue Ch =
16733 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16734 St->getAlign(), St->getMemOperand()->getFlags());
16735 BasePtr =
16736 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16737 Chains.push_back(Ch);
16738 }
16739 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16740}
16741
16742// Try taking a single vector store from an fpround (which would otherwise turn
16743// into an expensive buildvector) and splitting it into a series of narrowing
16744// stores.
// Only simple, unindexed, non-truncating stores of an FP_ROUND from f32 to f16
// elements are handled, and the source element count must be a multiple of 4.
// Returns a TokenFactor over the new stores, or an empty SDValue otherwise.
16746 SelectionDAG &DAG) {
16747 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16748 return SDValue();
16749 SDValue Trunc = St->getValue();
16750 if (Trunc->getOpcode() != ISD::FP_ROUND)
16751 return SDValue();
16752 EVT FromVT = Trunc->getOperand(0).getValueType();
16753 EVT ToVT = Trunc.getValueType();
16754 if (!ToVT.isVector())
16755 return SDValue();
16757 EVT ToEltVT = ToVT.getVectorElementType();
16758 EVT FromEltVT = FromVT.getVectorElementType();
16759
16760 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16761 return SDValue();
16762
// Each slice converted and stored below holds this many source elements.
16763 unsigned NumElements = 4;
16764 if (FromVT.getVectorNumElements() % NumElements != 0)
16765 return SDValue();
16766
16767 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16768 // use the VMOVN over splitting the store. We are looking for patterns of:
16769 // !rev: 0 N 1 N+1 2 N+2 ...
16770 // rev: N 0 N+1 1 N+2 2 ...
16771 // The shuffle may either be a single source (in which case N = NumElts/2) or
16772 // two inputs extended with concat to the same size (in which case N =
16773 // NumElts).
16774 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16775 ArrayRef<int> M = SVN->getMask();
16776 unsigned NumElts = ToVT.getVectorNumElements();
16777 if (SVN->getOperand(1).isUndef())
16778 NumElts /= 2;
16779
// Expected base index for even (Off0) and odd (Off1) mask positions.
16780 unsigned Off0 = Rev ? NumElts : 0;
16781 unsigned Off1 = Rev ? 0 : NumElts;
16782
// Negative mask entries are accepted in any position.
16783 for (unsigned I = 0; I < NumElts; I += 2) {
16784 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16785 return false;
16786 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16787 return false;
16788 }
16789
16790 return true;
16791 };
16792
// Leave VMOVN-style shuffles alone so they are not split here.
16793 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16794 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16795 return SDValue();
16796
16797 LLVMContext &C = *DAG.getContext();
16798 SDLoc DL(St);
16799 // Details about the old store
16800 SDValue Ch = St->getChain();
16801 SDValue BasePtr = St->getBasePtr();
16802 Align Alignment = St->getBaseAlign();
16804 AAMDNodes AAInfo = St->getAAInfo();
16805
16806 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16807 // and then stored as truncating integer stores.
16808 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16809 EVT NewToVT = EVT::getVectorVT(
16810 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16811
16813 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
// Byte offset of slice i within the destination memory.
16814 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16815 SDValue NewPtr =
16816 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16817
16818 SDValue Extract =
16819 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16820 DAG.getConstant(i * NumElements, DL, MVT::i32));
16821
// Convert the slice with VCVTN into an undef v8f16, then view the result as
// v4i32 so it can be written with a truncating integer store.
16822 SDValue FPTrunc =
16823 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16824 Extract, DAG.getConstant(0, DL, MVT::i32));
16825 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16826
16827 SDValue Store = DAG.getTruncStore(
16828 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16829 NewToVT, Alignment, MMOFlags, AAInfo);
16830 Stores.push_back(Store);
16831 }
16832 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16833}
16834
16835// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16836// into an expensive buildvector) and splitting it into a series of narrowing
16837// stores.
// Only simple, unindexed, non-truncating stores are handled. One truncating
// store is emitted per MVETRUNC operand and the stores are joined with a
// TokenFactor; otherwise an empty SDValue is returned.
16839 SelectionDAG &DAG) {
16840 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16841 return SDValue();
16842 SDValue Trunc = St->getValue();
16843 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16844 return SDValue();
16845 EVT FromVT = Trunc->getOperand(0).getValueType();
16846 EVT ToVT = Trunc.getValueType();
16847
16848 LLVMContext &C = *DAG.getContext();
16849 SDLoc DL(St);
16850 // Details about the old store
16851 SDValue Ch = St->getChain();
16852 SDValue BasePtr = St->getBasePtr();
16853 Align Alignment = St->getBaseAlign();
16855 AAMDNodes AAInfo = St->getAAInfo();
16856
// Memory type of each narrow store: the destination element type with the
// element count of one source operand.
16857 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16858 FromVT.getVectorNumElements());
16859
16861 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
// Byte offset at which operand i's truncated elements are written.
16862 unsigned NewOffset =
16863 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16864 SDValue NewPtr =
16865 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16866
16867 SDValue Extract = Trunc.getOperand(i);
16868 SDValue Store = DAG.getTruncStore(
16869 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16870 NewToVT, Alignment, MMOFlags, AAInfo);
16871 Stores.push_back(Store);
16872 }
16873 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16874}
16875
16876// Given a floating point store from an extracted vector, with an integer
16877// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16878// help reduce fp register pressure, doesn't require the fp extract and allows
16879// use of more integer post-inc stores not available with vstr.
// Returns the replacement truncating store, or an empty SDValue when the store
// is not simple/unindexed, the value is not an f16 EXTRACT_VECTOR_ELT, or no
// matching VGETLANEu node exists in the DAG.
16881 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16882 return SDValue();
16883 SDValue Extract = St->getValue();
16884 EVT VT = Extract.getValueType();
16885 // For now only uses f16. This may be useful for f32 too, but that will
16886 // be bitcast(extract), not the VGETLANEu we currently check here.
16887 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16888 return SDValue();
16889
// Look up an existing i32 VGETLANEu with the same vector and lane operands;
// no new node is created by this query.
16890 SDNode *GetLane =
16891 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16892 {Extract.getOperand(0), Extract.getOperand(1)});
16893 if (!GetLane)
16894 return SDValue();
16895
16896 LLVMContext &C = *DAG.getContext();
16897 SDLoc DL(St);
16898 // Create a new integer store to replace the existing floating point version.
16899 SDValue Ch = St->getChain();
16900 SDValue BasePtr = St->getBasePtr();
16901 Align Alignment = St->getBaseAlign();
16903 AAMDNodes AAInfo = St->getAAInfo();
// The i32 lane value is stored truncated to an integer as wide as VT.
16904 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16905 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16906 St->getPointerInfo(), NewToVT, Alignment,
16907 MMOFlags, AAInfo);
16908
16909 return Store;
16910}
16911
16912/// PerformSTORECombine - Target-specific dag combine xforms for
16913/// ISD::STORE.
/// Volatile stores are never touched. The helper combines are tried first
/// (gated on NEON / MVE subtarget features); then, for normal stores only:
/// splitting a store of a single-use VMOVDRR into two integer stores,
/// rewriting an i64 store of an extracted vector element as an f64 extract,
/// and finally the base-update combine for vector stores on NEON.
/// Returns an empty SDValue when nothing applies.
16916 const ARMSubtarget *Subtarget) {
16918 if (St->isVolatile())
16919 return SDValue();
16920 SDValue StVal = St->getValue();
16921 EVT VT = StVal.getValueType();
16922
16923 if (Subtarget->hasNEON())
16924 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16925 return Store;
16926
16927 if (Subtarget->hasMVEFloatOps())
16928 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16929 return NewToken;
16930
16931 if (Subtarget->hasMVEIntegerOps()) {
16932 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16933 return NewChain;
16934 if (SDValue NewToken =
16936 return NewToken;
16937 }
16938
// Everything below applies to normal stores only.
16939 if (!ISD::isNormalStore(St))
16940 return SDValue();
16941
16942 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16943 // ARM stores of arguments in the same cache line.
16944 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16945 StVal.getNode()->hasOneUse()) {
16946 SelectionDAG &DAG = DCI.DAG;
16947 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16948 SDLoc DL(St);
16949 SDValue BasePtr = St->getBasePtr();
// On big-endian targets operand 1 is stored at the base address and operand 0
// at base + 4; little-endian uses the opposite order.
16950 SDValue NewST1 = DAG.getStore(
16951 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16952 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16953 St->getMemOperand()->getFlags());
16954
16955 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16956 DAG.getConstant(4, DL, MVT::i32));
// The second store is chained after the first one.
16957 return DAG.getStore(NewST1.getValue(0), DL,
16958 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16959 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16960 St->getBaseAlign(), St->getMemOperand()->getFlags());
16961 }
16962
16963 if (StVal.getValueType() == MVT::i64 &&
16965
16966 // Bitcast an i64 store extracted from a vector to f64.
16967 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16968 SelectionDAG &DAG = DCI.DAG;
16969 SDLoc dl(StVal);
16970 SDValue IntVec = StVal.getOperand(0);
16971 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16973 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16974 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16975 Vec, StVal.getOperand(1));
// From here on, new nodes use the store's own debug location.
16976 dl = SDLoc(N);
16977 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16978 // Make the DAGCombiner fold the bitcasts.
16979 DCI.AddToWorklist(Vec.getNode());
16980 DCI.AddToWorklist(ExtElt.getNode());
16981 DCI.AddToWorklist(V.getNode());
16982 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16983 St->getPointerInfo(), St->getAlign(),
16984 St->getMemOperand()->getFlags(), St->getAAInfo());
16985 }
16986
16987 // If this is a legal vector store, try to combine it into a VST1_UPD.
16988 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16990 return CombineBaseUpdate(N, DCI);
16991
16992 return SDValue();
16993}
16994
16995/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16996/// can replace combinations of VMUL and VCVT (floating-point to integer)
16997/// when the VMUL has a constant operand that is a power of 2.
16998///
16999/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
17000/// vmul.f32 d16, d17, d16
17001/// vcvt.s32.f32 d16, d16
17002/// becomes:
17003/// vcvt.s32.f32 d16, d16, #3
///
/// Requires NEON, a simple vector FMUL source whose second operand is a build
/// vector, f32 elements, an integer result of at most 32 bits per element and
/// 2 or 4 lanes. Returns an empty SDValue otherwise.
17005 const ARMSubtarget *Subtarget) {
17006 if (!Subtarget->hasNEON())
17007 return SDValue();
17008
17009 SDValue Op = N->getOperand(0);
17010 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17011 Op.getOpcode() != ISD::FMUL)
17012 return SDValue();
17013
// Only the second FMUL operand is inspected for the constant.
17014 SDValue ConstVec = Op->getOperand(1);
17015 if (!isa<BuildVectorSDNode>(ConstVec))
17016 return SDValue();
17017
17018 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17019 uint32_t FloatBits = FloatTy.getSizeInBits();
17020 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17021 uint32_t IntBits = IntTy.getSizeInBits();
17022 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17023 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17024 // These instructions only exist converting from f32 to i32. We can handle
17025 // smaller integers by generating an extra truncate, but larger ones would
17026 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17027 // these instructions only support v2i32/v4i32 types.
17028 return SDValue();
17029 }
17030
17031 BitVector UndefElements;
// C is the log2 of the splat multiplier and becomes the fixed-point operand of
// the intrinsic built below. A value of -1 presumably means no power-of-two
// splat was found (TODO confirm against the helper); 0 and values above 32 are
// also rejected.
17033 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
17034 if (C == -1 || C == 0 || C > 32)
17035 return SDValue();
17036
17037 SDLoc dl(N);
17038 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
17039 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
17040 Intrinsic::arm_neon_vcvtfp2fxu;
// The intrinsic consumes the FMUL's first operand directly, so the multiply
// is bypassed.
17041 SDValue FixConv = DAG.getNode(
17042 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
17043 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
17044 DAG.getConstant(C, dl, MVT::i32));
17045
// Narrower integer results are produced by truncating the 32-bit conversion.
17046 if (IntBits < FloatBits)
17047 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
17048
17049 return FixConv;
17050}
17051
17053 const ARMSubtarget *Subtarget) {
// FADD combine for targets with MVE float ops: rewrites an fadd whose operand
// is a vselect with an identity-splat false value into a vselect of the fadd.
// Returns the new VSELECT, or an empty SDValue if the pattern does not match.
17054 if (!Subtarget->hasMVEFloatOps())
17055 return SDValue();
17056
17057 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
17058 // The second form can be more easily turned into a predicated vadd, and
17059 // possibly combined into a fma to become a predicated vfma.
17060 SDValue Op0 = N->getOperand(0);
17061 SDValue Op1 = N->getOperand(1);
17062 EVT VT = N->getValueType(0);
17063 SDLoc DL(N);
17064
17065 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
17066 // which these VMOV's represent.
// Only a BITCAST of a VMOVIMM is recognised; the immediate operand is compared
// against fixed encodings per vector type (1664 for v4f32, 2688 for v8f16, or
// 0 when NSZ is set).
17067 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
17068 if (Op.getOpcode() != ISD::BITCAST ||
17069 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
17070 return false;
17071 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
17072 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
17073 return true;
17074 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
17075 return true;
17076 return false;
17077 };
17078
// Canonicalise so that a lone VSELECT operand ends up in Op1.
17079 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
17080 std::swap(Op0, Op1);
17081
17082 if (Op1.getOpcode() != ISD::VSELECT)
17083 return SDValue();
17084
17085 SDNodeFlags FaddFlags = N->getFlags();
17086 bool NSZ = FaddFlags.hasNoSignedZeros();
// The false value of the select (operand 2) must be the identity splat.
17087 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
17088 return SDValue();
17089
// Both new nodes inherit the original FADD's flags.
17090 SDValue FAdd =
17091 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
17092 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
17093}
17094
// Reassociate an fadd with an MVE vcmlaq intrinsic operand so that the add is
// folded into the intrinsic's operand 2. Only done when the fadd carries the
// reassociation flag; returns an empty SDValue otherwise.
17096 SDValue LHS = N->getOperand(0);
17097 SDValue RHS = N->getOperand(1);
17098 EVT VT = N->getValueType(0);
17099 SDLoc DL(N);
17100
17101 if (!N->getFlags().hasAllowReassociation())
17102 return SDValue();
17103
17104 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(b, a), c, d)
// A is the candidate intrinsic node, B the other fadd operand. Operand 0 of A
// is the intrinsic ID; operands 1, 3 and 4 are passed through unchanged.
17105 auto ReassocComplex = [&](SDValue A, SDValue B) {
17106 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17107 return SDValue();
17108 unsigned Opc = A.getConstantOperandVal(0);
17109 if (Opc != Intrinsic::arm_mve_vcmlaq)
17110 return SDValue();
17111 SDValue VCMLA = DAG.getNode(
17112 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
17113 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
17114 A.getOperand(3), A.getOperand(4));
// The new intrinsic keeps the original intrinsic's flags; the inner FADD
// takes the flags of N.
17115 VCMLA->setFlags(A->getFlags());
17116 return VCMLA;
17117 };
// Try the intrinsic on either side of the fadd.
17118 if (SDValue R = ReassocComplex(LHS, RHS))
17119 return R;
17120 if (SDValue R = ReassocComplex(RHS, LHS))
17121 return R;
17122
17123 return SDValue();
17124}
17125
17127 const ARMSubtarget *Subtarget) {
// Try the individual FADD combines in order and return the first result that
// is non-empty; an empty SDValue means neither applied.
17128 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17129 return S;
17130 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17131 return S;
17132 return SDValue();
17133}
17134
17135/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17136/// can replace combinations of VCVT (integer to floating-point) and VMUL
17137/// when the VMUL has a constant operand that is a power of 2.
17138///
17139/// Example (assume d17 = <float 0.125, float 0.125>):
17140/// vcvt.f32.s32 d16, d16
17141/// vmul.f32 d16, d16, d17
17142/// becomes:
17143/// vcvt.f32.s32 d16, d16, #3
///
/// Requires NEON, a simple vector result, a SINT_TO_FP/UINT_TO_FP as the first
/// operand, a build vector as the second operand, f32 elements, an integer
/// source of at most 32 bits per element and 2 or 4 lanes. Returns an empty
/// SDValue otherwise.
17145 const ARMSubtarget *Subtarget) {
17146 if (!Subtarget->hasNEON())
17147 return SDValue();
17148
17149 SDValue Op = N->getOperand(0);
17150 unsigned OpOpcode = Op.getNode()->getOpcode();
17151 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17152 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17153 return SDValue();
17154
// Only the second multiply operand is inspected for the constant.
17155 SDValue ConstVec = N->getOperand(1);
17156 if (!isa<BuildVectorSDNode>(ConstVec))
17157 return SDValue();
17158
17159 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17160 uint32_t FloatBits = FloatTy.getSizeInBits();
17161 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17162 uint32_t IntBits = IntTy.getSizeInBits();
17163 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17164 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17165 // These instructions only exist converting from i32 to f32. We can handle
17166 // smaller integers by generating an extra extend, but larger ones would
17167 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17168 // these instructions only support v2i32/v4i32 types.
17169 return SDValue();
17170 }
17171
// The multiplier must be a constant (or constant splat) with an exact
// reciprocal; the reciprocal is what gets converted to an integer below.
17172 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17173 APFloat Recip(0.0f);
17174 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17175 return SDValue();
17176
// The reciprocal must convert to an integer exactly (33-bit APSInt).
17177 bool IsExact;
17178 APSInt IntVal(33);
17179 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17180 APFloat::opOK ||
17181 !IsExact)
17182 return SDValue();
17183
// C is the exact log2 of the reciprocal and becomes the fixed-point operand of
// the intrinsic. -1 presumably means "not a power of two" (TODO confirm); 0
// and values above 32 are also rejected.
17184 int32_t C = IntVal.exactLogBase2();
17185 if (C == -1 || C == 0 || C > 32)
17186 return SDValue();
17187
17188 SDLoc DL(N);
17189 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17190 SDValue ConvInput = Op.getOperand(0);
// Sources narrower than 32 bits are first widened to v2i32/v4i32.
17191 if (IntBits < FloatBits)
17193 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17194
17195 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17196 : Intrinsic::arm_neon_vcvtfxu2fp;
17197 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17198 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17199 DAG.getConstant(C, DL, MVT::i32));
17200}
17201
17203 const ARMSubtarget *ST) {
// DAG combine for an ISD::VECREDUCE_ADD node N (asserted below) on
// subtargets with MVE integer ops. It tries to rewrite reductions of
// extended, multiplied and/or zero-predicated inputs into the ARMISD
// VADDV / VADDLV / VMLAV / VMLALV node family. Returns the replacement
// value, or an empty SDValue when no pattern matches.
// NOTE(review): the signature line carrying the function name and the
// leading parameters (N and DAG are used below) is not visible in this
// listing.
17204 if (!ST->hasMVEIntegerOps())
17205 return SDValue();
17206
17207 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17208 EVT ResVT = N->getValueType(0);
17209 SDValue N0 = N->getOperand(0);
17210 SDLoc dl(N);
17211
17212 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17213 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17214 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17215 N0.getValueType() == MVT::v16i8)) {
17216 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17217 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17218 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17219 }
17220
17221 // We are looking for something that will have illegal types if left alone,
17222 // but that we can convert to a single instruction under MVE. For example
17223 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17224 // or
17225 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17226
17227 // The legal cases are:
17228 // VADDV u/s 8/16/32
17229 // VMLAV u/s 8/16/32
17230 // VADDLV u/s 32
17231 // VMLALV u/s 16/32
17232
17233 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17234 // extend it and use v4i32 instead.
// ExtTypeMatches: true when A has the same lane count as one of ExtTypes and
// is no wider than that type.
17235 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17236 EVT AVT = A.getValueType();
17237 return any_of(ExtTypes, [&](MVT Ty) {
17238 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17239 AVT.bitsLE(Ty);
17240 });
17241 };
// ExtendIfNeeded: when A is not already a 128-bit vector, wrap it in an
// ExtendCode node; otherwise return A unchanged.
// NOTE(review): the lines that spell out the destination type of the extend
// are not visible in this listing.
17242 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17243 EVT AVT = A.getValueType();
17244 if (!AVT.is128BitVector())
17245 A = DAG.getNode(
17246 ExtendCode, dl,
17248 *DAG.getContext(),
17250 A);
17251 return A;
17252 };
// IsVADDV: matches N0 == ExtendCode(A) with result type RetTy and A of an
// accepted type. Returns the (possibly widened) A, or an empty SDValue.
17253 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17254 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17255 return SDValue();
17256 SDValue A = N0->getOperand(0);
17257 if (ExtTypeMatches(A, ExtTypes))
17258 return ExtendIfNeeded(A, ExtendCode);
17259 return SDValue();
17260 };
// IsPredVADDV: as IsVADDV, but N0 is a VSELECT whose operand 1 is the extend.
// Operand 0 of the select is written to Mask before the extend is checked,
// so Mask may be modified even when the match fails.
// NOTE(review): the final clause of the guarding condition (the line after
// the VSELECT test) is not visible in this listing.
17261 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17262 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17263 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17265 return SDValue();
17266 Mask = N0->getOperand(0);
17267 SDValue Ext = N0->getOperand(1);
17268 if (Ext->getOpcode() != ExtendCode)
17269 return SDValue();
17270 SDValue A = Ext->getOperand(0);
17271 if (ExtTypeMatches(A, ExtTypes))
17272 return ExtendIfNeeded(A, ExtendCode);
17273 return SDValue();
17274 };
// IsVMLAV: matches a multiply of two ExtendCode nodes feeding the reduction.
// On success A and B receive the (possibly widened) multiplicands and true is
// returned. A and B are assigned before the type check, so they may be
// modified even when false is returned.
17275 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17276 SDValue &A, SDValue &B) {
17277 // For a vmla we are trying to match a larger pattern:
17278 // ExtA = sext/zext A
17279 // ExtB = sext/zext B
17280 // Mul = mul ExtA, ExtB
17281 // vecreduce.add Mul
17282 // There might also be an extra extend between the mul and the addreduce, so
17283 // long as the bitwidth is high enough to make them equivalent (for example
17284 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17285 if (ResVT != RetTy)
17286 return false;
17287 SDValue Mul = N0;
// Look through an outer extend only when the mul's element width is at least
// half of the reduction result width.
17288 if (Mul->getOpcode() == ExtendCode &&
17289 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17290 ResVT.getScalarSizeInBits())
17291 Mul = Mul->getOperand(0);
17292 if (Mul->getOpcode() != ISD::MUL)
17293 return false;
17294 SDValue ExtA = Mul->getOperand(0);
17295 SDValue ExtB = Mul->getOperand(1);
17296 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17297 return false;
17298 A = ExtA->getOperand(0);
17299 B = ExtB->getOperand(0);
17300 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17301 A = ExtendIfNeeded(A, ExtendCode);
17302 B = ExtendIfNeeded(B, ExtendCode);
17303 return true;
17304 }
17305 return false;
17306 };
// IsPredVMLAV: predicated variant of IsVMLAV; additionally outputs the select
// mask through Mask.
// NOTE(review): the final clause of the guarding condition (the line after
// the VSELECT test) is not visible in this listing.
17307 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17308 SDValue &A, SDValue &B, SDValue &Mask) {
17309 // Same as the pattern above with a select for the zero predicated lanes
17310 // ExtA = sext/zext A
17311 // ExtB = sext/zext B
17312 // Mul = mul ExtA, ExtB
17313 // N0 = select Mask, Mul, 0
17314 // vecreduce.add N0
17315 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17317 return false;
17318 Mask = N0->getOperand(0);
17319 SDValue Mul = N0->getOperand(1);
17320 if (Mul->getOpcode() == ExtendCode &&
17321 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17322 ResVT.getScalarSizeInBits())
17323 Mul = Mul->getOperand(0);
17324 if (Mul->getOpcode() != ISD::MUL)
17325 return false;
17326 SDValue ExtA = Mul->getOperand(0);
17327 SDValue ExtB = Mul->getOperand(1);
17328 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17329 return false;
17330 A = ExtA->getOperand(0);
17331 B = ExtB->getOperand(0);
17332 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17333 A = ExtendIfNeeded(A, ExtendCode);
17334 B = ExtendIfNeeded(B, ExtendCode);
17335 return true;
17336 }
17337 return false;
17338 };
// Create64bitNode: builds a node with two i32 results from Opcode/Ops and
// recombines them into a single i64 with BUILD_PAIR. v16i8 operands take the
// split path described below.
17339 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17340 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17341 // reductions. The operands are extended with MVEEXT, but as they are
17342 // reductions the lane orders do not matter. MVEEXT may be combined with
17343 // loads to produce two extending loads, or else they will be expanded to
17344 // VREV/VMOVL.
17345 EVT VT = Ops[0].getValueType();
17346 if (VT == MVT::v16i8) {
17347 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17348 "Unexpected illegal long reduction opcode");
17349 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17350
// Each MVEZEXT/MVESEXT node yields two v8i16 results.
17351 SDValue Ext0 =
17352 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17353 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17354 SDValue Ext1 =
17355 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17356 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17357
// MLA0 reduces the first results of the two extends; MLA1 takes MLA0's two
// i32 results as its leading operands and reduces the second results.
17358 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17359 Ext0, Ext1);
17360 SDValue MLA1 =
17361 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17362 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17363 Ext0.getValue(1), Ext1.getValue(1));
17364 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17365 }
17366 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17367 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17368 SDValue(Node.getNode(), 1));
17369 };
17370
// Pattern dispatch: the first matcher that succeeds returns its node.
// Unpredicated multiply-accumulate reductions first; i16 results are computed
// at i32 and truncated.
17371 SDValue A, B;
17372 SDValue Mask;
17373 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17374 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17375 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17376 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17377 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17378 A, B))
17379 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17380 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17381 A, B))
17382 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17383 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17384 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17385 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17386 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17387 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17388 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17389
// Predicated multiply-accumulate reductions. The i64 forms here accept only
// v8i16/v4i32 inputs (no v16i8, unlike the unpredicated i64 forms above).
17390 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17391 Mask))
17392 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17393 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17394 Mask))
17395 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17396 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17397 Mask))
17398 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17399 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17400 Mask))
17401 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17402 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17403 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17404 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17405 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17406 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17407 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17408
// Unpredicated add reductions of an extended input.
17409 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17410 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17411 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17412 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17413 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17414 return Create64bitNode(ARMISD::VADDLVs, {A});
17415 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17416 return Create64bitNode(ARMISD::VADDLVu, {A});
17417 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17418 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17419 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17420 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17421 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17422 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17423
// Predicated add reductions of an extended input.
17424 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17425 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17426 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17427 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17428 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17429 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17430 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17431 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17432 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17433 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17434 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17435 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17436 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17437 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17438
17439 // Some complications. We can get a case where the two inputs of the mul are
17440 // the same, then the output sext will have been helpfully converted to a
17441 // zext. Turn it back.
// Op is N0 itself, or operand 1 of N0 when N0 is a VSELECT.
17442 SDValue Op = N0;
17443 if (Op->getOpcode() == ISD::VSELECT)
17444 Op = Op->getOperand(1);
17445 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17446 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17447 SDValue Mul = Op->getOperand(0);
17448 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17449 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
// Rebuild the extend as SIGN_EXTEND, re-wrap it in the original select when
// there was one, and emit a fresh VECREDUCE_ADD over the result.
17450 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17451 if (Op != N0)
17452 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17453 N0->getOperand(0), Ext, N0->getOperand(2));
17454 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17455 }
17456 }
17457
17458 return SDValue();
17459}
17460
17461// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17462// the lanes are used. Due to the reduction being commutative the shuffle can be
17463// removed.
// Returns a node with N's opcode and value types whose vector operands are
// replaced by operand 0 of each operand (the shuffle input for the checked
// operands); returns an empty SDValue when the pattern does not match.
// NOTE(review): the signature line is not visible in this listing; N and DAG
// are used below.
// The vector input is operand 0 when that operand is a vector, otherwise
// operand 2.
17465 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17466 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17467 if (!Shuf || !Shuf->getOperand(1).isUndef())
17468 return SDValue();
17469
17470 // Check all elements are used once in the mask.
17471 ArrayRef<int> Mask = Shuf->getMask();
17472 APInt SetElts(Mask.size(), 0);
17473 for (int E : Mask) {
// Negative (undef) indices and indices past the first input disqualify the
// shuffle.
17474 if (E < 0 || E >= (int)Mask.size())
17475 return SDValue();
17476 SetElts.setBit(E);
17477 }
17478 if (!SetElts.isAllOnes())
17479 return SDValue();
17480
// When another operand follows the vector operand, it must be a single-input
// shuffle using exactly the same mask.
17481 if (N->getNumOperands() != VecOp + 1) {
17482 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17483 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17484 return SDValue();
17485 }
17486
// Rebuild the operand list: every vector operand is replaced by its own
// operand 0, all other operands are kept as they are.
// NOTE(review): the declaration of Ops is not visible in this listing.
17488 for (SDValue Op : N->ops()) {
17489 if (Op.getValueType().isVector())
17490 Ops.push_back(Op.getOperand(0));
17491 else
17492 Ops.push_back(Op);
17493 }
17494 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17495}
17496
// Combine for a node with operands (Op0, Op1, IsTop), described by the
// comments below as VMOVNT/VMOVNB. It drops the node when an input is undef,
// folds a bottom VQMOVN feeding operand 1, and otherwise tries to simplify
// both inputs based on which lanes are demanded.
// NOTE(review): the signature lines are not visible in this listing; N and
// DCI are used below.
17499 SDValue Op0 = N->getOperand(0);
17500 SDValue Op1 = N->getOperand(1);
17501 unsigned IsTop = N->getConstantOperandVal(2);
17502
17503 // VMOVNT a undef -> a
17504 // VMOVNB a undef -> a
17505 // VMOVNB undef a -> a
17506 if (Op1->isUndef())
17507 return Op0;
17508 if (Op0->isUndef() && !IsTop)
17509 return Op1;
17510
17511 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17512 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17513 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17514 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17515 Op1->getConstantOperandVal(2) == 0)
17516 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17517 Op0, Op1->getOperand(1), N->getOperand(2));
17518
17519 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17520 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17521 // into the top or bottom lanes.
// Op1DemandedElts repeats the 2-bit pattern 01 (even-numbered lanes); the
// alternative mask for Op0 repeats 10 (odd-numbered lanes).
17522 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17523 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17524 APInt Op0DemandedElts =
17525 IsTop ? Op1DemandedElts
17526 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17527
// When a simplification fires, N itself is returned.
17528 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17529 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17530 return SDValue(N, 0);
17531 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17532 return SDValue(N, 0);
17533
17534 return SDValue();
17535}
17536
// Tries to simplify operand 0 of N using only the lanes selected by the
// constant top/bottom flag in operand 2. Returns N itself when a
// simplification fired, otherwise an empty SDValue.
// NOTE(review): the signature lines are not visible in this listing; N and
// DCI are used below.
17539 SDValue Op0 = N->getOperand(0);
17540 unsigned IsTop = N->getConstantOperandVal(2);
17541
// Demanded lanes of Op0: the repeated 2-bit pattern 01 (even-numbered lanes)
// when IsTop is set, else 10 (odd-numbered lanes).
17542 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17543 APInt Op0DemandedElts =
17544 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17545 : APInt::getHighBitsSet(2, 1));
17546
17547 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17548 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17549 return SDValue(N, 0);
17550 return SDValue();
17551}
17552
// Sinks a pair of identical single-input shuffles below the binary node N:
// op(shuffle(a), shuffle(b)) becomes shuffle(op(a, b)) with the same mask.
// Returns an empty SDValue when the operands do not have that shape.
// NOTE(review): the signature lines are not visible in this listing; N and
// DCI are used below.
17555 EVT VT = N->getValueType(0);
17556 SDValue LHS = N->getOperand(0);
17557 SDValue RHS = N->getOperand(1);
17558
17559 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17560 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17561 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
// Requires equal masks, an undef second input on both shuffles, and that at
// least one shuffle has a single use (or both operands are the same value).
17562 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17563 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17564 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17565 SDLoc DL(N);
17566 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17567 LHS.getOperand(0), RHS.getOperand(0));
// Reuse the existing undef operand as the second input of the new shuffle.
17568 SDValue UndefV = LHS.getOperand(1);
17569 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17570 }
17571 return SDValue();
17572}
17573
// Combine for ARMISD::LSLL / ARMISD::LSRL nodes (the two opcodes swapped
// below) with operands (Op0, Op1, shift amount). A constant shift of zero is
// replaced by the two inputs; a constant negative shift in [-32, -1] is
// replaced by the opposite shift with the negated amount.
// NOTE(review): the signature line is not visible in this listing; N and DAG
// are used below.
17575 SDLoc DL(N);
17576 SDValue Op0 = N->getOperand(0);
17577 SDValue Op1 = N->getOperand(1);
17578
17579 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17580 // uses of the intrinsics.
17581 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17582 int ShiftAmt = C->getSExtValue();
17583 if (ShiftAmt == 0) {
// A shift by zero is the identity: forward both inputs to N's users. The
// replacement is done here directly, so an empty SDValue is returned.
17584 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17585 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17586 return SDValue();
17587 }
17588
17589 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17590 unsigned NewOpcode =
17591 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17592 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17593 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17594 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17595 return NewShift;
17596 }
17597 }
17598
17599 return SDValue();
17600}
17601
17602/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
/// Dispatches on the intrinsic ID held in operand 0 of N. Returns the
/// replacement node, or an empty SDValue when nothing is done.
// NOTE(review): the first signature line is not visible in this listing; N is
// used below.
17604 DAGCombinerInfo &DCI) const {
17605 SelectionDAG &DAG = DCI.DAG;
17606 unsigned IntNo = N->getConstantOperandVal(0);
17607 switch (IntNo) {
17608 default:
17609 // Don't do anything for most intrinsics.
17610 break;
17611
17612 // Vector shifts: check for immediate versions and lower them.
17613 // Note: This is done during DAG combining instead of DAG legalizing because
17614 // the build_vectors for 64-bit vector element shift counts are generally
17615 // not legal, and it is hard to see their values after they get legalized to
17616 // loads from a constant pool.
17617 case Intrinsic::arm_neon_vshifts:
17618 case Intrinsic::arm_neon_vshiftu:
17619 case Intrinsic::arm_neon_vrshifts:
17620 case Intrinsic::arm_neon_vrshiftu:
17621 case Intrinsic::arm_neon_vrshiftn:
17622 case Intrinsic::arm_neon_vqshifts:
17623 case Intrinsic::arm_neon_vqshiftu:
17624 case Intrinsic::arm_neon_vqshiftsu:
17625 case Intrinsic::arm_neon_vqshiftns:
17626 case Intrinsic::arm_neon_vqshiftnu:
17627 case Intrinsic::arm_neon_vqshiftnsu:
17628 case Intrinsic::arm_neon_vqrshiftns:
17629 case Intrinsic::arm_neon_vqrshiftnu:
17630 case Intrinsic::arm_neon_vqrshiftnsu: {
17631 EVT VT = N->getOperand(1).getValueType();
17632 int64_t Cnt;
17633 unsigned VShiftOpc = 0;
17634
// First switch: validate the shift-count operand and extract the immediate
// into Cnt. Only vshifts/vshiftu choose their opcode here; the remaining
// intrinsics get theirs in the second switch.
17635 switch (IntNo) {
17636 case Intrinsic::arm_neon_vshifts:
17637 case Intrinsic::arm_neon_vshiftu:
17638 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17639 VShiftOpc = ARMISD::VSHLIMM;
17640 break;
17641 }
17642 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17643 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17644 : ARMISD::VSHRuIMM);
17645 break;
17646 }
17647 return SDValue();
17648
17649 case Intrinsic::arm_neon_vrshifts:
17650 case Intrinsic::arm_neon_vrshiftu:
17651 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17652 break;
17653 return SDValue();
17654
17655 case Intrinsic::arm_neon_vqshifts:
17656 case Intrinsic::arm_neon_vqshiftu:
17657 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17658 break;
17659 return SDValue();
17660
// For vqshiftsu and the narrowing shifts below, a count that is not a valid
// immediate is treated as unreachable rather than left for later handling.
17661 case Intrinsic::arm_neon_vqshiftsu:
17662 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17663 break;
17664 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17665
17666 case Intrinsic::arm_neon_vrshiftn:
17667 case Intrinsic::arm_neon_vqshiftns:
17668 case Intrinsic::arm_neon_vqshiftnu:
17669 case Intrinsic::arm_neon_vqshiftnsu:
17670 case Intrinsic::arm_neon_vqrshiftns:
17671 case Intrinsic::arm_neon_vqrshiftnu:
17672 case Intrinsic::arm_neon_vqrshiftnsu:
17673 // Narrowing shifts require an immediate right shift.
17674 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17675 break;
17676 llvm_unreachable("invalid shift count for narrowing vector shift "
17677 "intrinsic");
17678
17679 default:
17680 llvm_unreachable("unhandled vector shift");
17681 }
17682
// Second switch: map each intrinsic to its immediate-form ARMISD opcode.
17683 switch (IntNo) {
17684 case Intrinsic::arm_neon_vshifts:
17685 case Intrinsic::arm_neon_vshiftu:
17686 // Opcode already set above.
17687 break;
17688 case Intrinsic::arm_neon_vrshifts:
17689 VShiftOpc = ARMISD::VRSHRsIMM;
17690 break;
17691 case Intrinsic::arm_neon_vrshiftu:
17692 VShiftOpc = ARMISD::VRSHRuIMM;
17693 break;
17694 case Intrinsic::arm_neon_vrshiftn:
17695 VShiftOpc = ARMISD::VRSHRNIMM;
17696 break;
17697 case Intrinsic::arm_neon_vqshifts:
17698 VShiftOpc = ARMISD::VQSHLsIMM;
17699 break;
17700 case Intrinsic::arm_neon_vqshiftu:
17701 VShiftOpc = ARMISD::VQSHLuIMM;
17702 break;
17703 case Intrinsic::arm_neon_vqshiftsu:
17704 VShiftOpc = ARMISD::VQSHLsuIMM;
17705 break;
17706 case Intrinsic::arm_neon_vqshiftns:
17707 VShiftOpc = ARMISD::VQSHRNsIMM;
17708 break;
17709 case Intrinsic::arm_neon_vqshiftnu:
17710 VShiftOpc = ARMISD::VQSHRNuIMM;
17711 break;
17712 case Intrinsic::arm_neon_vqshiftnsu:
17713 VShiftOpc = ARMISD::VQSHRNsuIMM;
17714 break;
17715 case Intrinsic::arm_neon_vqrshiftns:
17716 VShiftOpc = ARMISD::VQRSHRNsIMM;
17717 break;
17718 case Intrinsic::arm_neon_vqrshiftnu:
17719 VShiftOpc = ARMISD::VQRSHRNuIMM;
17720 break;
17721 case Intrinsic::arm_neon_vqrshiftnsu:
17722 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17723 break;
17724 }
17725
// Build the immediate shift node: the shifted value plus the count as an i32
// constant.
17726 SDLoc dl(N);
17727 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17728 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17729 }
17730
// Shift-and-insert: operand 3 is the shift count. A valid left-shift
// immediate selects VSLIIMM, a valid right-shift immediate selects VSRIIMM.
17731 case Intrinsic::arm_neon_vshiftins: {
17732 EVT VT = N->getOperand(1).getValueType();
17733 int64_t Cnt;
17734 unsigned VShiftOpc = 0;
17735
17736 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17737 VShiftOpc = ARMISD::VSLIIMM;
17738 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17739 VShiftOpc = ARMISD::VSRIIMM;
17740 else {
17741 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17742 }
17743
17744 SDLoc dl(N);
17745 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17746 N->getOperand(1), N->getOperand(2),
17747 DAG.getConstant(Cnt, dl, MVT::i32));
17748 }
17749
17750 case Intrinsic::arm_neon_vqrshifts:
17751 case Intrinsic::arm_neon_vqrshiftu:
17752 // No immediate versions of these to check for.
17753 break;
17754
// vbsl maps directly onto an ARMISD::VBSP node with the same three operands.
17755 case Intrinsic::arm_neon_vbsl: {
17756 SDLoc dl(N);
17757 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17758 N->getOperand(2), N->getOperand(3));
17759 }
17760 case Intrinsic::arm_mve_vqdmlah:
17761 case Intrinsic::arm_mve_vqdmlash:
17762 case Intrinsic::arm_mve_vqrdmlah:
17763 case Intrinsic::arm_mve_vqrdmlash:
17764 case Intrinsic::arm_mve_vmla_n_predicated:
17765 case Intrinsic::arm_mve_vmlas_n_predicated:
17766 case Intrinsic::arm_mve_vqdmlah_predicated:
17767 case Intrinsic::arm_mve_vqdmlash_predicated:
17768 case Intrinsic::arm_mve_vqrdmlah_predicated:
17769 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17770 // These intrinsics all take an i32 scalar operand which is narrowed to the
17771 // size of a single lane of the vector type they return. So we don't need
17772 // any bits of that operand above that point, which allows us to eliminate
17773 // uxth/sxth.
// NOTE(review): an empty SDValue is returned whether or not the
// simplification fires; presumably any rewrite is applied through DCI --
// confirm.
17774 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17775 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17776 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17777 return SDValue();
17778 break;
17779 }
17780
17781 case Intrinsic::arm_mve_minv:
17782 case Intrinsic::arm_mve_maxv:
17783 case Intrinsic::arm_mve_minav:
17784 case Intrinsic::arm_mve_maxav:
17785 case Intrinsic::arm_mve_minv_predicated:
17786 case Intrinsic::arm_mve_maxv_predicated:
17787 case Intrinsic::arm_mve_minav_predicated:
17788 case Intrinsic::arm_mve_maxav_predicated: {
17789 // These intrinsics all take an i32 scalar operand which is narrowed to the
17790 // size of a single lane of the vector type they take as the other input.
// Here the scalar is operand 1 and the lane width comes from operand 2.
17791 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17792 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17793 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17794 return SDValue();
17795 break;
17796 }
17797
17798 case Intrinsic::arm_mve_addv: {
17799 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17800 // which allow PerformADDVecReduce to turn it into VADDLV when possible.
// Operand 2 is the constant unsigned flag; operand 1 is the vector input.
17801 bool Unsigned = N->getConstantOperandVal(2);
17802 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17803 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17804 }
17805
17806 case Intrinsic::arm_mve_addlv:
17807 case Intrinsic::arm_mve_addlv_predicated: {
17808 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17809 // which recombines the two outputs into an i64
17810 bool Unsigned = N->getConstantOperandVal(2);
17811 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17812 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17813 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17814
// Forward every operand except the intrinsic ID (index 0) and the unsigned
// flag (index 2).
// NOTE(review): the declaration of Ops is not visible in this listing.
17816 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17817 if (i != 2) // skip the unsigned flag
17818 Ops.push_back(N->getOperand(i));
17819
17820 SDLoc dl(N);
17821 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17822 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17823 val.getValue(1));
17824 }
17825 }
17826
17827 return SDValue();
17828}
17829
// Type-based query on Y: scalars defer to hasAndNotCompare(Y); vectors yield
// true for 128-bit types with MVE integer ops, or for 64-/128-bit types with
// NEON; everything else yields false.
// NOTE(review): the signature line is not visible in this listing; the body
// uses Y and the Subtarget member.
17831 EVT VT = Y.getValueType();
17832 if (!VT.isVector())
17833 return hasAndNotCompare(Y);
// MVE is checked before NEON, so with MVE integer ops only 128-bit vectors
// qualify.
17834 if (Subtarget->hasMVEIntegerOps())
17835 return VT.is128BitVector();
17836 if (Subtarget->hasNEON())
17837 return VT.is64BitVector() || VT.is128BitVector();
17838 return false;
17839}
17840
17841/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17842/// lowers them. As with the vector shift intrinsics, this is done during DAG
17843/// combining instead of DAG legalizing because the build_vectors for 64-bit
17844/// vector element shift counts are generally not legal, and it is hard to see
17845/// their values after they get legalized to loads from a constant pool.
/// It also rewrites a scalar (shl (and x, mask), amt) on Thumb1-only targets
/// into a shift pair. Returns an empty SDValue when nothing applies.
// NOTE(review): the first signature lines are not visible in this listing; N
// and DCI are used below.
17848 const ARMSubtarget *ST) {
17849 SelectionDAG &DAG = DCI.DAG;
17850 EVT VT = N->getValueType(0);
17851
17852 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17853 N->getOperand(0)->getOpcode() == ISD::AND &&
17854 N->getOperand(0)->hasOneUse()) {
// Skipped before legalization and when invoked by the legalizer.
17855 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17856 return SDValue();
17857 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17858 // usually show up because instcombine prefers to canonicalize it to
17859 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17860 // out of GEP lowering in some cases.
17861 SDValue N0 = N->getOperand(0);
17862 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17863 if (!ShiftAmtNode)
17864 return SDValue();
17865 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17866 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17867 if (!AndMaskNode)
17868 return SDValue();
17869 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17870 // Don't transform uxtb/uxth.
17871 if (AndMask == 255 || AndMask == 65535)
17872 return SDValue();
// For a contiguous low-bit mask, shift left by the number of cleared high
// bits and then shift right by the difference, provided that count exceeds
// ShiftAmt.
17873 if (isMask_32(AndMask)) {
17874 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17875 if (MaskedBits > ShiftAmt) {
17876 SDLoc DL(N);
17877 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17878 DAG.getConstant(MaskedBits, DL, MVT::i32));
17879 return DAG.getNode(
17880 ISD::SRL, DL, MVT::i32, SHL,
17881 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17882 }
17883 }
17884 }
17885
17886 // Nothing to be done for scalar shifts.
17887 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17888 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17889 return SDValue();
// The immediate-shift rewrite below is not applied when MVE integer ops are
// available.
17890 if (ST->hasMVEIntegerOps())
17891 return SDValue();
17892
17893 int64_t Cnt;
17894
17895 switch (N->getOpcode()) {
17896 default: llvm_unreachable("unexpected shift opcode");
17897
17898 case ISD::SHL:
17899 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17900 SDLoc dl(N);
17901 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17902 DAG.getConstant(Cnt, dl, MVT::i32));
17903 }
17904 break;
17905
// SRA selects the signed immediate shift, SRL the unsigned one.
17906 case ISD::SRA:
17907 case ISD::SRL:
17908 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17909 unsigned VShiftOpc =
17910 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17911 SDLoc dl(N);
17912 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17913 DAG.getConstant(Cnt, dl, MVT::i32));
17914 }
17915 }
17916 return SDValue();
17917}
17918
17919// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
17920// split into multiple extending loads, which are simpler to deal with than an
17921// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17922// to convert the type to an f32.
// Returns a CONCAT_VECTORS of the split pieces, or an empty SDValue when the
// pattern does not match.
// NOTE(review): the signature line is not visible in this listing; N and DAG
// are used below.
17924 SDValue N0 = N->getOperand(0);
17925 if (N0.getOpcode() != ISD::LOAD)
17926 return SDValue();
// Only simple, single-use, unindexed, non-extending loads are split.
// NOTE(review): the line that defines LD (presumably from N0) is not visible
// in this listing.
17928 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17929 LD->getExtensionType() != ISD::NON_EXTLOAD)
17930 return SDValue();
17931 EVT FromVT = LD->getValueType(0);
17932 EVT ToVT = N->getValueType(0);
17933 if (!ToVT.isVector())
17934 return SDValue();
17936 EVT ToEltVT = ToVT.getVectorElementType();
17937 EVT FromEltVT = FromVT.getVectorElementType();
17938
// Two element-type pairs are supported, both split into 4-lane pieces:
// i8 -> i32 and f16 -> f32.
17939 unsigned NumElements = 0;
17940 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17941 NumElements = 4;
17942 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17943 NumElements = 4;
// Bail out for unsupported element types, for a non-f16 source that already
// has exactly NumElements lanes, and for lane counts that are not a multiple
// of NumElements.
17944 if (NumElements == 0 ||
17945 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17946 FromVT.getVectorNumElements() % NumElements != 0 ||
17947 !isPowerOf2_32(NumElements))
17948 return SDValue();
17949
17950 LLVMContext &C = *DAG.getContext();
17951 SDLoc DL(LD);
17952 // Details about the old load
17953 SDValue Ch = LD->getChain();
17954 SDValue BasePtr = LD->getBasePtr();
17955 Align Alignment = LD->getBaseAlign();
17956 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17957 AAMDNodes AAInfo = LD->getAAInfo();
17958
// Any opcode other than SIGN_EXTEND uses a zero-extending load. The new
// memory and result types are integer vectors with the original element
// widths and NumElements lanes.
17959 ISD::LoadExtType NewExtType =
17960 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17961 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17962 EVT NewFromVT = EVT::getVectorVT(
17963 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17964 EVT NewToVT = EVT::getVectorVT(
17965 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17966
// Emit one extending load per NumElements-lane piece, each at the byte offset
// of that piece, all chained to the original load's input chain.
// NOTE(review): the declarations of Loads and Chains are not visible in this
// listing.
17969 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17970 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17971 SDValue NewPtr =
17972 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17973
17974 SDValue NewLoad =
17975 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17976 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17977 Alignment, MMOFlags, AAInfo);
17978 Loads.push_back(NewLoad);
17979 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17980 }
17981
17982 // Float truncs need to be extended with VCVTB's into their floating point types.
// Each integer load result is reinterpreted as v8f16 and converted to v4f32
// by an ARMISD::VCVTL node with a constant 0 operand.
// NOTE(review): the declaration of Extends is not visible in this listing.
17983 if (FromEltVT == MVT::f16) {
17985
17986 for (unsigned i = 0; i < Loads.size(); i++) {
17987 SDValue LoadBC =
17988 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17989 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17990 DAG.getConstant(0, DL, MVT::i32));
17991 Extends.push_back(FPExt);
17992 }
17993
17994 Loads = Extends;
17995 }
17996
// Merge the new load chains and redirect users of the old load's chain
// result to the merged chain.
17997 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17998 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17999 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
18000}
18001
18002/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
18003/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
/// Returns the replacement node, or an empty SDValue when nothing applies.
// NOTE(review): the first signature line is not visible in this listing; N
// and DAG are used below.
18005 const ARMSubtarget *ST) {
18006 SDValue N0 = N->getOperand(0);
18007 EVT VT = N->getValueType(0);
18008 SDLoc DL(N);
18009
18010 // Check for sign- and zero-extensions of vector extract operations of 8- and
18011 // 16-bit vector elements. NEON and MVE support these directly. They are
18012 // handled during DAG combining because type legalization will promote them
18013 // to 32-bit types and it is messy to recognize the operations after that.
// NOTE(review): the second line of this condition (presumably the test that
// N0 is a vector element extract) is not visible in this listing.
18014 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
18016 SDValue Vec = N0.getOperand(0);
18017 SDValue Lane = N0.getOperand(1);
18018 EVT EltVT = N0.getValueType();
18019 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18020
// Requires an i32 result, an i8/i16 element, a legal vector type and a
// constant lane index.
18021 if (VT == MVT::i32 &&
18022 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
18023 TLI.isTypeLegal(Vec.getValueType()) &&
18024 isa<ConstantSDNode>(Lane)) {
18025
// SIGN_EXTEND maps to VGETLANEs; ZERO_EXTEND and ANY_EXTEND both map to
// VGETLANEu.
18026 unsigned Opc = 0;
18027 switch (N->getOpcode()) {
18028 default: llvm_unreachable("unexpected opcode");
18029 case ISD::SIGN_EXTEND:
18030 Opc = ARMISD::VGETLANEs;
18031 break;
18032 case ISD::ZERO_EXTEND:
18033 case ISD::ANY_EXTEND:
18034 Opc = ARMISD::VGETLANEu;
18035 break;
18036 }
18037 return DAG.getNode(Opc, DL, VT, Vec, Lane);
18038 }
18039 }
18040
// With MVE integer ops, try splitting an extend of a wide load into several
// extending loads.
18041 if (ST->hasMVEIntegerOps())
18042 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
18043 return NewLoad;
18044
18045 // Combine sext(buildvector(..)) to buildvector(sext(..)) to help avoid
18046 // difficult to lower i1 buildvector.
18047 if (ST->hasMVEIntegerOps() && N0.getValueType().getScalarSizeInBits() == 1 &&
18048 N0.getOpcode() == ISD::BUILD_VECTOR && VT.getScalarSizeInBits() <= 32) {
// Each build_vector operand is first reduced to its low bit in its own type
// (AND with 1 for zero-extend, SIGN_EXTEND_INREG from i1 for sign-extend;
// left as is for any other opcode), then extended to i32 with N's opcode.
// NOTE(review): the declaration of Ops is not visible in this listing.
18050 for (unsigned I = 0; I < N0.getNumOperands(); I++) {
18051 SDValue InReg = N0.getOperand(I);
18052 if (N->getOpcode() == ISD::ZERO_EXTEND)
18053 InReg = DAG.getNode(ISD::AND, DL, InReg.getValueType(), InReg,
18054 DAG.getConstant(1, DL, InReg.getValueType()));
18055 else if (N->getOpcode() == ISD::SIGN_EXTEND)
18056 InReg = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InReg.getValueType(),
18057 InReg, DAG.getValueType(MVT::i1));
18058 SDValue Ext = DAG.getNode(N->getOpcode(), DL, MVT::i32, InReg);
18059 Ops.push_back(Ext);
18060 }
18061 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
18062 }
18063
18064 return SDValue();
18065}
18066
// Combine that, when the subtarget has MVE floating-point ops, hands N to
// PerformSplittingToWideningLoad and returns its result if it is non-empty.
// In every other case an empty SDValue is returned.
// NOTE(review): the line opening this definition is missing from this listing.
18068 const ARMSubtarget *ST) {
18069 if (ST->hasMVEFloatOps())
18070 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
18071 return NewLoad;
18072
18073 return SDValue();
18074}
18075
18076// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
18077// constant bounds.
//
// Op is the outer min/max node; either nesting order (smin of smax, or smax
// of smin) is accepted. With C = 2^k - 1 as the SMIN constant:
//  - bounds [~C, C] produce ARMISD::SSAT(x, k);
//  - bounds [0, C] produce ARMISD::USAT(x, k).
// An empty SDValue is returned when the pattern does not match.
// NOTE(review): the line opening this definition and one line of the operand
// check are missing from this listing.
18079 const ARMSubtarget *Subtarget) {
 // Give up unless the subtarget is Thumb2, or is non-Thumb with v6 ops.
18080 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
18081 !Subtarget->isThumb2())
18082 return SDValue();
18083
18084 EVT VT = Op.getValueType();
18085 SDValue Op0 = Op.getOperand(0);
18086
18087 if (VT != MVT::i32 ||
18088 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
18089 !isa<ConstantSDNode>(Op.getOperand(1)) ||
18091 return SDValue();
18092
 // Name the two nodes by role. After the optional swap, Min is expected to be
 // the SMIN node and Max the SMAX node; this is verified below.
18093 SDValue Min = Op;
18094 SDValue Max = Op0;
18095 SDValue Input = Op0.getOperand(0);
18096 if (Min.getOpcode() == ISD::SMAX)
18097 std::swap(Min, Max);
18098
18099 APInt MinC = Min.getConstantOperandAPInt(1);
18100 APInt MaxC = Max.getConstantOperandAPInt(1);
18101
 // The upper bound must have the form 2^k - 1 (MinC + 1 is a power of two).
18102 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
18103 !(MinC + 1).isPowerOf2())
18104 return SDValue();
18105
18106 SDLoc DL(Op);
 // countr_one() of 2^k - 1 is k, which becomes the second operand of the node.
18107 if (MinC == ~MaxC)
18108 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
18109 DAG.getConstant(MinC.countr_one(), DL, VT));
18110 if (MaxC == 0)
18111 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
18112 DAG.getConstant(MinC.countr_one(), DL, VT));
18113
18114 return SDValue();
18115}
18116
18117/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
18118/// saturates.
///
/// i32 nodes are handed to PerformMinMaxToSatCombine. Everything else needs
/// MVE integer ops: PerformVQDMULHCombine is tried first, and after that only
/// v4i32 and v8i16 nodes are considered. A signed clamp to the range of the
/// half-width element becomes ARMISD::VQMOVNs followed by SIGN_EXTEND_INREG;
/// an unsigned clamp (UMIN against the half-width all-ones value) becomes
/// ARMISD::VQMOVNu followed by an AND. Returns an empty SDValue otherwise.
// NOTE(review): the line opening this definition is missing from this listing.
18120 const ARMSubtarget *ST) {
18121 EVT VT = N->getValueType(0);
18122 SDValue N0 = N->getOperand(0);
18123
18124 if (VT == MVT::i32)
18125 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
18126
18127 if (!ST->hasMVEIntegerOps())
18128 return SDValue();
18129
18130 if (SDValue V = PerformVQDMULHCombine(N, DAG))
18131 return V;
18132
18133 if (VT != MVT::v4i32 && VT != MVT::v8i16)
18134 return SDValue();
18135
 // True when the two nodes are an SMIN/SMAX pair (in either order) whose
 // splat constants are SaturateC and ~SaturateC respectively.
18136 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
18137 // Check one is a smin and the other is a smax
18138 if (Min->getOpcode() != ISD::SMIN)
18139 std::swap(Min, Max);
18140 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
18141 return false;
18142
 // Upper bound: 0x7fff for v4i32, 0x7f for v8i16.
18143 APInt SaturateC;
18144 if (VT == MVT::v4i32)
18145 SaturateC = APInt(32, (1 << 15) - 1, true);
18146 else //if (VT == MVT::v8i16)
18147 SaturateC = APInt(16, (1 << 7) - 1, true);
18148
18149 APInt MinC, MaxC;
18150 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18151 MinC != SaturateC)
18152 return false;
18153 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18154 MaxC != ~SaturateC)
18155 return false;
18156 return true;
18157 };
18158
18159 if (IsSignedSaturate(N, N0.getNode())) {
18160 SDLoc DL(N);
18161 MVT ExtVT, HalfVT;
18162 if (VT == MVT::v4i32) {
18163 HalfVT = MVT::v8i16;
18164 ExtVT = MVT::v4i16;
18165 } else { // if (VT == MVT::v8i16)
18166 HalfVT = MVT::v16i8;
18167 ExtVT = MVT::v8i8;
18168 }
18169
18170 // Create a VQMOVNB with undef top lanes, then signed extended into the top
18171 // half. That extend will hopefully be removed if only the bottom bits are
18172 // demanded (through a truncating store, for example).
 // The value being narrowed is the inner node's first operand, i.e. the value
 // before either clamp was applied.
18173 SDValue VQMOVN =
18174 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18175 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18176 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18177 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18178 DAG.getValueType(ExtVT));
18179 }
18180
 // True when Min is a UMIN against a splat of 0xffff (v4i32) or 0xff (v8i16).
18181 auto IsUnsignedSaturate = [&](SDNode *Min) {
18182 // For unsigned, we just need to check for <= 0xffff
18183 if (Min->getOpcode() != ISD::UMIN)
18184 return false;
18185
18186 APInt SaturateC;
18187 if (VT == MVT::v4i32)
18188 SaturateC = APInt(32, (1 << 16) - 1, true);
18189 else //if (VT == MVT::v8i16)
18190 SaturateC = APInt(16, (1 << 8) - 1, true);
18191
18192 APInt MinC;
18193 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18194 MinC != SaturateC)
18195 return false;
18196 return true;
18197 };
18198
18199 if (IsUnsignedSaturate(N)) {
18200 SDLoc DL(N);
18201 MVT HalfVT;
18202 unsigned ExtConst;
18203 if (VT == MVT::v4i32) {
18204 HalfVT = MVT::v8i16;
18205 ExtConst = 0x0000FFFF;
18206 } else { //if (VT == MVT::v8i16)
18207 HalfVT = MVT::v16i8;
18208 ExtConst = 0x00FF;
18209 }
18210
18211 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18212 // an AND. That extend will hopefully be removed if only the bottom bits are
18213 // demanded (through a truncating store, for example).
18214 SDValue VQMOVN =
18215 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18216 DAG.getConstant(0, DL, MVT::i32));
18217 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18218 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18219 DAG.getConstant(ExtConst, DL, VT));
18220 }
18221
18222 return SDValue();
18223}
18224
// Yields a pointer to C's APInt value when that value is a power of two.
// Yields nullptr when C is null or when the value is not a power of two.
// NOTE(review): the lines that open this definition and declare C are missing
// from this listing.
18227 if (!C)
18228 return nullptr;
18229 const APInt *CV = &C->getAPIntValue();
18230 return CV->isPowerOf2() ? CV : nullptr;
18231}
18232
// Tries to rewrite the CMOV/OR/AND pattern described below into a chain of
// ARMISD::BFI nodes. Returns the last BFI node on success and an empty
// SDValue when the pattern, the profitability heuristic or the known-bits
// check fails.
// NOTE(review): the line opening this definition and the declaration of OrC
// are missing from this listing.
18234 // If we have a CMOV, OR and AND combination such as:
18235 // if (x & CN)
18236 // y |= CM;
18237 //
18238 // And:
18239 // * CN is a single bit;
18240 // * All bits covered by CM are known zero in y
18241 //
18242 // Then we can convert this into a sequence of BFI instructions. This will
18243 // always be a win if CM is a single bit, will always be no worse than the
18244 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18245 // three bits (due to the extra IT instruction).
18246
18247 SDValue Op0 = CMOV->getOperand(0);
18248 SDValue Op1 = CMOV->getOperand(1);
18249 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18250 SDValue CmpZ = CMOV->getOperand(3);
18251
18252 // The compare must be against zero.
18253 if (!isNullConstant(CmpZ->getOperand(1)))
18254 return SDValue();
18255
18256 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18257 SDValue And = CmpZ->getOperand(0);
18258 if (And->getOpcode() != ISD::AND)
18259 return SDValue();
 // CN: the AND mask must be a single-bit constant.
18260 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18261 if (!AndC)
18262 return SDValue();
18263 SDValue X = And->getOperand(0);
18264
18265 if (CC == ARMCC::EQ) {
18266 // We're performing an "equal to zero" compare. Swap the operands so we
18267 // canonicalize on a "not equal to zero" compare.
18268 std::swap(Op0, Op1);
18269 } else {
18270 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18271 }
18272
18273 if (Op1->getOpcode() != ISD::OR)
18274 return SDValue();
18275
18277 if (!OrC)
18278 return SDValue();
18279 SDValue Y = Op1->getOperand(0);
18280
 // The other CMOV operand must be the same value that feeds the OR.
18281 if (Op0 != Y)
18282 return SDValue();
18283
18284 // Now, is it profitable to continue?
18285 APInt OrCI = OrC->getAPIntValue();
18286 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18287 if (OrCI.popcount() > Heuristic)
18288 return SDValue();
18289
18290 // Lastly, can we determine that the bits defined by OrCI
18291 // are zero in Y?
18292 KnownBits Known = DAG.computeKnownBits(Y);
18293 if ((OrCI & Known.Zero) != OrCI)
18294 return SDValue();
18295
18296 // OK, we can do the combine.
18297 SDValue V = Y;
18298 SDLoc dl(X);
18299 EVT VT = X.getValueType();
18300 unsigned BitInX = AndC->logBase2();
18301
18302 if (BitInX != 0) {
18303 // We must shift X first.
18304 X = DAG.getNode(ISD::SRL, dl, VT, X,
18305 DAG.getConstant(BitInX, dl, VT));
18306 }
18307
 // One BFI node per set bit of OrCI, each taking the previous result V and
 // the (shifted) X, so the nodes form a chain that starts at Y.
18308 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18309 BitInY < NumActiveBits; ++BitInY) {
18310 if (OrCI[BitInY] == 0)
18311 continue;
18312 APInt Mask(VT.getSizeInBits(), 0);
18313 Mask.setBit(BitInY);
18314 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18315 // Confusingly, the operand is an *inverted* mask.
18316 DAG.getConstant(~Mask, dl, VT));
18317 }
18318
18319 return V;
18320}
18321
18322// Given N, the value controlling the conditional branch, search for the loop
18323// intrinsic, returning it, along with how the value is used. We need to handle
18324// patterns such as the following:
18325// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18326// (brcond (setcc (loop.decrement), 0, eq), exit)
18327// (brcond (setcc (loop.decrement), 0, ne), header)
//
// CC, Imm and Negate are updated while walking down through XOR and SETCC
// nodes. An empty SDValue is returned when no supported intrinsic is reached.
// NOTE(review): the line opening this definition and the `case` label of the
// last switch arm are missing from this listing.
18329 bool &Negate) {
18330 switch (N->getOpcode()) {
18331 default:
18332 break;
18333 case ISD::XOR: {
 // Only "xor x, 1" is accepted; it flips Negate and the walk continues at x.
18334 if (!isa<ConstantSDNode>(N.getOperand(1)))
18335 return SDValue();
18336 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18337 return SDValue();
18338 Negate = !Negate;
18339 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18340 }
18341 case ISD::SETCC: {
 // Only comparisons against the constants 0 or 1 are accepted. The constant
 // and the condition code overwrite Imm and CC before recursing.
18342 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18343 if (!Const)
18344 return SDValue();
18345 if (Const->isZero())
18346 Imm = 0;
18347 else if (Const->isOne())
18348 Imm = 1;
18349 else
18350 return SDValue();
18351 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18352 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18353 }
 // Operand 1 holds the intrinsic ID; only the two hardware-loop intrinsics
 // below are returned to the caller.
18355 unsigned IntOp = N.getConstantOperandVal(1);
18356 if (IntOp != Intrinsic::test_start_loop_iterations &&
18357 IntOp != Intrinsic::loop_decrement_reg)
18358 return SDValue();
18359 return N;
18360 }
18361 }
18362 return SDValue();
18363}
18364
// Combine for BRCOND / BR_CC nodes whose condition comes from one of the
// hardware-loop intrinsics found by SearchLoopIntrinsic. The branch is
// replaced by ARMISD::WLS (for test.start.loop.iterations) or by
// ARMISD::LOOP_DEC + ARMISD::LE (for loop.decrement.reg). Returns an empty
// SDValue when the condition does not lead to such an intrinsic.
// NOTE(review): the lines opening this definition are missing from this
// listing.
18367 const ARMSubtarget *ST) {
18368
18369 // The hwloop intrinsics that we're interested in are used for control-flow,
18370 // either for entering or exiting the loop:
18371 // - test.start.loop.iterations will test whether its operand is zero. If it
18372 // is zero, the proceeding branch should not enter the loop.
18373 // - loop.decrement.reg also tests whether its operand is zero. If it is
18374 // zero, the proceeding branch should not branch back to the beginning of
18375 // the loop.
18376 // So here, we need to check how the brcond is using the result of each
18377 // of the intrinsics to ensure that we're branching to the right place at the
18378 // right time.
18379
18380 ISD::CondCode CC;
18381 SDValue Cond;
18382 int Imm = 1;
18383 bool Negate = false;
18384 SDValue Chain = N->getOperand(0);
18385 SDValue Dest;
18386
 // A BRCOND starts out modelled as "Cond == 1" (CC = SETEQ, Imm = 1). A BR_CC
 // carries its own condition code and must compare against constant 0 or 1.
18387 if (N->getOpcode() == ISD::BRCOND) {
18388 CC = ISD::SETEQ;
18389 Cond = N->getOperand(1);
18390 Dest = N->getOperand(2);
18391 } else {
18392 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18393 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18394 Cond = N->getOperand(2);
18395 Dest = N->getOperand(4);
18396 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18397 if (!Const->isOne() && !Const->isZero())
18398 return SDValue();
18399 Imm = Const->getZExtValue();
18400 } else
18401 return SDValue();
18402 }
18403
18404 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18405 if (!Int)
18406 return SDValue();
18407
18408 if (Negate)
18409 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18410
 // (CC, Imm) pairs for which the branch is treated as taken when the
 // intrinsic's result is zero.
18411 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18412 return (CC == ISD::SETEQ && Imm == 0) ||
18413 (CC == ISD::SETNE && Imm == 1) ||
18414 (CC == ISD::SETLT && Imm == 1) ||
18415 (CC == ISD::SETULT && Imm == 1);
18416 };
18417
 // (CC, Imm) pairs for which the branch is treated as not taken when the
 // intrinsic's result is zero.
18418 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18419 return (CC == ISD::SETEQ && Imm == 1) ||
18420 (CC == ISD::SETNE && Imm == 0) ||
18421 (CC == ISD::SETGT && Imm == 0) ||
18422 (CC == ISD::SETUGT && Imm == 0) ||
18423 (CC == ISD::SETGE && Imm == 1) ||
18424 (CC == ISD::SETUGE && Imm == 1);
18425 };
18426
18427 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18428 "unsupported condition");
18429
18430 SDLoc dl(Int);
18431 SelectionDAG &DAG = DCI.DAG;
18432 SDValue Elements = Int.getOperand(2);
18433 unsigned IntOp = Int->getConstantOperandVal(1);
 // The conditional branch is expected to be followed by exactly one
 // unconditional BR; its target is the "other" successor.
18434 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18435 "expected single br user");
18436 SDNode *Br = *N->user_begin();
18437 SDValue OtherTarget = Br->getOperand(1);
18438
18439 // Update the unconditional branch to branch to the given Dest.
18440 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18441 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18442 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18443 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18444 };
18445
18446 if (IntOp == Intrinsic::test_start_loop_iterations) {
18447 SDValue Res;
18448 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18449 // We expect this 'instruction' to branch when the counter is zero.
18450 if (IsTrueIfZero(CC, Imm)) {
18451 SDValue Ops[] = {Chain, Setup, Dest};
18452 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18453 } else {
18454 // The logic is the reverse of what we need for WLS, so find the other
18455 // basic block target: the target of the proceeding br.
18456 UpdateUncondBr(Br, Dest, DAG);
18457
18458 SDValue Ops[] = {Chain, Setup, OtherTarget};
18459 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18460 }
18461 // Update LR count to the new value
18462 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18463 // Update chain
18464 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18465 return Res;
18466 } else {
 // loop.decrement.reg: operand 3 of the intrinsic is passed on as a target
 // constant, and all results of the intrinsic are replaced by LOOP_DEC.
18467 SDValue Size =
18468 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18469 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18470 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18471 DAG.getVTList(MVT::i32, MVT::Other), Args);
18472 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18473
18474 // We expect this instruction to branch when the count is not zero.
18475 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18476
18477 // Update the unconditional branch to target the loop preheader if we've
18478 // found the condition has been reversed.
18479 if (Target == OtherTarget)
18480 UpdateUncondBr(Br, Dest, DAG);
18481
 // Join LOOP_DEC's output chain with the branch's incoming chain.
18482 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18483 SDValue(LoopDec.getNode(), 1), Chain);
18484
18485 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18486 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18487 }
 // Not reached: both branches above return.
18488 return SDValue();
18489}
18490
18491/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
///
/// Folds a branch on "(cmov 0 1 CC Flags) & 1 != 0" into a branch directly on
/// CC/Flags. Returns an empty SDValue when operand 3 is not an ARMISD::CMPZ or
/// the pattern does not match.
// NOTE(review): the line carrying the function name and parameters, and the
// declaration of CC, are missing from this listing.
18492SDValue
18494 SDValue Cmp = N->getOperand(3);
18495 if (Cmp.getOpcode() != ARMISD::CMPZ)
18496 // Only looking at NE cases.
18497 return SDValue();
18498
18499 SDLoc dl(N);
18500 SDValue LHS = Cmp.getOperand(0);
18501 SDValue RHS = Cmp.getOperand(1);
18502 SDValue Chain = N->getOperand(0);
18503 SDValue BB = N->getOperand(1);
18504 SDValue ARMcc = N->getOperand(2);
18506
18507 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18508 // -> (brcond Chain BB CC Flags)
 // Both the AND and the inner CMOV must have a single use.
18509 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18510 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18511 LHS->getOperand(0)->hasOneUse() &&
18512 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18513 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18514 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18515 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18516 LHS->getOperand(0)->getOperand(2),
18517 LHS->getOperand(0)->getOperand(3));
18518 }
18519
18520 return SDValue();
18521}
18522
18523/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
///
/// Operands of N are read as (FalseVal, TrueVal, ARMcc, Cmp). The combines
/// below either return a replacement node directly or leave a candidate in
/// Res, which later sections may overwrite and which is returned at the end
/// (possibly wrapped in an AssertZext). An empty Res means "no change".
// NOTE(review): several lines are missing from this listing: the line with the
// function name and parameters, the declaration of CC, the call that produces
// R, and the declaration and inversion of Cond.
18524SDValue
18526 SDLoc dl(N);
18527 EVT VT = N->getValueType(0);
18528 SDValue FalseVal = N->getOperand(0);
18529 SDValue TrueVal = N->getOperand(1);
18530 SDValue ARMcc = N->getOperand(2);
18531 SDValue Cmp = N->getOperand(3);
18532
18533 // Try to form CSINV etc.
18534 unsigned Opcode;
18535 bool InvertCond;
18536 if (SDValue CSetOp =
18537 matchCSET(Opcode, InvertCond, TrueVal, FalseVal, Subtarget)) {
18538 if (InvertCond) {
18539 ARMCC::CondCodes CondCode =
18540 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
18541 CondCode = ARMCC::getOppositeCondition(CondCode);
18542 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
18543 }
18544 return DAG.getNode(Opcode, dl, VT, CSetOp, CSetOp, ARMcc, Cmp);
18545 }
18546
18547 if (Cmp.getOpcode() != ARMISD::CMPZ)
18548 // Only looking at EQ and NE cases.
18549 return SDValue();
18550
18551 SDValue LHS = Cmp.getOperand(0);
18552 SDValue RHS = Cmp.getOperand(1);
18554
18555 // BFI is only available on V6T2+.
18556 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18558 if (R)
18559 return R;
18560 }
18561
18562 // Simplify
18563 // mov r1, r0
18564 // cmp r1, x
18565 // mov r0, y
18566 // moveq r0, x
18567 // to
18568 // cmp r0, x
18569 // movne r0, y
18570 //
18571 // mov r1, r0
18572 // cmp r1, x
18573 // mov r0, x
18574 // movne r0, y
18575 // to
18576 // cmp r0, x
18577 // movne r0, y
18578 /// FIXME: Turn this into a target neutral optimization?
18579 SDValue Res;
18580 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18581 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18582 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
 // This local ARMcc shadows the outer one; it is passed to getARMCmp together
 // with ISD::SETNE and then used as the condition operand of the new CMOV.
18583 SDValue ARMcc;
18584 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18585 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18586 }
18587
18588 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18589 // -> (cmov F T CC Flags)
18590 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18591 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18592 isNullConstant(RHS)) {
18593 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18594 LHS->getOperand(2), LHS->getOperand(3));
18595 }
18596
 // NOTE(review): this early return hands back an empty SDValue, so any Res
 // computed above is discarded for non-integer VT.
18597 if (!VT.isInteger())
18598 return SDValue();
18599
18600 // Fold away an unnecessary CMPZ/CMOV
18601 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18602 // if C1==EQ -> CMOV A, B, C2, D
18603 // if C1==NE -> CMOV A, B, NOT(C2), D
18604 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18605 N->getConstantOperandVal(2) == ARMCC::NE) {
18607 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18608 if (N->getConstantOperandVal(2) == ARMCC::NE)
18610 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18611 N->getOperand(1),
18612 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18613 }
18614 }
18615
18616 // Materialize a boolean comparison for integers so we can avoid branching.
18617 if (isNullConstant(FalseVal)) {
18618 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18619 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18620 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18621 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18622 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18623 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18624 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18625 DAG.getConstant(5, dl, MVT::i32));
18626 } else {
18627 // CMOV 0, 1, ==, (CMPZ x, y) ->
18628 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18629 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18630 //
18631 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18632 // x != y. In other words, a carry C == 1 when x == y, C == 0
18633 // otherwise.
18634 // The final UADDO_CARRY computes
18635 // x - y + (0 - (x - y)) + C == C
18636 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18637 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
 // FalseVal is known to be the constant 0 here, so this is 0 - Sub.
18638 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18639 // ISD::USUBO_CARRY returns a borrow but we want the carry here
18640 // actually.
18641 SDValue Carry =
18642 DAG.getNode(ISD::SUB, dl, MVT::i32,
18643 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18644 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18645 }
18646 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18647 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18648 // This seems pointless but will allow us to combine it further below.
18649 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18650 SDValue Sub =
18651 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18652 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18653 Sub.getValue(1));
 // FalseVal now names the SUBC so the Thumb1 pattern below can match it.
18654 FalseVal = Sub;
18655 }
18656 } else if (isNullConstant(TrueVal)) {
18657 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18658 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18659 // This seems pointless but will allow us to combine it further below
18660 // Note that we change == for != as this is the dual for the case above.
18661 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18662 SDValue Sub =
18663 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18664 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18665 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18666 Sub.getValue(1));
18667 FalseVal = Sub;
18668 }
18669 }
18670
18671 // On Thumb1, the DAG above may be further combined if z is a power of 2
18672 // (z == 2 ^ K).
18673 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18674 // t1 = (USUBO (SUB x, y), 1)
18675 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18676 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18677 //
18678 // This also handles the special case of comparing against zero; it's
18679 // essentially, the same pattern, except there's no SUBC:
18680 // CMOV x, z, !=, (CMPZ x, 0) ->
18681 // t1 = (USUBO x, 1)
18682 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18683 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18684 const APInt *TrueConst;
18685 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18686 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18687 FalseVal.getOperand(1) == RHS) ||
18688 (FalseVal == LHS && isNullConstant(RHS))) &&
18689 (TrueConst = isPowerOf2Constant(TrueVal))) {
18690 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18691 unsigned ShiftAmount = TrueConst->logBase2();
 // For K != 0 the subtraction uses 1 and the result is shifted left by K
 // afterwards; for K == 0 TrueVal already is the constant 1.
18692 if (ShiftAmount)
18693 TrueVal = DAG.getConstant(1, dl, VT);
18694 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18695 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18696 Subc.getValue(1));
18697
18698 if (ShiftAmount)
18699 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18700 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18701 }
18702
18703 if (Res.getNode()) {
18704 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18705 // Capture demanded bits information that would be otherwise lost.
 // Only the three exact masks below (all bits above bit 0, above bit 7, or
 // above bit 15 known zero in N) get an AssertZext.
18706 if (Known.Zero == 0xfffffffe)
18707 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18708 DAG.getValueType(MVT::i1));
18709 else if (Known.Zero == 0xffffff00)
18710 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18711 DAG.getValueType(MVT::i8));
18712 else if (Known.Zero == 0xffff0000)
18713 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18714 DAG.getValueType(MVT::i16));
18715 }
18716
18717 return Res;
18718}
18719
// Combine for a bitcast node N with source operand Src and result type DstVT.
// Three rewrites are tried in order, and an empty SDValue is returned when
// none applies:
//  1. With MVE integer ops, a bitcast of an ARMISD::VDUP whose element size
//     equals DstVT's becomes a VDUP of DstVT.
//  2. On big-endian targets, a bitcast of VMOVIMM / VMVNIMM / VMOVFPIMM to a
//     type with elements at least as wide becomes ARMISD::VECTOR_REG_CAST.
//  3. PerformExtractEltToVMOVRRD is tried.
// NOTE(review): the lines opening this definition are missing from this
// listing.
18722 const ARMSubtarget *ST) {
18723 SelectionDAG &DAG = DCI.DAG;
18724 SDValue Src = N->getOperand(0);
18725 EVT DstVT = N->getValueType(0);
18726
18727 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18728 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18729 EVT SrcVT = Src.getValueType();
18730 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18731 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18732 }
18733
18734 // We may have a bitcast of something that has already had this bitcast
18735 // combine performed on it, so skip past any VECTOR_REG_CASTs.
 // The cast is only looked through when its input elements are no wider than
 // its output elements.
18736 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18737 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18738 Src.getValueType().getScalarSizeInBits())
18739 Src = Src.getOperand(0);
18740
18741 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18742 // would be generated is at least the width of the element type.
18743 EVT SrcVT = Src.getValueType();
18744 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18745 Src.getOpcode() == ARMISD::VMVNIMM ||
18746 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18747 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18748 DAG.getDataLayout().isBigEndian())
18749 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18750
18751 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18752 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18753 return R;
18754
18755 return SDValue();
18756}
18757
18758// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18759// node into stack operations after legalizeOps.
//
// In order: all-undef operands fold to undef; a pair of nested MVETRUNCs is
// flattened into one four-operand MVETRUNC; a pair of shuffles over the same
// two inputs becomes ARMISD::VMOVN when the combined mask matches; operands
// that are all build-vectors/shuffles are expanded into a single build-vector.
// Once DCI reports it is after DAG legalization, any remaining node is lowered
// through a 16-byte stack slot.
// NOTE(review): the lines opening this definition and the two lines that
// construct the MachinePointerInfo values are missing from this listing.
18762 SelectionDAG &DAG = DCI.DAG;
18763 EVT VT = N->getValueType(0);
18764 SDLoc DL(N);
18765
18766 // MVETrunc(Undef, Undef) -> Undef
18767 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18768 return DAG.getUNDEF(VT);
18769
18770 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18771 if (N->getNumOperands() == 2 &&
18772 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18773 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18774 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18775 N->getOperand(0).getOperand(1),
18776 N->getOperand(1).getOperand(0),
18777 N->getOperand(1).getOperand(1));
18778
18779 // MVETrunc(shuffle, shuffle) -> VMOVN
18780 if (N->getNumOperands() == 2 &&
18781 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18782 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18783 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18784 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18785
18786 if (S0->getOperand(0) == S1->getOperand(0) &&
18787 S0->getOperand(1) == S1->getOperand(1)) {
18788 // Construct complete shuffle mask
18789 SmallVector<int, 8> Mask(S0->getMask());
18790 Mask.append(S1->getMask().begin(), S1->getMask().end());
18791
 // The two calls differ only in the last flag; when the mask matches with
 // `true`, the shuffle inputs are passed to VMOVN in swapped order.
18792 if (isVMOVNTruncMask(Mask, VT, false))
18793 return DAG.getNode(
18794 ARMISD::VMOVN, DL, VT,
18795 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18796 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18797 DAG.getConstant(1, DL, MVT::i32));
18798 if (isVMOVNTruncMask(Mask, VT, true))
18799 return DAG.getNode(
18800 ARMISD::VMOVN, DL, VT,
18801 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18802 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18803 DAG.getConstant(1, DL, MVT::i32));
18804 }
18805 }
18806
18807 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18808 // truncate to a buildvector to allow the generic optimisations to kick in.
18809 if (all_of(N->ops(), [](SDValue Op) {
18810 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18811 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18812 (Op.getOpcode() == ISD::BITCAST &&
18813 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18814 })) {
 // Every lane of every operand is extracted as an i32, in operand order.
18815 SmallVector<SDValue, 8> Extracts;
18816 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18817 SDValue O = N->getOperand(Op);
18818 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18819 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18820 DAG.getConstant(i, DL, MVT::i32));
18821 Extracts.push_back(Ext);
18822 }
18823 }
18824 return DAG.getBuildVector(VT, DL, Extracts);
18825 }
18826
18827 // If we are late in the legalization process and nothing has optimised
18828 // the trunc to anything better, lower it to a stack store and reload,
18829 // performing the truncation whilst keeping the lanes in the correct order:
18830 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18831 if (!DCI.isAfterLegalizeDAG())
18832 return SDValue();
18833
18834 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18835 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18836 int NumIns = N->getNumOperands();
18837 assert((NumIns == 2 || NumIns == 4) &&
18838 "Expected 2 or 4 inputs to an MVETrunc");
 // Each input is stored with half of VT's element count, or a quarter when
 // there are four inputs.
18839 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18840 if (N->getNumOperands() == 4)
18841 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18842
18843 SmallVector<SDValue> Chains;
18844 for (int I = 0; I < NumIns; I++) {
 // Input I is truncating-stored at byte offset I * 16 / NumIns of the slot.
18845 SDValue Ptr = DAG.getNode(
18846 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18847 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18849 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18850 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18851 Ptr, MPI, StoreVT, Align(4));
18852 Chains.push_back(Ch);
18853 }
18854
 // The reload of the whole slot as VT is chained after all of the stores.
18855 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18856 MachinePointerInfo MPI =
18858 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18859}
18860
18861// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
//
// Applies only when the operand is a simple, unindexed load with a single use
// and N's result type is a vector. The load is replaced by several extending
// loads of NumElements lanes each, returned together as merged values, and
// users of the old load's chain are redirected to a TokenFactor of the new
// load chains. Returns an empty SDValue when the pattern does not apply.
// NOTE(review): the line opening this definition and the declarations of LD,
// Loads and Chains are missing from this listing.
18863 SelectionDAG &DAG) {
18864 SDValue N0 = N->getOperand(0);
18866 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18867 return SDValue();
18868
18869 EVT FromVT = LD->getMemoryVT();
18870 EVT ToVT = N->getValueType(0);
18871 if (!ToVT.isVector())
18872 return SDValue();
18873 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18874 EVT ToEltVT = ToVT.getVectorElementType();
18875 EVT FromEltVT = FromVT.getVectorElementType();
18876
 // Lanes per new load: 4 when extending i8/i16 to i32, 8 when extending i8 to
 // i16. Any other element combination trips the assert.
18877 unsigned NumElements = 0;
18878 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18879 NumElements = 4;
18880 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18881 NumElements = 8;
18882 assert(NumElements != 0);
18883
 // An existing extension on the load is acceptable only if it is EXTLOAD or
 // already matches the extension kind this node asks for.
18884 ISD::LoadExtType NewExtType =
18885 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18886 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18887 LD->getExtensionType() != ISD::EXTLOAD &&
18888 LD->getExtensionType() != NewExtType)
18889 return SDValue();
18890
18891 LLVMContext &C = *DAG.getContext();
18892 SDLoc DL(LD);
18893 // Details about the old load
18894 SDValue Ch = LD->getChain();
18895 SDValue BasePtr = LD->getBasePtr();
18896 Align Alignment = LD->getBaseAlign();
18897 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18898 AAMDNodes AAInfo = LD->getAAInfo();
18899
18900 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18901 EVT NewFromVT = EVT::getVectorVT(
18902 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18903 EVT NewToVT = EVT::getVectorVT(
18904 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18905
 // One extending load per group of NumElements source lanes; load i starts
 // i * sizeof(NewFromVT) bytes past the original base pointer.
18908 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18909 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18910 SDValue NewPtr =
18911 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18912
18913 SDValue NewLoad =
18914 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18915 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18916 Alignment, MMOFlags, AAInfo);
18917 Loads.push_back(NewLoad);
18918 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18919 }
18920
18921 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18922 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18923 return DAG.getMergeValues(Loads, DL);
18924}
18925
18926// Perform combines for MVEEXT. If it has not been optimized to anything better
18927// before lowering, it gets converted to stack store and extloads performing the
18928// extend whilst still keeping the same lane ordering.
18931 SelectionDAG &DAG = DCI.DAG;
18932 EVT VT = N->getValueType(0);
18933 SDLoc DL(N);
18934 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18935 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18936
18937 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18938 *DAG.getContext());
18939 auto Extend = [&](SDValue V) {
18940 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18941 return N->getOpcode() == ARMISD::MVESEXT
18942 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18943 DAG.getValueType(ExtVT))
18944 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18945 };
18946
18947 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18948 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18949 SDValue Ext = Extend(N->getOperand(0));
18950 return DAG.getMergeValues({Ext, Ext}, DL);
18951 }
18952
18953 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18954 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18955 ArrayRef<int> Mask = SVN->getMask();
18956 assert(Mask.size() == 2 * VT.getVectorNumElements());
18957 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18958 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18959 SDValue Op0 = SVN->getOperand(0);
18960 SDValue Op1 = SVN->getOperand(1);
18961
18962 auto CheckInregMask = [&](int Start, int Offset) {
18963 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18964 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18965 return false;
18966 return true;
18967 };
18968 SDValue V0 = SDValue(N, 0);
18969 SDValue V1 = SDValue(N, 1);
18970 if (CheckInregMask(0, 0))
18971 V0 = Extend(Op0);
18972 else if (CheckInregMask(0, 1))
18973 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18974 else if (CheckInregMask(0, Mask.size()))
18975 V0 = Extend(Op1);
18976 else if (CheckInregMask(0, Mask.size() + 1))
18977 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18978
18979 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18980 V1 = Extend(Op1);
18981 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18982 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18983 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18984 V1 = Extend(Op0);
18985 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18986 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18987
18988 if (V0.getNode() != N || V1.getNode() != N)
18989 return DAG.getMergeValues({V0, V1}, DL);
18990 }
18991
18992 // MVEEXT(load) -> extload, extload
18993 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18995 return L;
18996
18997 if (!DCI.isAfterLegalizeDAG())
18998 return SDValue();
18999
19000 // Lower to a stack store and reload:
19001 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
19002 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
19003 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
19004 int NumOuts = N->getNumValues();
19005 assert((NumOuts == 2 || NumOuts == 4) &&
19006 "Expected 2 or 4 outputs to an MVEEXT");
19007 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
19008 *DAG.getContext());
19009 if (N->getNumOperands() == 4)
19010 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
19011
19012 MachinePointerInfo MPI =
19014 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
19015 StackPtr, MPI, Align(4));
19016
19018 for (int I = 0; I < NumOuts; I++) {
19019 SDValue Ptr = DAG.getNode(
19020 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
19021 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
19023 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
19024 SDValue Load = DAG.getExtLoad(
19025 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
19026 VT, Chain, Ptr, MPI, LoadVT, Align(4));
19027 Loads.push_back(Load);
19028 }
19029
19030 return DAG.getMergeValues(Loads, DL);
19031}
19032
19034 DAGCombinerInfo &DCI) const {
19035 switch (N->getOpcode()) {
19036 default: break;
19037 case ISD::SELECT_CC:
19038 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
19039 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
19040 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
19041 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
19042 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
19043 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
19044 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
19045 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
19046 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
19047 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
19048 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
19049 case ISD::BRCOND:
19050 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
19051 case ARMISD::ADDC:
19052 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
19053 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
19054 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
19055 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
19056 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
19057 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
19058 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
19059 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
19060 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
19063 return PerformExtractEltCombine(N, DCI, Subtarget);
19067 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
19068 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
19069 case ISD::FP_TO_SINT:
19070 case ISD::FP_TO_UINT:
19071 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
19072 case ISD::FADD:
19073 return PerformFADDCombine(N, DCI.DAG, Subtarget);
19074 case ISD::FMUL:
19075 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
19077 return PerformIntrinsicCombine(N, DCI);
19078 case ISD::SHL:
19079 case ISD::SRA:
19080 case ISD::SRL:
19081 return PerformShiftCombine(N, DCI, Subtarget);
19082 case ISD::SIGN_EXTEND:
19083 case ISD::ZERO_EXTEND:
19084 case ISD::ANY_EXTEND:
19085 return PerformExtendCombine(N, DCI.DAG, Subtarget);
19086 case ISD::FP_EXTEND:
19087 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
19088 case ISD::SMIN:
19089 case ISD::UMIN:
19090 case ISD::SMAX:
19091 case ISD::UMAX:
19092 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
19093 case ARMISD::CMOV:
19094 return PerformCMOVCombine(N, DCI.DAG);
19095 case ARMISD::BRCOND:
19096 return PerformBRCONDCombine(N, DCI.DAG);
19097 case ARMISD::CMPZ:
19098 return PerformCMPZCombine(N, DCI.DAG);
19099 case ARMISD::CSINC:
19100 case ARMISD::CSINV:
19101 case ARMISD::CSNEG:
19102 return PerformCSETCombine(N, DCI.DAG);
19103 case ISD::LOAD:
19104 return PerformLOADCombine(N, DCI, Subtarget);
19105 case ARMISD::VLD1DUP:
19106 case ARMISD::VLD2DUP:
19107 case ARMISD::VLD3DUP:
19108 case ARMISD::VLD4DUP:
19109 return PerformVLDCombine(N, DCI);
19111 return PerformARMBUILD_VECTORCombine(N, DCI);
19112 case ISD::BITCAST:
19113 return PerformBITCASTCombine(N, DCI, Subtarget);
19114 case ARMISD::PREDICATE_CAST:
19115 return PerformPREDICATE_CASTCombine(N, DCI);
19116 case ARMISD::VECTOR_REG_CAST:
19117 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
19118 case ARMISD::MVETRUNC:
19119 return PerformMVETruncCombine(N, DCI);
19120 case ARMISD::MVESEXT:
19121 case ARMISD::MVEZEXT:
19122 return PerformMVEExtCombine(N, DCI);
19123 case ARMISD::VCMP:
19124 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
19125 case ISD::VECREDUCE_ADD:
19126 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
19127 case ARMISD::VADDVs:
19128 case ARMISD::VADDVu:
19129 case ARMISD::VADDLVs:
19130 case ARMISD::VADDLVu:
19131 case ARMISD::VADDLVAs:
19132 case ARMISD::VADDLVAu:
19133 case ARMISD::VMLAVs:
19134 case ARMISD::VMLAVu:
19135 case ARMISD::VMLALVs:
19136 case ARMISD::VMLALVu:
19137 case ARMISD::VMLALVAs:
19138 case ARMISD::VMLALVAu:
19139 return PerformReduceShuffleCombine(N, DCI.DAG);
19140 case ARMISD::VMOVN:
19141 return PerformVMOVNCombine(N, DCI);
19142 case ARMISD::VQMOVNs:
19143 case ARMISD::VQMOVNu:
19144 return PerformVQMOVNCombine(N, DCI);
19145 case ARMISD::VQDMULH:
19146 return PerformVQDMULHCombine(N, DCI);
19147 case ARMISD::ASRL:
19148 case ARMISD::LSRL:
19149 case ARMISD::LSLL:
19150 return PerformLongShiftCombine(N, DCI.DAG);
19151 case ARMISD::SMULWB: {
19152 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19153 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19154 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19155 return SDValue();
19156 break;
19157 }
19158 case ARMISD::SMULWT: {
19159 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19160 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19161 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19162 return SDValue();
19163 break;
19164 }
19165 case ARMISD::SMLALBB:
19166 case ARMISD::QADD16b:
19167 case ARMISD::QSUB16b:
19168 case ARMISD::UQADD16b:
19169 case ARMISD::UQSUB16b: {
19170 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19171 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19172 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19173 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19174 return SDValue();
19175 break;
19176 }
19177 case ARMISD::SMLALBT: {
19178 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19179 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19180 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19181 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19182 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19183 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19184 return SDValue();
19185 break;
19186 }
19187 case ARMISD::SMLALTB: {
19188 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19189 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19190 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19191 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19192 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19193 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19194 return SDValue();
19195 break;
19196 }
19197 case ARMISD::SMLALTT: {
19198 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19199 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19200 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19201 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19202 return SDValue();
19203 break;
19204 }
19205 case ARMISD::QADD8b:
19206 case ARMISD::QSUB8b:
19207 case ARMISD::UQADD8b:
19208 case ARMISD::UQSUB8b: {
19209 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19210 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19211 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19212 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19213 return SDValue();
19214 break;
19215 }
19216 case ARMISD::VBSP:
19217 if (N->getOperand(1) == N->getOperand(2))
19218 return N->getOperand(1);
19219 return SDValue();
19222 switch (N->getConstantOperandVal(1)) {
19223 case Intrinsic::arm_neon_vld1:
19224 case Intrinsic::arm_neon_vld1x2:
19225 case Intrinsic::arm_neon_vld1x3:
19226 case Intrinsic::arm_neon_vld1x4:
19227 case Intrinsic::arm_neon_vld2:
19228 case Intrinsic::arm_neon_vld3:
19229 case Intrinsic::arm_neon_vld4:
19230 case Intrinsic::arm_neon_vld2lane:
19231 case Intrinsic::arm_neon_vld3lane:
19232 case Intrinsic::arm_neon_vld4lane:
19233 case Intrinsic::arm_neon_vld2dup:
19234 case Intrinsic::arm_neon_vld3dup:
19235 case Intrinsic::arm_neon_vld4dup:
19236 case Intrinsic::arm_neon_vst1:
19237 case Intrinsic::arm_neon_vst1x2:
19238 case Intrinsic::arm_neon_vst1x3:
19239 case Intrinsic::arm_neon_vst1x4:
19240 case Intrinsic::arm_neon_vst2:
19241 case Intrinsic::arm_neon_vst3:
19242 case Intrinsic::arm_neon_vst4:
19243 case Intrinsic::arm_neon_vst2lane:
19244 case Intrinsic::arm_neon_vst3lane:
19245 case Intrinsic::arm_neon_vst4lane:
19246 return PerformVLDCombine(N, DCI);
19247 case Intrinsic::arm_mve_vld2q:
19248 case Intrinsic::arm_mve_vld4q:
19249 case Intrinsic::arm_mve_vst2q:
19250 case Intrinsic::arm_mve_vst4q:
19251 return PerformMVEVLDCombine(N, DCI);
19252 default: break;
19253 }
19254 break;
19255 }
19256 return SDValue();
19257}
19258
19260 EVT VT) const {
19261 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19262}
19263
19265 Align Alignment,
19267 unsigned *Fast) const {
19268 // Depends what it gets converted into if the type is weird.
19269 if (!VT.isSimple())
19270 return false;
19271
19272 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19273 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19274 auto Ty = VT.getSimpleVT().SimpleTy;
19275
19276 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19277 // Unaligned access can use (for example) LDRB, LDRH, LDR
19278 if (AllowsUnaligned) {
19279 if (Fast)
19280 *Fast = Subtarget->hasV7Ops();
19281 return true;
19282 }
19283 }
19284
19285 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19286 // For any little-endian targets with neon, we can support unaligned ld/st
19287 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19288 // A big-endian target may also explicitly support unaligned accesses
19289 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19290 if (Fast)
19291 *Fast = 1;
19292 return true;
19293 }
19294 }
19295
19296 if (!Subtarget->hasMVEIntegerOps())
19297 return false;
19298
19299 // These are for predicates
19300 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19301 Ty == MVT::v2i1)) {
19302 if (Fast)
19303 *Fast = 1;
19304 return true;
19305 }
19306
19307 // These are for truncated stores/narrowing loads. They are fine so long as
19308 // the alignment is at least the size of the item being loaded
19309 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19310 Alignment >= VT.getScalarSizeInBits() / 8) {
19311 if (Fast)
19312 *Fast = true;
19313 return true;
19314 }
19315
19316 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19317 // VSTRW.U32 all store the vector register in exactly the same format, and
19318 // differ only in the range of their immediate offset field and the required
19319 // alignment. So there is always a store that can be used, regardless of
19320 // actual type.
19321 //
19322 // For big endian, that is not the case. But can still emit a (VSTRB.U8;
19323 // VREV64.8) pair and get the same effect. This will likely be better than
19324 // aligning the vector through the stack.
19325 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19326 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19327 Ty == MVT::v2f64) {
19328 if (Fast)
19329 *Fast = 1;
19330 return true;
19331 }
19332
19333 return false;
19334}
19335
19337 LLVMContext &Context, const MemOp &Op,
19338 const AttributeList &FuncAttributes) const {
19339 // See if we can use NEON instructions for this...
19340 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19341 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19342 unsigned Fast;
19343 if (Op.size() >= 16 &&
19344 (Op.isAligned(Align(16)) ||
19345 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19347 Fast))) {
19348 return MVT::v2f64;
19349 } else if (Op.size() >= 8 &&
19350 (Op.isAligned(Align(8)) ||
19352 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19353 Fast))) {
19354 return MVT::f64;
19355 }
19356 }
19357
19358 // Let the target-independent logic figure it out.
19359 return MVT::Other;
19360}
19361
19362// 64-bit integers are split into their high and low parts and held in two
19363// different registers, so the trunc is free since the low register can just
19364// be used.
19365bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19366 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19367 return false;
19368 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19369 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19370 return (SrcBits == 64 && DestBits == 32);
19371}
19372
19374 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19375 !DstVT.isInteger())
19376 return false;
19377 unsigned SrcBits = SrcVT.getSizeInBits();
19378 unsigned DestBits = DstVT.getSizeInBits();
19379 return (SrcBits == 64 && DestBits == 32);
19380}
19381
19383 if (Val.getOpcode() != ISD::LOAD)
19384 return false;
19385
19386 EVT VT1 = Val.getValueType();
19387 if (!VT1.isSimple() || !VT1.isInteger() ||
19388 !VT2.isSimple() || !VT2.isInteger())
19389 return false;
19390
19391 switch (VT1.getSimpleVT().SimpleTy) {
19392 default: break;
19393 case MVT::i1:
19394 case MVT::i8:
19395 case MVT::i16:
19396 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19397 return true;
19398 }
19399
19400 return false;
19401}
19402
19404 if (!VT.isSimple())
19405 return false;
19406
19407 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19408 // negate values directly (fneg is free). So, we don't want to let the DAG
19409 // combiner rewrite fneg into xors and some other instructions. For f16 and
19410 // FullFP16 argument passing, some bitcast nodes may be introduced,
19411 // triggering this DAG combine rewrite, so we are avoiding that with this.
19412 switch (VT.getSimpleVT().SimpleTy) {
19413 default: break;
19414 case MVT::f16:
19415 return Subtarget->hasFullFP16();
19416 }
19417
19418 return false;
19419}
19420
19422 if (!Subtarget->hasMVEIntegerOps())
19423 return nullptr;
19424 Type *SVIType = SVI->getType();
19425 Type *ScalarType = SVIType->getScalarType();
19426
19427 if (ScalarType->isFloatTy())
19428 return Type::getInt32Ty(SVIType->getContext());
19429 if (ScalarType->isHalfTy())
19430 return Type::getInt16Ty(SVIType->getContext());
19431 return nullptr;
19432}
19433
19435 EVT VT = ExtVal.getValueType();
19436
19437 if (!isTypeLegal(VT))
19438 return false;
19439
19440 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19441 if (Ld->isExpandingLoad())
19442 return false;
19443 }
19444
19445 if (Subtarget->hasMVEIntegerOps())
19446 return true;
19447
19448 // Don't create a loadext if we can fold the extension into a wide/long
19449 // instruction.
19450 // If there's more than one user instruction, the loadext is desirable no
19451 // matter what. There can be two uses by the same instruction.
19452 if (ExtVal->use_empty() ||
19453 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19454 return true;
19455
19456 SDNode *U = *ExtVal->user_begin();
19457 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19458 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19459 return false;
19460
19461 return true;
19462}
19463
19465 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19466 return false;
19467
19468 if (!isTypeLegal(EVT::getEVT(Ty1)))
19469 return false;
19470
19471 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19472
19473 // Assuming the caller doesn't have a zeroext or signext return parameter,
19474 // truncation all the way down to i1 is valid.
19475 return true;
19476}
19477
19478/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19479/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19480/// expanded to FMAs when this method returns true, otherwise fmuladd is
19481/// expanded to fmul + fadd.
19482///
19483/// ARM supports both fused and unfused multiply-add operations; we already
19484/// lower a pair of fmul and fadd to the latter so it's not clear that there
19485/// would be a gain or that the gain would be worthwhile enough to risk
19486/// correctness bugs.
19487///
19488/// For MVE, we set this to true as it helps simplify the need for some
19489/// patterns (and we don't have the non-fused floating point instruction).
19490bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19491 EVT VT) const {
19492 if (Subtarget->useSoftFloat())
19493 return false;
19494
19495 if (!VT.isSimple())
19496 return false;
19497
19498 switch (VT.getSimpleVT().SimpleTy) {
19499 case MVT::v4f32:
19500 case MVT::v8f16:
19501 return Subtarget->hasMVEFloatOps();
19502 case MVT::f16:
19503 return Subtarget->useFPVFMx16();
19504 case MVT::f32:
19505 return Subtarget->useFPVFMx();
19506 case MVT::f64:
19507 return Subtarget->useFPVFMx64();
19508 default:
19509 break;
19510 }
19511
19512 return false;
19513}
19514
19515static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19516 if (V < 0)
19517 return false;
19518
19519 unsigned Scale = 1;
19520 switch (VT.getSimpleVT().SimpleTy) {
19521 case MVT::i1:
19522 case MVT::i8:
19523 // Scale == 1;
19524 break;
19525 case MVT::i16:
19526 // Scale == 2;
19527 Scale = 2;
19528 break;
19529 default:
19530 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19531 // Scale == 4;
19532 Scale = 4;
19533 break;
19534 }
19535
19536 if ((V & (Scale - 1)) != 0)
19537 return false;
19538 return isUInt<5>(V / Scale);
19539}
19540
19541static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19542 const ARMSubtarget *Subtarget) {
19543 if (!VT.isInteger() && !VT.isFloatingPoint())
19544 return false;
19545 if (VT.isVector() && Subtarget->hasNEON())
19546 return false;
19547 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19548 !Subtarget->hasMVEFloatOps())
19549 return false;
19550
19551 bool IsNeg = false;
19552 if (V < 0) {
19553 IsNeg = true;
19554 V = -V;
19555 }
19556
19557 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19558
19559 // MVE: size * imm7
19560 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19561 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19562 case MVT::i32:
19563 case MVT::f32:
19564 return isShiftedUInt<7,2>(V);
19565 case MVT::i16:
19566 case MVT::f16:
19567 return isShiftedUInt<7,1>(V);
19568 case MVT::i8:
19569 return isUInt<7>(V);
19570 default:
19571 return false;
19572 }
19573 }
19574
19575 // half VLDR: 2 * imm8
19576 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19577 return isShiftedUInt<8, 1>(V);
19578 // VLDR and LDRD: 4 * imm8
19579 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19580 return isShiftedUInt<8, 2>(V);
19581
19582 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19583 // + imm12 or - imm8
19584 if (IsNeg)
19585 return isUInt<8>(V);
19586 return isUInt<12>(V);
19587 }
19588
19589 return false;
19590}
19591
19592/// isLegalAddressImmediate - Return true if the integer value can be used
19593/// as the offset of the target addressing mode for load / store of the
19594/// given type.
19595static bool isLegalAddressImmediate(int64_t V, EVT VT,
19596 const ARMSubtarget *Subtarget) {
19597 if (V == 0)
19598 return true;
19599
19600 if (!VT.isSimple())
19601 return false;
19602
19603 if (Subtarget->isThumb1Only())
19604 return isLegalT1AddressImmediate(V, VT);
19605 else if (Subtarget->isThumb2())
19606 return isLegalT2AddressImmediate(V, VT, Subtarget);
19607
19608 // ARM mode.
19609 if (V < 0)
19610 V = - V;
19611 switch (VT.getSimpleVT().SimpleTy) {
19612 default: return false;
19613 case MVT::i1:
19614 case MVT::i8:
19615 case MVT::i32:
19616 // +- imm12
19617 return isUInt<12>(V);
19618 case MVT::i16:
19619 // +- imm8
19620 return isUInt<8>(V);
19621 case MVT::f32:
19622 case MVT::f64:
19623 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19624 return false;
19625 return isShiftedUInt<8, 2>(V);
19626 }
19627}
19628
19630 EVT VT) const {
19631 int Scale = AM.Scale;
19632 if (Scale < 0)
19633 return false;
19634
19635 switch (VT.getSimpleVT().SimpleTy) {
19636 default: return false;
19637 case MVT::i1:
19638 case MVT::i8:
19639 case MVT::i16:
19640 case MVT::i32:
19641 if (Scale == 1)
19642 return true;
19643 // r + r << imm
19644 Scale = Scale & ~1;
19645 return Scale == 2 || Scale == 4 || Scale == 8;
19646 case MVT::i64:
19647 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19648 // version in Thumb mode.
19649 // r + r
19650 if (Scale == 1)
19651 return true;
19652 // r * 2 (this can be lowered to r + r).
19653 if (!AM.HasBaseReg && Scale == 2)
19654 return true;
19655 return false;
19656 case MVT::isVoid:
19657 // Note, we allow "void" uses (basically, uses that aren't loads or
19658 // stores), because arm allows folding a scale into many arithmetic
19659 // operations. This should be made more precise and revisited later.
19660
19661 // Allow r << imm, but the imm has to be a multiple of two.
19662 if (Scale & 1) return false;
19663 return isPowerOf2_32(Scale);
19664 }
19665}
19666
19668 EVT VT) const {
19669 const int Scale = AM.Scale;
19670
19671 // Negative scales are not supported in Thumb1.
19672 if (Scale < 0)
19673 return false;
19674
19675 // Thumb1 addressing modes do not support register scaling excepting the
19676 // following cases:
19677 // 1. Scale == 1 means no scaling.
19678 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19679 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19680}
19681
19682/// isLegalAddressingMode - Return true if the addressing mode represented
19683/// by AM is legal for this target, for a load/store of the specified type.
19685 const AddrMode &AM, Type *Ty,
19686 unsigned AS, Instruction *I) const {
19687 EVT VT = getValueType(DL, Ty, true);
19688 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19689 return false;
19690
19691 // Can never fold addr of global into load/store.
19692 if (AM.BaseGV)
19693 return false;
19694
19695 switch (AM.Scale) {
19696 case 0: // no scale reg, must be "r+i" or "r", or "i".
19697 break;
19698 default:
19699 // ARM doesn't support any R+R*scale+imm addr modes.
19700 if (AM.BaseOffs)
19701 return false;
19702
19703 if (!VT.isSimple())
19704 return false;
19705
19706 if (Subtarget->isThumb1Only())
19707 return isLegalT1ScaledAddressingMode(AM, VT);
19708
19709 if (Subtarget->isThumb2())
19710 return isLegalT2ScaledAddressingMode(AM, VT);
19711
19712 int Scale = AM.Scale;
19713 switch (VT.getSimpleVT().SimpleTy) {
19714 default: return false;
19715 case MVT::i1:
19716 case MVT::i8:
19717 case MVT::i32:
19718 if (Scale < 0) Scale = -Scale;
19719 if (Scale == 1)
19720 return true;
19721 // r + r << imm
19722 return isPowerOf2_32(Scale & ~1);
19723 case MVT::i16:
19724 case MVT::i64:
19725 // r +/- r
19726 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19727 return true;
19728 // r * 2 (this can be lowered to r + r).
19729 if (!AM.HasBaseReg && Scale == 2)
19730 return true;
19731 return false;
19732
19733 case MVT::isVoid:
19734 // Note, we allow "void" uses (basically, uses that aren't loads or
19735 // stores), because arm allows folding a scale into many arithmetic
19736 // operations. This should be made more precise and revisited later.
19737
19738 // Allow r << imm, but the imm has to be a multiple of two.
19739 if (Scale & 1) return false;
19740 return isPowerOf2_32(Scale);
19741 }
19742 }
19743 return true;
19744}
19745
19746/// isLegalICmpImmediate - Return true if the specified immediate is legal
19747/// icmp immediate, that is the target has icmp instructions which can compare
19748/// a register against the immediate without having to materialize the
19749/// immediate into a register.
19751 // Thumb2 and ARM modes can use cmn for negative immediates.
19752 if (!Subtarget->isThumb())
19753 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19754 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19755 if (Subtarget->isThumb2())
19756 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19757 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19758 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19759 return Imm >= 0 && Imm <= 255;
19760}
19761
19762/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19763/// *or sub* immediate, that is the target has add or sub instructions which can
19764/// add a register with the immediate without having to materialize the
19765/// immediate into a register.
19767 // Same encoding for add/sub, just flip the sign.
19768 uint64_t AbsImm = AbsoluteValue(Imm);
19769 if (!Subtarget->isThumb())
19770 return ARM_AM::getSOImmVal(AbsImm) != -1;
19771 if (Subtarget->isThumb2())
19772 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19773 // Thumb1 only has 8-bit unsigned immediate.
19774 return AbsImm <= 255;
19775}
19776
19777// Return false to prevent folding
19778// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19779// if the folding leads to worse code.
19781 SDValue ConstNode) const {
19782 // Let the DAGCombiner decide for vector types and large types.
19783 const EVT VT = AddNode.getValueType();
19784 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19785 return true;
19786
19787 // It is worse if c0 is legal add immediate, while c1*c0 is not
19788 // and has to be composed by at least two instructions.
19789 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19790 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19791 const int64_t C0 = C0Node->getSExtValue();
19792 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19794 return true;
19795 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19796 return false;
19797
19798 // Default to true and let the DAGCombiner decide.
19799 return true;
19800}
19801
19803 bool isSEXTLoad, SDValue &Base,
19804 SDValue &Offset, bool &isInc,
19805 SelectionDAG &DAG) {
19806 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19807 return false;
19808
19809 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19810 // AddressingMode 3
19811 Base = Ptr->getOperand(0);
19813 int RHSC = (int)RHS->getZExtValue();
19814 if (RHSC < 0 && RHSC > -256) {
19815 assert(Ptr->getOpcode() == ISD::ADD);
19816 isInc = false;
19817 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19818 return true;
19819 }
19820 }
19821 isInc = (Ptr->getOpcode() == ISD::ADD);
19822 Offset = Ptr->getOperand(1);
19823 return true;
19824 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19825 // AddressingMode 2
19827 int RHSC = (int)RHS->getZExtValue();
19828 if (RHSC < 0 && RHSC > -0x1000) {
19829 assert(Ptr->getOpcode() == ISD::ADD);
19830 isInc = false;
19831 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19832 Base = Ptr->getOperand(0);
19833 return true;
19834 }
19835 }
19836
19837 if (Ptr->getOpcode() == ISD::ADD) {
19838 isInc = true;
19839 ARM_AM::ShiftOpc ShOpcVal=
19841 if (ShOpcVal != ARM_AM::no_shift) {
19842 Base = Ptr->getOperand(1);
19843 Offset = Ptr->getOperand(0);
19844 } else {
19845 Base = Ptr->getOperand(0);
19846 Offset = Ptr->getOperand(1);
19847 }
19848 return true;
19849 }
19850
19851 isInc = (Ptr->getOpcode() == ISD::ADD);
19852 Base = Ptr->getOperand(0);
19853 Offset = Ptr->getOperand(1);
19854 return true;
19855 }
19856
19857 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19858 return false;
19859}
19860
19862 bool isSEXTLoad, SDValue &Base,
19863 SDValue &Offset, bool &isInc,
19864 SelectionDAG &DAG) {
19865 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19866 return false;
19867
19868 Base = Ptr->getOperand(0);
19870 int RHSC = (int)RHS->getZExtValue();
19871 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19872 assert(Ptr->getOpcode() == ISD::ADD);
19873 isInc = false;
19874 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19875 return true;
19876 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19877 isInc = Ptr->getOpcode() == ISD::ADD;
19878 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19879 return true;
19880 }
19881 }
19882
19883 return false;
19884}
19885
/// Decide whether the address computation \p Ptr can be folded into an MVE
/// pre/post-indexed vector load or store of memory type \p VT.
///
/// Only ADD/SUB nodes whose second operand is a constant are considered. On
/// success the function returns true, sets Base to the first operand of
/// \p Ptr, sets Offset to a constant holding the magnitude of the immediate,
/// and sets \p isInc to tell whether the access increments or decrements.
/// \p isSEXTLoad is not consulted in the visible body.
///
/// NOTE(review): this listing appears to have dropped two lines: the
/// parameter line declaring Base/Offset and the statement defining RHS. Both
/// names are used below without a visible declaration; confirm against the
/// real source.
19886static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19887 bool isSEXTLoad, bool IsMasked, bool isLE,
19889 bool &isInc, SelectionDAG &DAG) {
19890 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19891 return false;
19892 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19893 return false;
19894
19895 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19896 // as opposed to a vldrw.32). This can allow extra addressing modes or
19897 // alignments for what is otherwise an equivalent instruction.
19898 bool CanChangeType = isLE && !IsMasked;
19899
19901 int RHSC = (int)RHS->getZExtValue();
19902
// Accept a non-zero offset that is a multiple of Scale and whose magnitude is
// strictly below Limit * Scale. isInc and Offset are written only when the
// check succeeds, so a failed attempt leaves them untouched for the next
// candidate in the chain below. A negative constant is only expected on an
// ADD (asserted) and is reported as a decrement by its magnitude.
19903 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19904 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19905 assert(Ptr->getOpcode() == ISD::ADD);
19906 isInc = false;
19907 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19908 return true;
19909 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19910 isInc = Ptr->getOpcode() == ISD::ADD;
19911 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19912 return true;
19913 }
19914 return false;
19915 };
19916
19917 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19918 // (in BE/masked) type.
19919 Base = Ptr->getOperand(0);
// v4i16 and v4i8/v8i8 are matched on their own with a fixed scale (2 and 1).
// Every other type tries scale 4, then 2, then 1; each step requires either
// CanChangeType or the listed vector types, and the scale-4 and scale-2 steps
// also require the matching alignment.
19920 if (VT == MVT::v4i16) {
19921 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19922 return true;
19923 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19924 if (IsInRange(RHSC, 0x80, 1))
19925 return true;
19926 } else if (Alignment >= 4 &&
19927 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19928 IsInRange(RHSC, 0x80, 4))
19929 return true;
19930 else if (Alignment >= 2 &&
19931 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19932 IsInRange(RHSC, 0x80, 2))
19933 return true;
19934 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19935 return true;
19936 return false;
19937}
19938
19939/// getPreIndexedAddressParts - Returns true if the node's address can be
19940/// legally represented as a pre-indexed load / store address. The base
19941/// pointer, offset and addressing mode are returned by reference.
19942bool
19944 SDValue &Offset,
19946 SelectionDAG &DAG) const {
19947 if (Subtarget->isThumb1Only())
19948 return false;
19949
19950 EVT VT;
19951 SDValue Ptr;
19952 Align Alignment;
19953 unsigned AS = 0;
19954 bool isSEXTLoad = false;
19955 bool IsMasked = false;
19956 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19957 Ptr = LD->getBasePtr();
19958 VT = LD->getMemoryVT();
19959 Alignment = LD->getAlign();
19960 AS = LD->getAddressSpace();
19961 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19962 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19963 Ptr = ST->getBasePtr();
19964 VT = ST->getMemoryVT();
19965 Alignment = ST->getAlign();
19966 AS = ST->getAddressSpace();
19967 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19968 Ptr = LD->getBasePtr();
19969 VT = LD->getMemoryVT();
19970 Alignment = LD->getAlign();
19971 AS = LD->getAddressSpace();
19972 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19973 IsMasked = true;
19975 Ptr = ST->getBasePtr();
19976 VT = ST->getMemoryVT();
19977 Alignment = ST->getAlign();
19978 AS = ST->getAddressSpace();
19979 IsMasked = true;
19980 } else
19981 return false;
19982
19983 unsigned Fast = 0;
19984 if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment,
19986 // Only generate post-increment or pre-increment forms when a real
19987 // hardware instruction exists for them. Do not emit postinc/preinc
19988 // if the operation will end up as a libcall.
19989 return false;
19990 }
19991
19992 bool isInc;
19993 bool isLegal = false;
19994 if (VT.isVector())
19995 isLegal = Subtarget->hasMVEIntegerOps() &&
19997 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19998 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19999 else {
20000 if (Subtarget->isThumb2())
20001 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
20002 Offset, isInc, DAG);
20003 else
20004 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
20005 Offset, isInc, DAG);
20006 }
20007 if (!isLegal)
20008 return false;
20009
20010 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
20011 return true;
20012}
20013
20014/// getPostIndexedAddressParts - Returns true if this node can be combined
20015/// with a load / store to form a post-indexed load / store. The base pointer,
20016/// offset and addressing mode are returned by reference.
20018 SDValue &Base,
20019 SDValue &Offset,
20021 SelectionDAG &DAG) const {
20022 EVT VT;
20023 SDValue Ptr;
20024 Align Alignment;
20025 bool isSEXTLoad = false, isNonExt;
20026 bool IsMasked = false;
20027 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20028 VT = LD->getMemoryVT();
20029 Ptr = LD->getBasePtr();
20030 Alignment = LD->getAlign();
20031 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20032 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
20033 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20034 VT = ST->getMemoryVT();
20035 Ptr = ST->getBasePtr();
20036 Alignment = ST->getAlign();
20037 isNonExt = !ST->isTruncatingStore();
20038 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
20039 VT = LD->getMemoryVT();
20040 Ptr = LD->getBasePtr();
20041 Alignment = LD->getAlign();
20042 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20043 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
20044 IsMasked = true;
20046 VT = ST->getMemoryVT();
20047 Ptr = ST->getBasePtr();
20048 Alignment = ST->getAlign();
20049 isNonExt = !ST->isTruncatingStore();
20050 IsMasked = true;
20051 } else
20052 return false;
20053
20054 if (Subtarget->isThumb1Only()) {
20055 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
20056 // must be non-extending/truncating, i32, with an offset of 4.
20057 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
20058 if (Op->getOpcode() != ISD::ADD || !isNonExt)
20059 return false;
20060 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
20061 if (!RHS || RHS->getZExtValue() != 4)
20062 return false;
20063 if (Alignment < Align(4))
20064 return false;
20065
20066 Offset = Op->getOperand(1);
20067 Base = Op->getOperand(0);
20068 AM = ISD::POST_INC;
20069 return true;
20070 }
20071
20072 bool isInc;
20073 bool isLegal = false;
20074 if (VT.isVector())
20075 isLegal = Subtarget->hasMVEIntegerOps() &&
20076 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
20077 Subtarget->isLittle(), Base, Offset,
20078 isInc, DAG);
20079 else {
20080 if (Subtarget->isThumb2())
20081 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
20082 isInc, DAG);
20083 else
20084 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
20085 isInc, DAG);
20086 }
20087 if (!isLegal)
20088 return false;
20089
20090 if (Ptr != Base) {
20091 // Swap base ptr and offset to catch more post-index load / store when
20092 // it's legal. In Thumb2 mode, offset must be an immediate.
20093 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
20094 !Subtarget->isThumb2())
20096
20097 // Post-indexed load / store update the base pointer.
20098 if (Ptr != Base)
20099 return false;
20100 }
20101
20102 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
20103 return true;
20104}
20105
20107 KnownBits &Known,
20108 const APInt &DemandedElts,
20109 const SelectionDAG &DAG,
20110 unsigned Depth) const {
20111 unsigned BitWidth = Known.getBitWidth();
20112 Known.resetAll();
20113 switch (Op.getOpcode()) {
20114 default: break;
20115 case ARMISD::ADDC:
20116 case ARMISD::ADDE:
20117 case ARMISD::SUBC:
20118 case ARMISD::SUBE:
20119 // Special cases when we convert a carry to a boolean.
20120 if (Op.getResNo() == 0) {
20121 SDValue LHS = Op.getOperand(0);
20122 SDValue RHS = Op.getOperand(1);
20123 // (ADDE 0, 0, C) will give us a single bit.
20124 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
20125 isNullConstant(RHS)) {
20127 return;
20128 }
20129 }
20130 break;
20131 case ARMISD::CMOV: {
20132 // Bits are known zero/one if known on the LHS and RHS.
20133 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
20134 if (Known.isUnknown())
20135 return;
20136
20137 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
20138 Known = Known.intersectWith(KnownRHS);
20139 return;
20140 }
20142 Intrinsic::ID IntID =
20143 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
20144 switch (IntID) {
20145 default: return;
20146 case Intrinsic::arm_ldaex:
20147 case Intrinsic::arm_ldrex: {
20148 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
20149 unsigned MemBits = VT.getScalarSizeInBits();
20150 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
20151 return;
20152 }
20153 }
20154 }
20155 case ARMISD::BFI: {
20156 // Conservatively, we can recurse down the first operand
20157 // and just mask out all affected bits.
20158 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20159
20160 // The operand to BFI is already a mask suitable for removing the bits it
20161 // sets.
20162 const APInt &Mask = Op.getConstantOperandAPInt(2);
20163 Known.Zero &= Mask;
20164 Known.One &= Mask;
20165 return;
20166 }
20167 case ARMISD::VGETLANEs:
20168 case ARMISD::VGETLANEu: {
20169 const SDValue &SrcSV = Op.getOperand(0);
20170 EVT VecVT = SrcSV.getValueType();
20171 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20172 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20173 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20174 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20175 "VGETLANE index out of bounds");
20176 unsigned Idx = Pos->getZExtValue();
20177 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20178 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20179
20180 EVT VT = Op.getValueType();
20181 const unsigned DstSz = VT.getScalarSizeInBits();
20182 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20183 (void)SrcSz;
20184 assert(SrcSz == Known.getBitWidth());
20185 assert(DstSz > SrcSz);
20186 if (Op.getOpcode() == ARMISD::VGETLANEs)
20187 Known = Known.sext(DstSz);
20188 else {
20189 Known = Known.zext(DstSz);
20190 }
20191 assert(DstSz == Known.getBitWidth());
20192 break;
20193 }
20194 case ARMISD::VMOVrh: {
20195 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20196 assert(KnownOp.getBitWidth() == 16);
20197 Known = KnownOp.zext(32);
20198 break;
20199 }
20200 case ARMISD::CSINC:
20201 case ARMISD::CSINV:
20202 case ARMISD::CSNEG: {
20203 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20204 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20205
20206 // The result is either:
20207 // CSINC: KnownOp0 or KnownOp1 + 1
20208 // CSINV: KnownOp0 or ~KnownOp1
20209 // CSNEG: KnownOp0 or KnownOp1 * -1
20210 if (Op.getOpcode() == ARMISD::CSINC)
20211 KnownOp1 =
20212 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20213 else if (Op.getOpcode() == ARMISD::CSINV)
20214 std::swap(KnownOp1.Zero, KnownOp1.One);
20215 else if (Op.getOpcode() == ARMISD::CSNEG)
20216 KnownOp1 = KnownBits::mul(KnownOp1,
20218
20219 Known = KnownOp0.intersectWith(KnownOp1);
20220 break;
20221 }
20222 case ARMISD::VORRIMM:
20223 case ARMISD::VBICIMM: {
20224 unsigned Encoded = Op.getConstantOperandVal(1);
20225 unsigned DecEltBits = 0;
20226 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
20227
20228 unsigned EltBits = Op.getScalarValueSizeInBits();
20229 if (EltBits != DecEltBits) {
20230 // Be conservative: only update Known when EltBits == DecEltBits.
20231 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20232 // that changes in the future, doing nothing here is safer than risking
20233 // subtle bugs.
20234 break;
20235 }
20236
20237 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20238 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20239 APInt Imm(DecEltBits, DecodedVal);
20240
20241 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20242 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20243 break;
20244 }
20245 }
20246}
20247
20248static bool isLegalLogicalImmediate(unsigned Imm,
20249 const ARMSubtarget *Subtarget) {
20250 if (!Subtarget->isThumb())
20251 return ARM_AM::getSOImmVal(Imm) != -1;
20252 if (Subtarget->isThumb2())
20253 return ARM_AM::getT2SOImmVal(Imm) != -1;
20254 // Thumb1 only has 8-bit unsigned immediate.
20255 return Imm <= 255;
20256}
20257
20258/// Refine i32 AND/OR/XOR with a constant RHS using demanded bits: replace the
20259/// immediate with an equivalent constant that ARM/Thumb can encode as a
20260/// logical immediate (or that selects better lowering), without changing the
20261/// computed result on those demanded bits.
///
/// \param Op            the logical node being refined; its first operand is
///                      reused unchanged in any replacement node.
/// \param Imm           the current constant operand of \p Op.
/// \param DemandedBits  bits of the result that users need; only the value
///                      returned by getZExtValue(), narrowed to unsigned, is
///                      used.
/// \param Subtarget     used for the hasV6Ops() checks and for immediate
///                      legality.
/// \returns false when no replacement is attempted. Otherwise the result of
///          the local UseImm helper: true if the chosen constant equals
///          \p Imm (nothing is rewritten), else whatever TLO.CombineTo returns.
///
/// NOTE(review): the final parameter line (declaring TLO) and the opening
/// brace appear to be missing from this listing; TLO is used below without a
/// visible declaration. Confirm against the real source.
20262static bool optimizeLogicalImm(SDValue Op, unsigned Imm,
20263 const APInt &DemandedBits,
20264 const ARMSubtarget *Subtarget,
20266
20267 if (Imm == 0 || Imm == ~0U)
20268 return false;
20269
20270 unsigned Opc = Op.getOpcode();
20271 unsigned Demanded = DemandedBits.getZExtValue();
20272 EVT VT = Op.getValueType();
20273
// ShrunkImm clears every non-demanded bit of Imm; ExpandedImm sets every
// non-demanded bit. Any constant between the two (bitwise) agrees with Imm on
// all demanded bits.
20274 unsigned ShrunkImm = Imm & Demanded;
20275 unsigned ExpandedImm = Imm | ~Demanded;
20276
// A candidate is acceptable when it contains every bit of ShrunkImm and has no
// bit set outside ExpandedImm.
20277 auto IsLegalImm = [ShrunkImm, ExpandedImm](unsigned CandidateImm) -> bool {
20278 return (ShrunkImm & CandidateImm) == ShrunkImm &&
20279 (~ExpandedImm & CandidateImm) == 0;
20280 };
// Rebuild Op with NewImm as its constant operand and hand the result to
// TLO.CombineTo. If NewImm is already the current immediate, no node is
// created and true is returned.
20281 auto UseImm = [Imm, Opc, Op, VT, &TLO](unsigned NewImm) -> bool {
20282 if (NewImm == Imm)
20283 return true;
20284 SDLoc DL(Op);
20285 SDValue NewC = TLO.DAG.getConstant(NewImm, DL, VT);
20286 SDValue NewOp =
20287 TLO.DAG.getNode(Opc, DL, VT, Op.getOperand(0), NewC, Op->getFlags());
20288 return TLO.CombineTo(Op, NewOp);
20289 };
20290
20291 // Shrunk immediate is 0: AND becomes zero; OR/XOR with 0 leaves the other
20292 // operand (still valid on demanded bits).
20293 if (ShrunkImm == 0) {
20294 ++NumOptimizedImms;
20295 return UseImm(ShrunkImm);
20296 }
20297
20298 // If the expanded immediate is all ones: for AND this removes the operation;
20299 // for OR/XOR it remains a transform valid on demanded bits. (Target-independent
20300 // shrink may not fold this, so keep it to avoid obscure combine loops.)
20301 if (ExpandedImm == ~0U) {
20302 ++NumOptimizedImms;
20303 return UseImm(ExpandedImm);
20304 }
20305
20306 // v6+ subtargets: prefer 0xFF / 0xFFFF when they fit the demanded-bit envelope
20307 // so lowering can match uxtb / uxth (AND immediates only; OR/XOR do not use
20308 // that). Run this before strict ShrunkImm: a tight 8-bit ShrunkImm can be
20309 // legal while 0xFF still matches the envelope and yields better isel (uxtb).
20310 if (Opc == ISD::AND && Subtarget->hasV6Ops()) {
20311 if (IsLegalImm(0xFF)) {
20312 ++NumOptimizedImms;
20313 return UseImm(0xFF);
20314 }
20315
20316 if (IsLegalImm(0xFFFF)) {
20317 ++NumOptimizedImms;
20318 return UseImm(0xFFFF);
20319 }
20320 }
20321
20322 // Don't optimize if it is legal.
20323 if (isLegalLogicalImmediate(Imm, Subtarget))
20324 return false;
20325
20326 // FIXME: Check for BIC being legal causes infinite loop due to target
20327 // independent DAG combine undoing this.
20328
20329 // Prefer strict shrink when ShrunkImm encodes for this target, before
20330 // complement expansion.
20331 if (isLegalLogicalImmediate(ShrunkImm, Subtarget)) {
20332 ++NumOptimizedImms;
20333 return UseImm(ShrunkImm);
20334 }
20335
20336 // Complement expansion: ExpandedImm is Imm with every non-demanded bit set.
20337 // When (~ExpandedImm) < 256, the complement fits in an 8-bit unsigned value,
20338 // i.e. bits 8-31 of ExpandedImm are all ones; only the low byte may differ
20339 // from ~0. Use that expanded constant so isel sees a mask shape that fits
20340 // logical-immediate patterns.
20341 if ((~ExpandedImm) < 256) {
20342 ++NumOptimizedImms;
20343 return UseImm(ExpandedImm);
20344 }
20345
20346 // FIXME: The check for v6 is because this interferes with some ubfx
20347 // optimizations.
20348 if (Opc == ISD::AND && isLegalLogicalImmediate(~ExpandedImm, Subtarget) &&
20349 !Subtarget->hasV6Ops()) {
20350 ++NumOptimizedImms;
20351 return UseImm(ExpandedImm);
20352 }
20353
20354 // Potential improvements:
20355 //
20356 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20357 // We could try to prefer Thumb1 immediates which can be lowered to a
20358 // two-instruction sequence.
20359
20360 return false;
20361}
20362
20364 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20365 TargetLoweringOpt &TLO) const {
20366 // Delay this optimization to as late as possible.
20367 if (!TLO.LegalOps)
20368 return false;
20369
20370 EVT VT = Op.getValueType();
20371
20372 // Ignore vectors.
20373 if (VT.isVector())
20374 return false;
20375
20376 unsigned Size = VT.getSizeInBits();
20377
20378 if (Size != 32)
20379 return false;
20380
20381 // Exit early if we demand all bits.
20382 if (DemandedBits.isAllOnes())
20383 return false;
20384
20385 switch (Op.getOpcode()) {
20386 default:
20387 return false;
20388 case ISD::AND:
20389 case ISD::OR:
20390 case ISD::XOR:
20391 break;
20392 }
20393 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20394 if (!C)
20395 return false;
20396 unsigned Imm = C->getZExtValue();
20397 return optimizeLogicalImm(Op, Imm, DemandedBits, Subtarget, TLO);
20398}
20399
20401 SDValue Op, const APInt &OriginalDemandedBits,
20402 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20403 unsigned Depth) const {
20404 unsigned Opc = Op.getOpcode();
20405
20406 switch (Opc) {
20407 case ARMISD::ASRL:
20408 case ARMISD::LSRL: {
20409 // If this is result 0 and the other result is unused, see if the demand
20410 // bits allow us to shrink this long shift into a standard small shift in
20411 // the opposite direction.
20412 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20413 isa<ConstantSDNode>(Op->getOperand(2))) {
20414 unsigned ShAmt = Op->getConstantOperandVal(2);
20415 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20416 << (32 - ShAmt)))
20417 return TLO.CombineTo(
20418 Op, TLO.DAG.getNode(
20419 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20420 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20421 }
20422 break;
20423 }
20424 case ARMISD::VBICIMM: {
20425 SDValue Op0 = Op.getOperand(0);
20426 unsigned ModImm = Op.getConstantOperandVal(1);
20427 unsigned EltBits = 0;
20428 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20429 if ((OriginalDemandedBits & Mask) == 0)
20430 return TLO.CombineTo(Op, Op0);
20431 }
20432 }
20433
20435 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20436}
20437
20438//===----------------------------------------------------------------------===//
20439// ARM Inline Assembly Support
20440//===----------------------------------------------------------------------===//
20441
20442const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20443 // At this point, we have to lower this constraint to something else, so we
20444 // lower it to an "r" or "w". However, by doing this we will force the result
20445 // to be in register, while the X constraint is much more permissive.
20446 //
20447 // Although we are correct (we are free to emit anything, without
20448 // constraints), we might break use cases that would expect us to be more
20449 // efficient and emit something else.
20450 if (!Subtarget->hasVFP2Base())
20451 return "r";
20452 if (ConstraintVT.isFloatingPoint())
20453 return "w";
20454 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20455 (ConstraintVT.getSizeInBits() == 64 ||
20456 ConstraintVT.getSizeInBits() == 128))
20457 return "w";
20458
20459 return "r";
20460}
20461
20462/// getConstraintType - Given a constraint letter, return the type of
20463/// constraint it is for this target.
20466 unsigned S = Constraint.size();
20467 if (S == 1) {
20468 switch (Constraint[0]) {
20469 default: break;
20470 case 'l': return C_RegisterClass;
20471 case 'w': return C_RegisterClass;
20472 case 'h': return C_RegisterClass;
20473 case 'x': return C_RegisterClass;
20474 case 't': return C_RegisterClass;
20475 case 'j': return C_Immediate; // Constant for movw.
20476 // An address with a single base register. Due to the way we
20477 // currently handle addresses it is the same as an 'r' memory constraint.
20478 case 'Q': return C_Memory;
20479 }
20480 } else if (S == 2) {
20481 switch (Constraint[0]) {
20482 default: break;
20483 case 'T': return C_RegisterClass;
20484 // All 'U+' constraints are addresses.
20485 case 'U': return C_Memory;
20486 }
20487 }
20488 return TargetLowering::getConstraintType(Constraint);
20489}
20490
20491/// Examine constraint type and operand type and determine a weight value.
20492/// This object must already have been set up with the operand type
20493/// and the current alternative constraint selected.
20496 AsmOperandInfo &info, const char *constraint) const {
20498 Value *CallOperandVal = info.CallOperandVal;
20499 // If we don't have a value, we can't do a match,
20500 // but allow it at the lowest weight.
20501 if (!CallOperandVal)
20502 return CW_Default;
20503 Type *type = CallOperandVal->getType();
20504 // Look at the constraint type.
20505 switch (*constraint) {
20506 default:
20508 break;
20509 case 'l':
20510 if (type->isIntegerTy()) {
20511 if (Subtarget->isThumb())
20512 weight = CW_SpecificReg;
20513 else
20514 weight = CW_Register;
20515 }
20516 break;
20517 case 'w':
20518 if (type->isFloatingPointTy())
20519 weight = CW_Register;
20520 break;
20521 }
20522 return weight;
20523}
20524
20525static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20526 if (PR == 0 || VT == MVT::Other)
20527 return false;
20528 if (ARM::SPRRegClass.contains(PR))
20529 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20530 if (ARM::DPRRegClass.contains(PR))
20531 return VT != MVT::f64 && !VT.is64BitVector();
20532 return false;
20533}
20534
/// Pair of an unsigned value and a register class. The constraint-letter
/// cases that build it in this file pass 0U as the first member, alongside
/// the chosen class.
/// NOTE(review): presumably the first member is a physical register number
/// with 0 meaning "none"; confirm against the TargetLowering interface.
20535using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20536
20538 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20539 switch (Constraint.size()) {
20540 case 1:
20541 // GCC ARM Constraint Letters
20542 switch (Constraint[0]) {
20543 case 'l': // Low regs or general regs.
20544 if (Subtarget->isThumb())
20545 return RCPair(0U, &ARM::tGPRRegClass);
20546 return RCPair(0U, &ARM::GPRRegClass);
20547 case 'h': // High regs or no regs.
20548 if (Subtarget->isThumb())
20549 return RCPair(0U, &ARM::hGPRRegClass);
20550 break;
20551 case 'r':
20552 if (Subtarget->isThumb1Only())
20553 return RCPair(0U, &ARM::tGPRRegClass);
20554 return RCPair(0U, &ARM::GPRRegClass);
20555 case 'w':
20556 if (VT == MVT::Other)
20557 break;
20558 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20559 return RCPair(0U, &ARM::SPRRegClass);
20560 if (VT.getSizeInBits() == 64)
20561 return RCPair(0U, &ARM::DPRRegClass);
20562 if (VT.getSizeInBits() == 128)
20563 return RCPair(0U, &ARM::QPRRegClass);
20564 break;
20565 case 'x':
20566 if (VT == MVT::Other)
20567 break;
20568 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20569 return RCPair(0U, &ARM::SPR_8RegClass);
20570 if (VT.getSizeInBits() == 64)
20571 return RCPair(0U, &ARM::DPR_8RegClass);
20572 if (VT.getSizeInBits() == 128)
20573 return RCPair(0U, &ARM::QPR_8RegClass);
20574 break;
20575 case 't':
20576 if (VT == MVT::Other)
20577 break;
20578 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20579 return RCPair(0U, &ARM::SPRRegClass);
20580 if (VT.getSizeInBits() == 64)
20581 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20582 if (VT.getSizeInBits() == 128)
20583 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20584 break;
20585 }
20586 break;
20587
20588 case 2:
20589 if (Constraint[0] == 'T') {
20590 switch (Constraint[1]) {
20591 default:
20592 break;
20593 case 'e':
20594 return RCPair(0U, &ARM::tGPREvenRegClass);
20595 case 'o':
20596 return RCPair(0U, &ARM::tGPROddRegClass);
20597 }
20598 }
20599 break;
20600
20601 default:
20602 break;
20603 }
20604
20605 if (StringRef("{cc}").equals_insensitive(Constraint))
20606 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20607
20608 // r14 is an alias of lr.
20609 if (StringRef("{r14}").equals_insensitive(Constraint))
20610 return std::make_pair(unsigned(ARM::LR), getRegClassFor(MVT::i32));
20611
20612 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20613 if (isIncompatibleReg(RCP.first, VT))
20614 return {0, nullptr};
20615 return RCP;
20616}
20617
20618/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20619/// vector. If it is invalid, don't add anything to Ops.
20621 StringRef Constraint,
20622 std::vector<SDValue> &Ops,
20623 SelectionDAG &DAG) const {
20624 SDValue Result;
20625
20626 // Currently only support length 1 constraints.
20627 if (Constraint.size() != 1)
20628 return;
20629
20630 char ConstraintLetter = Constraint[0];
20631 switch (ConstraintLetter) {
20632 default: break;
20633 case 'j':
20634 case 'I': case 'J': case 'K': case 'L':
20635 case 'M': case 'N': case 'O':
20637 if (!C)
20638 return;
20639
20640 int64_t CVal64 = C->getSExtValue();
20641 int CVal = (int) CVal64;
20642 // None of these constraints allow values larger than 32 bits. Check
20643 // that the value fits in an int.
20644 if (CVal != CVal64)
20645 return;
20646
20647 switch (ConstraintLetter) {
20648 case 'j':
20649 // Constant suitable for movw, must be between 0 and
20650 // 65535.
20651 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20652 if (CVal >= 0 && CVal <= 65535)
20653 break;
20654 return;
20655 case 'I':
20656 if (Subtarget->isThumb1Only()) {
20657 // This must be a constant between 0 and 255, for ADD
20658 // immediates.
20659 if (CVal >= 0 && CVal <= 255)
20660 break;
20661 } else if (Subtarget->isThumb2()) {
20662 // A constant that can be used as an immediate value in a
20663 // data-processing instruction.
20664 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20665 break;
20666 } else {
20667 // A constant that can be used as an immediate value in a
20668 // data-processing instruction.
20669 if (ARM_AM::getSOImmVal(CVal) != -1)
20670 break;
20671 }
20672 return;
20673
20674 case 'J':
20675 if (Subtarget->isThumb1Only()) {
20676 // This must be a constant between -255 and -1, for negated ADD
20677 // immediates. This can be used in GCC with an "n" modifier that
20678 // prints the negated value, for use with SUB instructions. It is
20679 // not useful otherwise but is implemented for compatibility.
20680 if (CVal >= -255 && CVal <= -1)
20681 break;
20682 } else {
20683 // This must be a constant between -4095 and 4095. This is suitable
20684 // for use as the immediate offset field in LDR and STR instructions
20685 // such as LDR r0,[r1,#offset].
20686 if (CVal >= -4095 && CVal <= 4095)
20687 break;
20688 }
20689 return;
20690
20691 case 'K':
20692 if (Subtarget->isThumb1Only()) {
20693 // A 32-bit value where only one byte has a nonzero value. Exclude
20694 // zero to match GCC. This constraint is used by GCC internally for
20695 // constants that can be loaded with a move/shift combination.
20696 // It is not useful otherwise but is implemented for compatibility.
20697 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20698 break;
20699 } else if (Subtarget->isThumb2()) {
20700 // A constant whose bitwise inverse can be used as an immediate
20701 // value in a data-processing instruction. This can be used in GCC
20702 // with a "B" modifier that prints the inverted value, for use with
20703 // BIC and MVN instructions. It is not useful otherwise but is
20704 // implemented for compatibility.
20705 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20706 break;
20707 } else {
20708 // A constant whose bitwise inverse can be used as an immediate
20709 // value in a data-processing instruction. This can be used in GCC
20710 // with a "B" modifier that prints the inverted value, for use with
20711 // BIC and MVN instructions. It is not useful otherwise but is
20712 // implemented for compatibility.
20713 if (ARM_AM::getSOImmVal(~CVal) != -1)
20714 break;
20715 }
20716 return;
20717
20718 case 'L':
20719 if (Subtarget->isThumb1Only()) {
20720 // This must be a constant between -7 and 7,
20721 // for 3-operand ADD/SUB immediate instructions.
20722 if (CVal >= -7 && CVal < 7)
20723 break;
20724 } else if (Subtarget->isThumb2()) {
20725 // A constant whose negation can be used as an immediate value in a
20726 // data-processing instruction. This can be used in GCC with an "n"
20727 // modifier that prints the negated value, for use with SUB
20728 // instructions. It is not useful otherwise but is implemented for
20729 // compatibility.
20730 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20731 break;
20732 } else {
20733 // A constant whose negation can be used as an immediate value in a
20734 // data-processing instruction. This can be used in GCC with an "n"
20735 // modifier that prints the negated value, for use with SUB
20736 // instructions. It is not useful otherwise but is implemented for
20737 // compatibility.
20738 if (ARM_AM::getSOImmVal(-CVal) != -1)
20739 break;
20740 }
20741 return;
20742
20743 case 'M':
20744 if (Subtarget->isThumb1Only()) {
20745 // This must be a multiple of 4 between 0 and 1020, for
20746 // ADD sp + immediate.
20747 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20748 break;
20749 } else {
20750 // A power of two or a constant between 0 and 32. This is used in
20751 // GCC for the shift amount on shifted register operands, but it is
20752 // useful in general for any shift amounts.
20753 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20754 break;
20755 }
20756 return;
20757
20758 case 'N':
20759 if (Subtarget->isThumb1Only()) {
20760 // This must be a constant between 0 and 31, for shift amounts.
20761 if (CVal >= 0 && CVal <= 31)
20762 break;
20763 }
20764 return;
20765
20766 case 'O':
20767 if (Subtarget->isThumb1Only()) {
20768 // This must be a multiple of 4 between -508 and 508, for
20769 // ADD/SUB sp = sp + immediate.
20770 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20771 break;
20772 }
20773 return;
20774 }
20775 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20776 break;
20777 }
20778
20779 if (Result.getNode()) {
20780 Ops.push_back(Result);
20781 return;
20782 }
20783 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20784}
20785
20786static RTLIB::Libcall getDivRemLibcall(
20787 const SDNode *N, MVT::SimpleValueType SVT) {
20788 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20789 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20790 "Unhandled Opcode in getDivRemLibcall");
20791 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20792 N->getOpcode() == ISD::SREM;
20793 RTLIB::Libcall LC;
20794 switch (SVT) {
20795 default: llvm_unreachable("Unexpected request for libcall!");
20796 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20797 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20798 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20799 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20800 }
20801 return LC;
20802}
20803
// Builds the libcall argument list for a div/rem node: one entry per operand
// of N, typed from the operand's value type, and flagged sign-extended for
// SDIVREM/SREM or zero-extended for UDIVREM/UREM.
// NOTE(review): the opening line of the signature and the declaration of
// `Args` are missing from this listing; `Args` is presumably the argument
// list object that is returned -- confirm against the upstream file.
20805 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20806 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20807 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20808 "Unhandled Opcode in getDivRemArgList");
20809 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20810 N->getOpcode() == ISD::SREM;
20812 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20813 EVT ArgVT = N->getOperand(i).getValueType();
20814 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20815 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
// Exactly one of the two extension flags is set on every entry.
20816 Entry.IsSExt = isSigned;
20817 Entry.IsZExt = !isSigned;
20818 Args.push_back(Entry);
20819 }
// On Windows targets the first two arguments are exchanged; the size check
// guards the swap against lists with fewer than two entries.
20820 if (Subtarget->getTargetTriple().isOSWindows() && Args.size() >= 2)
20821 std::swap(Args[0], Args[1]);
20822 return Args;
20823}
20824
// Custom lowering for SDIVREM/UDIVREM. Three strategies are tried in order:
//   1. i64 with a constant divisor: expandDIVREMByConstant, whose four i32
//      pieces are paired back into the two i64 results.
//   2. i32 on a subtarget with a hardware divider in the current mode:
//      DIV + MUL + SUB nodes.
//   3. Otherwise a call to the libcall chosen by getDivRemLibcall; the first
//      element of the pair returned by LowerCallTo is returned.
// NOTE(review): the declarations of `Result` and `Args` and the tail of the
// CLI builder chain are missing from this listing.
20825 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20826 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20827 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20828 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20829 "Register-based DivRem lowering only");
20830 unsigned Opcode = Op->getOpcode();
20831 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20832 "Invalid opcode for Div/Rem lowering");
20833 bool isSigned = (Opcode == ISD::SDIVREM);
20834 EVT VT = Op->getValueType(0);
20835 SDLoc dl(Op);
20836
// Strategy 1: 64-bit division by a constant, expanded in terms of i32.
20837 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20839 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
// Result[0..1] are paired into the first output, Result[2..3] into the second.
20840 SDValue Res0 =
20841 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20842 SDValue Res1 =
20843 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20844 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20845 {Res0, Res1});
20846 }
20847 }
20848
20849 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20850
20851 // If the target has hardware divide, use divide + multiply + subtract:
20852 // div = a / b
20853 // rem = a - b * div
20854 // return {div, rem}
20855 // This should be lowered into UDIV/SDIV + MLS later on.
20856 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20857 : Subtarget->hasDivideInARMMode();
20858 if (hasDivide && Op->getValueType(0).isSimple() &&
20859 Op->getSimpleValueType(0) == MVT::i32) {
20860 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20861 const SDValue Dividend = Op->getOperand(0);
20862 const SDValue Divisor = Op->getOperand(1);
20863 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20864 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20865 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20866
20867 SDValue Values[2] = {Div, Rem};
20868 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20869 }
20870
// Strategy 3: fall back to the runtime library.
20871 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20872 VT.getSimpleVT().SimpleTy);
20873 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20874
20875 SDValue InChain = DAG.getEntryNode();
20876
20878 DAG.getContext(),
20879 Subtarget);
20880
20881 SDValue Callee =
20882 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20883
// The call's return type is a two-element struct of the operand type.
20884 Type *RetTy = StructType::get(Ty, Ty);
20885
// On Windows the call is chained after WinDBZCheckDenominator's output.
20886 if (getTM().getTargetTriple().isOSWindows())
20887 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20888
20889 TargetLowering::CallLoweringInfo CLI(DAG);
20890 CLI.setDebugLoc(dl)
20891 .setChain(InChain)
20892 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20893 Callee, std::move(Args))
20894 .setInRegister()
20897
20898 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20899 return CallInfo.first;
20900}
20901
20902 // Lowers REM using divmod helpers
20903 // see RTABI section 4.2/4.3
// An i64 remainder by a constant is first offered to expandDIVREMByConstant.
// Otherwise the node becomes a call to the libcall chosen by getDivRemLibcall,
// whose return type is a two-element struct, and operand 1 of the call's
// result node (the remainder) is returned.
// NOTE(review): the declarations of `Result` and `Args` and part of the CLI
// builder chain are missing from this listing; `isSigned` is presumably
// consumed by one of the missing builder calls -- confirm.
20904 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20905 EVT VT = N->getValueType(0);
20906
20907 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20909 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20910 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20911 Result[0], Result[1]);
20912 }
20913
20914 // Build return types (div and rem)
20915 std::vector<Type*> RetTyParams;
20916 Type *RetTyElement;
20917
20918 switch (VT.getSimpleVT().SimpleTy) {
20919 default: llvm_unreachable("Unexpected request for libcall!");
20920 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20921 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20922 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20923 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20924 }
20925
// Both struct members have the same integer type.
20926 RetTyParams.push_back(RetTyElement);
20927 RetTyParams.push_back(RetTyElement);
20928 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20929 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20930
20931 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20932 SimpleTy);
20933 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20934 SDValue InChain = DAG.getEntryNode();
20936 Subtarget);
20937 bool isSigned = N->getOpcode() == ISD::SREM;
20938
20939 SDValue Callee =
20940 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20941
// On Windows the call is chained after WinDBZCheckDenominator's output.
20942 if (getTM().getTargetTriple().isOSWindows())
20943 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20944
20945 // Lower call
20946 CallLoweringInfo CLI(DAG);
20947 CLI.setChain(InChain)
20948 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20949 Callee, std::move(Args))
20952 .setDebugLoc(SDLoc(N));
20953 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20954
20955 // Return second (rem) result operand (first contains div)
20956 SDNode *ResNode = CallResult.first.getNode();
20957 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20958 return ResNode->getOperand(1);
20959}
20960
// Custom lowering of DYNAMIC_STACKALLOC; asserted to be used on Windows only.
// Operand 0 is the chain, operand 1 the size and operand 2 the alignment.
// Returns the merged pair {new stack pointer, chain}.
//
// Two paths exist:
//   * "no-stack-arg-probe": SP is adjusted inline (subtract, optional
//     alignment mask, copy back to SP).
//   * otherwise: Size >> 2 is placed in R4, a WIN__CHKSTK node is emitted and
//     SP is read back afterwards.
// NOTE(review): the line that opens the first `if` is missing from this
// listing; "no-stack-arg-probe" is presumably tested as a function attribute
// -- confirm against the upstream file.
20961 SDValue
20962 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20963 assert(getTM().getTargetTriple().isOSWindows() &&
20964 "unsupported target platform");
20965 SDLoc DL(Op);
20966
20967 // Get the inputs.
20968 SDValue Chain = Op.getOperand(0);
20969 SDValue Size = Op.getOperand(1);
20970
20972 "no-stack-arg-probe")) {
20973 MaybeAlign Align =
20974 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20975 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20976 Chain = SP.getValue(1);
20977 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
// ANDing with the negated alignment clears the low bits of the new SP.
20978 if (Align)
20979 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20980 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20981 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20982 SDValue Ops[2] = { SP, Chain };
20983 return DAG.getMergeValues(Ops, DL);
20984 }
20985
// Probe path: the byte size is shifted right by two before going into R4.
20986 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20987 DAG.getConstant(2, DL, MVT::i32));
20988
20989 SDValue Glue;
20990 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20991 Glue = Chain.getValue(1);
20992
// The glue value ties the R4 copy to the WIN__CHKSTK node.
20993 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20994 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20995
20996 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20997 Chain = NewSP.getValue(1);
20998
20999 SDValue Ops[2] = { NewSP, Chain };
21000 return DAG.getMergeValues(Ops, DL);
21001}
21002
// Custom lowering for FP_EXTEND and its strict form. For strict nodes the
// chain is operand 0 and the source value operand 1, and the result is the
// merged pair {value, chain}.
//
// A 32 -> 64 extend on a subtarget with FP64 is kept as a node. Every other
// case is widened one step at a time (16 -> 32, then 32 -> 64); each step is
// an FP_EXTEND node when the subtarget supports it and a libcall from
// RTLIB::getFPEXT otherwise.
// NOTE(review): the line that opens the definition of `Result` in the strict
// 32 -> 64 path is missing from this listing.
21003 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21004 bool IsStrict = Op->isStrictFPOpcode();
21005 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
21006 const unsigned DstSz = Op.getValueType().getSizeInBits();
21007 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
21008 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
21009 "Unexpected type for custom-lowering FP_EXTEND");
21010
21011 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
21012 "With both FP DP and 16, any FP conversion is legal!");
21013
21014 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
21015 "With FP16, 16 to 32 conversion is legal!");
21016
21017 // Converting from 32 -> 64 is valid if we have FP64.
21018 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
21019 // FIXME: Remove this when we have strict fp instruction selection patterns
21020 if (IsStrict) {
21021 SDLoc Loc(Op);
21023 Loc, Op.getValueType(), SrcVal);
// The incoming chain (operand 0) is passed through unchanged.
21024 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
21025 }
21026 return Op;
21027 }
21028
21029 // Either we are converting from 16 -> 64, without FP16 and/or
21030 // FP.double-precision or without Armv8-fp. So we must do it in two
21031 // steps.
21032 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
21033 // without FP16. So we must do a function call.
21034 SDLoc Loc(Op);
21035 RTLIB::Libcall LC;
21036 MakeLibCallOptions CallOptions;
21037 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// Sz doubles each iteration and the loop stops once it exceeds 32 or reaches
// DstSz, so at most two widening steps are emitted.
21038 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
21039 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
21040 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
21041 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
21042 if (Supported) {
21043 if (IsStrict) {
21044 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
21045 {DstVT, MVT::Other}, {Chain, SrcVal});
21046 Chain = SrcVal.getValue(1);
21047 } else {
21048 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
21049 }
21050 } else {
21051 LC = RTLIB::getFPEXT(SrcVT, DstVT);
21052 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
21053 "Unexpected type for custom-lowering FP_EXTEND");
// makeLibCall yields both the widened value and the updated chain.
21054 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
21055 Loc, Chain);
21056 }
21057 }
21058
21059 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
21060}
21061
// Custom lowering for FP_ROUND and its strict form. For strict nodes the chain
// is operand 0 and the source value operand 1, and the result is the merged
// pair {value, chain}.
//
// A 32-bit source on a subtarget with FP16 is returned unchanged; every other
// case becomes the libcall given by RTLIB::getFPROUND.
// NOTE(review): the declaration of `Result` is missing from this listing.
21062 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21063 bool IsStrict = Op->isStrictFPOpcode();
21064
21065 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
21066 EVT SrcVT = SrcVal.getValueType();
21067 EVT DstVT = Op.getValueType();
21068 const unsigned DstSz = Op.getValueType().getSizeInBits();
21069 const unsigned SrcSz = SrcVT.getSizeInBits();
// DstSz is only read by the assert below.
21070 (void)DstSz;
21071 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
21072 "Unexpected type for custom-lowering FP_ROUND");
21073
21074 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
21075 "With both FP DP and 16, any FP conversion is legal!");
21076
21077 SDLoc Loc(Op);
21078
21079 // Instruction from 32 -> 16 if hasFP16 is valid
21080 if (SrcSz == 32 && Subtarget->hasFP16())
21081 return Op;
21082
21083 // Lib call from 32 -> 16 / 64 -> [32, 16]
21084 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
21085 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
21086 "Unexpected type for custom-lowering FP_ROUND");
21087 MakeLibCallOptions CallOptions;
21088 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21090 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
21091 Loc, Chain);
21092 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
21093}
21094
// Unconditionally returns false.
// NOTE(review): the line carrying the function name and parameters is missing
// from this listing.
21095 bool
21097 // The ARM target isn't yet aware of offsets.
21098 return false;
21099}
21100
// Returns false for the all-ones value; otherwise returns whether the
// complement of v satisfies isShiftedMask_32.
// NOTE(review): the signature line is missing from this listing; `v` is
// presumably a 32-bit unsigned parameter -- confirm.
21102 if (v == 0xffffffff)
21103 return false;
21104
21105 // there can be 1's on either or both "outsides", all the "inside"
21106 // bits must be 0's
21107 return isShiftedMask_32(~v);
21108}
21109
21110 /// isFPImmLegal - Returns true if the target can instruction select the
21111 /// specified FP immediate natively. If false, the legalizer will
21112 /// materialize the FP immediate as a load from a constant pool.
// The checks run in order: no VFP3 base means nothing is legal; f16 needs full
// FP16 and an encodable FP16 immediate; f32 is accepted either via the
// FP32-as-FP16 encoding (with full FP16) or the FP32 encoding; f64 needs FP64
// and an encodable FP64 immediate. ForCodeSize is not read in the visible body.
// NOTE(review): the opening line of the signature is missing from this listing.
21114 bool ForCodeSize) const {
21115 if (!Subtarget->hasVFP3Base())
21116 return false;
21117 if (VT == MVT::f16 && Subtarget->hasFullFP16())
21118 return ARM_AM::getFP16Imm(Imm) != -1;
// This f32 check only accepts; on failure control falls to the next f32 check.
21119 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
21120 ARM_AM::getFP32FP16Imm(Imm) != -1)
21121 return true;
21122 if (VT == MVT::f32)
21123 return ARM_AM::getFP32Imm(Imm) != -1;
21124 if (VT == MVT::f64 && Subtarget->hasFP64())
21125 return ARM_AM::getFP64Imm(Imm) != -1;
21126 return false;
21127}
21128
21129 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
21130 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
21131 /// specified in the intrinsic calls.
// For every intrinsic handled below, one IntrinsicInfo is filled in and
// appended to Infos; intrinsics that reach the default case append nothing.
// NOTE(review): the opening of the signature (which declares `Infos` and `I`)
// is missing from this listing, and one line is missing in each of the four
// exclusive load/store cases, presumably the Info.flags assignment -- confirm
// against the upstream file.
21134 MachineFunction &MF, unsigned Intrinsic) const {
21135 IntrinsicInfo Info;
21136 switch (Intrinsic) {
// NEON loads whose pointer is argument 0 and whose last argument is a constant
// alignment.
21137 case Intrinsic::arm_neon_vld1:
21138 case Intrinsic::arm_neon_vld2:
21139 case Intrinsic::arm_neon_vld3:
21140 case Intrinsic::arm_neon_vld4:
21141 case Intrinsic::arm_neon_vld2lane:
21142 case Intrinsic::arm_neon_vld3lane:
21143 case Intrinsic::arm_neon_vld4lane:
21144 case Intrinsic::arm_neon_vld2dup:
21145 case Intrinsic::arm_neon_vld3dup:
21146 case Intrinsic::arm_neon_vld4dup: {
21147 Info.opc = ISD::INTRINSIC_W_CHAIN;
21148 // Conservatively set memVT to the entire set of vectors loaded.
21149 auto &DL = I.getDataLayout();
// The memory type is expressed as a vector of i64 covering the whole result.
21150 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
21151 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21152 Info.ptrVal = I.getArgOperand(0);
21153 Info.offset = 0;
21154 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21155 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21156 // volatile loads with NEON intrinsics not supported
21157 Info.flags = MachineMemOperand::MOLoad;
21158 Infos.push_back(Info);
21159 return;
21160 }
// vld1xN: the pointer is the last argument and the alignment comes from that
// argument's parameter alignment, defaulting to 1.
21161 case Intrinsic::arm_neon_vld1x2:
21162 case Intrinsic::arm_neon_vld1x3:
21163 case Intrinsic::arm_neon_vld1x4: {
21164 Info.opc = ISD::INTRINSIC_W_CHAIN;
21165 // Conservatively set memVT to the entire set of vectors loaded.
21166 auto &DL = I.getDataLayout();
21167 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
21168 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21169 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
21170 Info.offset = 0;
21171 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
21172 // volatile loads with NEON intrinsics not supported
21173 Info.flags = MachineMemOperand::MOLoad;
21174 Infos.push_back(Info);
21175 return;
21176 }
// NEON stores whose pointer is argument 0 and whose last argument is a
// constant alignment.
21177 case Intrinsic::arm_neon_vst1:
21178 case Intrinsic::arm_neon_vst2:
21179 case Intrinsic::arm_neon_vst3:
21180 case Intrinsic::arm_neon_vst4:
21181 case Intrinsic::arm_neon_vst2lane:
21182 case Intrinsic::arm_neon_vst3lane:
21183 case Intrinsic::arm_neon_vst4lane: {
21184 Info.opc = ISD::INTRINSIC_VOID;
21185 // Conservatively set memVT to the entire set of vectors stored.
21186 auto &DL = I.getDataLayout();
21187 unsigned NumElts = 0;
// Sum the sizes of the vector arguments from index 1 on, stopping at the first
// non-vector argument.
21188 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21189 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21190 if (!ArgTy->isVectorTy())
21191 break;
21192 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21193 }
21194 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21195 Info.ptrVal = I.getArgOperand(0);
21196 Info.offset = 0;
21197 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21198 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21199 // volatile stores with NEON intrinsics not supported
21200 Info.flags = MachineMemOperand::MOStore;
21201 Infos.push_back(Info);
21202 return;
21203 }
// vst1xN: same size computation, but the alignment comes from argument 0's
// parameter alignment, defaulting to 1.
21204 case Intrinsic::arm_neon_vst1x2:
21205 case Intrinsic::arm_neon_vst1x3:
21206 case Intrinsic::arm_neon_vst1x4: {
21207 Info.opc = ISD::INTRINSIC_VOID;
21208 // Conservatively set memVT to the entire set of vectors stored.
21209 auto &DL = I.getDataLayout();
21210 unsigned NumElts = 0;
21211 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21212 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21213 if (!ArgTy->isVectorTy())
21214 break;
21215 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21216 }
21217 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21218 Info.ptrVal = I.getArgOperand(0);
21219 Info.offset = 0;
21220 Info.align = I.getParamAlign(0).valueOrOne();
21221 // volatile stores with NEON intrinsics not supported
21222 Info.flags = MachineMemOperand::MOStore;
21223 Infos.push_back(Info);
21224 return;
21225 }
// MVE interleaving loads: memVT is a vector of Factor * 2 i64 elements and the
// alignment is the scalar size in bytes of struct element 1.
21226 case Intrinsic::arm_mve_vld2q:
21227 case Intrinsic::arm_mve_vld4q: {
21228 Info.opc = ISD::INTRINSIC_W_CHAIN;
21229 // Conservatively set memVT to the entire set of vectors loaded.
21230 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
21231 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
21232 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21233 Info.ptrVal = I.getArgOperand(0);
21234 Info.offset = 0;
21235 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21236 // volatile loads with MVE intrinsics not supported
21237 Info.flags = MachineMemOperand::MOLoad;
21238 Infos.push_back(Info);
21239 return;
21240 }
// MVE interleaving stores: as above, with the vector type taken from
// argument 1.
21241 case Intrinsic::arm_mve_vst2q:
21242 case Intrinsic::arm_mve_vst4q: {
21243 Info.opc = ISD::INTRINSIC_VOID;
21244 // Conservatively set memVT to the entire set of vectors stored.
21245 Type *VecTy = I.getArgOperand(1)->getType();
21246 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21247 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21248 Info.ptrVal = I.getArgOperand(0);
21249 Info.offset = 0;
21250 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21251 // volatile stores with MVE intrinsics not supported
21252 Info.flags = MachineMemOperand::MOStore;
21253 Infos.push_back(Info);
21254 return;
21255 }
// MVE gathers and scatters record no pointer value (ptrVal is null) and use an
// alignment of 1.
21256 case Intrinsic::arm_mve_vldr_gather_base:
21257 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21258 Info.opc = ISD::INTRINSIC_W_CHAIN;
21259 Info.ptrVal = nullptr;
21260 Info.memVT = MVT::getVT(I.getType());
21261 Info.align = Align(1);
21262 Info.flags |= MachineMemOperand::MOLoad;
21263 Infos.push_back(Info);
21264 return;
21265 }
// Write-back gathers: the memory type is the first contained type of the
// result.
21266 case Intrinsic::arm_mve_vldr_gather_base_wb:
21267 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21268 Info.opc = ISD::INTRINSIC_W_CHAIN;
21269 Info.ptrVal = nullptr;
21270 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21271 Info.align = Align(1);
21272 Info.flags |= MachineMemOperand::MOLoad;
21273 Infos.push_back(Info);
21274 return;
21275 }
// Offset gathers: the per-element memory width is the constant in argument 2;
// the element count comes from the result type.
21276 case Intrinsic::arm_mve_vldr_gather_offset:
21277 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21278 Info.opc = ISD::INTRINSIC_W_CHAIN;
21279 Info.ptrVal = nullptr;
21280 MVT DataVT = MVT::getVT(I.getType());
21281 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21282 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21283 DataVT.getVectorNumElements());
21284 Info.align = Align(1);
21285 Info.flags |= MachineMemOperand::MOLoad;
21286 Infos.push_back(Info);
21287 return;
21288 }
// Base scatters: the memory type is the type of argument 2.
21289 case Intrinsic::arm_mve_vstr_scatter_base:
21290 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21291 Info.opc = ISD::INTRINSIC_VOID;
21292 Info.ptrVal = nullptr;
21293 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21294 Info.align = Align(1);
21295 Info.flags |= MachineMemOperand::MOStore;
21296 Infos.push_back(Info);
21297 return;
21298 }
// Write-back scatters use INTRINSIC_W_CHAIN, unlike the other scatter cases.
21299 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21300 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21301 Info.opc = ISD::INTRINSIC_W_CHAIN;
21302 Info.ptrVal = nullptr;
21303 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21304 Info.align = Align(1);
21305 Info.flags |= MachineMemOperand::MOStore;
21306 Infos.push_back(Info);
21307 return;
21308 }
// Offset scatters: the data type is argument 2 and the per-element memory
// width is the constant in argument 3.
21309 case Intrinsic::arm_mve_vstr_scatter_offset:
21310 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21311 Info.opc = ISD::INTRINSIC_VOID;
21312 Info.ptrVal = nullptr;
21313 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21314 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21315 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21316 DataVT.getVectorNumElements());
21317 Info.align = Align(1);
21318 Info.flags |= MachineMemOperand::MOStore;
21319 Infos.push_back(Info);
21320 return;
21321 }
// Exclusive loads: the accessed type is the element type attached to
// parameter 0, with its ABI alignment.
21322 case Intrinsic::arm_ldaex:
21323 case Intrinsic::arm_ldrex: {
21324 auto &DL = I.getDataLayout();
21325 Type *ValTy = I.getParamElementType(0);
21326 Info.opc = ISD::INTRINSIC_W_CHAIN;
21327 Info.memVT = MVT::getVT(ValTy);
21328 Info.ptrVal = I.getArgOperand(0);
21329 Info.offset = 0;
21330 Info.align = DL.getABITypeAlign(ValTy);
21332 Infos.push_back(Info);
21333 return;
21334 }
// Exclusive stores: the pointer is argument 1 and carries the element type.
21335 case Intrinsic::arm_stlex:
21336 case Intrinsic::arm_strex: {
21337 auto &DL = I.getDataLayout();
21338 Type *ValTy = I.getParamElementType(1);
21339 Info.opc = ISD::INTRINSIC_W_CHAIN;
21340 Info.memVT = MVT::getVT(ValTy);
21341 Info.ptrVal = I.getArgOperand(1);
21342 Info.offset = 0;
21343 Info.align = DL.getABITypeAlign(ValTy);
21345 Infos.push_back(Info);
21346 return;
21347 }
// 64-bit exclusive stores: the pointer is argument 2; i64 with 8-byte
// alignment.
21348 case Intrinsic::arm_stlexd:
21349 case Intrinsic::arm_strexd:
21350 Info.opc = ISD::INTRINSIC_W_CHAIN;
21351 Info.memVT = MVT::i64;
21352 Info.ptrVal = I.getArgOperand(2);
21353 Info.offset = 0;
21354 Info.align = Align(8);
21356 Infos.push_back(Info);
21357 return;
21358
// 64-bit exclusive loads: the pointer is argument 0; i64 with 8-byte
// alignment.
21359 case Intrinsic::arm_ldaexd:
21360 case Intrinsic::arm_ldrexd:
21361 Info.opc = ISD::INTRINSIC_W_CHAIN;
21362 Info.memVT = MVT::i64;
21363 Info.ptrVal = I.getArgOperand(0);
21364 Info.offset = 0;
21365 Info.align = Align(8);
21367 Infos.push_back(Info);
21368 return;
21369
21370 default:
21371 break;
21372 }
21373}
21374
21375 /// Returns true if it is beneficial to convert a load of a constant
21376 /// to just the constant itself.
// Ty must be an integer type (asserted). The answer is true exactly when its
// primitive size is between 1 and 32 bits inclusive.
// NOTE(review): the opening line of the signature is missing from this listing.
21378 Type *Ty) const {
21379 assert(Ty->isIntegerTy());
21380
21381 unsigned Bits = Ty->getPrimitiveSizeInBits();
21382 if (Bits == 0 || Bits > 32)
21383 return false;
21384 return true;
21385}
21386
// After an early-out, reports true only when Index is 0 or equals the element
// count of ResVT.
// NOTE(review): the opening line of the signature and the condition guarding
// the first `return false` are missing from this listing.
21388 unsigned Index) const {
21390 return false;
21391
21392 return (Index == 0 || Index == ResVT.getVectorNumElements());
21393}
21394
// Emits a data memory barrier through Builder and returns the created
// instruction. With a data barrier available this is an arm_dmb intrinsic for
// Domain (forced to SY on M-class). Without one, ARM-mode v6 targets get an
// arm_mcr intrinsic; any other target is unreachable.
// NOTE(review): the opening line of the signature is missing from this listing.
21396 ARM_MB::MemBOpt Domain) const {
21397 // First, if the target has no DMB, see what fallback we can use.
21398 if (!Subtarget->hasDataBarrier()) {
21399 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21400 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21401 // here.
21402 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
// NOTE(review): operands (15, 0, 0, 7, 10, 5) are presumably the coprocessor
// encoding of the barrier operation -- confirm against the architecture manual.
21403 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21404 Builder.getInt32(0), Builder.getInt32(7),
21405 Builder.getInt32(10), Builder.getInt32(5)};
21406 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21407 } else {
21408 // Instead of using barriers, atomic accesses on these subtargets use
21409 // libcalls.
21410 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21411 }
21412 } else {
21413 // Only a full system barrier exists in the M-class architectures.
21414 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21415 Constant *CDomain = Builder.getInt32(Domain);
21416 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21417 }
21418}
21419
21420 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
// Returns the barrier to place before Inst for ordering Ord, or nullptr when
// none is needed. When a barrier is emitted it is ISHST if the subtarget
// prefers ISHST barriers and ISH otherwise.
// NOTE(review): the opening line of the signature and every `case` label of
// the switch are missing from this listing, so the mapping from orderings to
// branches cannot be read from here.
21422 Instruction *Inst,
21423 AtomicOrdering Ord) const {
21424 switch (Ord) {
21427 llvm_unreachable("Invalid fence: unordered/non-atomic");
21430 return nullptr; // Nothing to do
// An instruction without an atomic store needs no barrier on this path;
// otherwise control falls through to the barrier selection below.
21432 if (!Inst->hasAtomicStore())
21433 return nullptr; // Nothing to do
21434 [[fallthrough]];
21437 if (Subtarget->preferISHSTBarriers())
21438 return makeDMB(Builder, ARM_MB::ISHST);
21439 // FIXME: add a comment with a link to documentation justifying this.
21440 else
21441 return makeDMB(Builder, ARM_MB::ISH);
21442 }
21443 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21444}
21445
// Returns the barrier to place after Inst for ordering Ord: either nullptr or
// an ISH barrier from makeDMB.
// NOTE(review): the opening line of the signature and every `case` label of
// the switch are missing from this listing, so the mapping from orderings to
// branches cannot be read from here.
21447 Instruction *Inst,
21448 AtomicOrdering Ord) const {
21449 switch (Ord) {
21452 llvm_unreachable("Invalid fence: unordered/not-atomic");
21455 return nullptr; // Nothing to do
21459 return makeDMB(Builder, ARM_MB::ISH);
21460 }
21461 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21462}
21463
21464 // Loads and stores less than 64-bits are already atomic; ones above that
21465 // are doomed anyway, so defer to the default libcall and blame the OS when
21466 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21467 // anything for those.
// 64-bit atomic stores are considered available never on M-class, from v7 in
// Thumb mode and from v6 in ARM mode. Only a 64-bit store on such a target
// yields AtomicExpansionKind::Expand.
// NOTE(review): the signature and the second arm of the final conditional are
// missing from this listing.
21470 bool has64BitAtomicStore;
21471 if (Subtarget->isMClass())
21472 has64BitAtomicStore = false;
21473 else if (Subtarget->isThumb())
21474 has64BitAtomicStore = Subtarget->hasV7Ops();
21475 else
21476 has64BitAtomicStore = Subtarget->hasV6Ops();
21477
21478 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21479 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21481}
21482
21483 // Loads and stores less than 64-bits are already atomic; ones above that
21484 // are doomed anyway, so defer to the default libcall and blame the OS when
21485 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21486 // anything for those.
21487 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21488 // guarantee, see DDI0406C ARM architecture reference manual,
21489 // sections A8.8.72-74 LDRD)
// 64-bit atomic loads are considered available never on M-class, from v7 in
// Thumb mode and from v6 in ARM mode. Only a 64-bit load on such a target
// yields AtomicExpansionKind::LLOnly.
// NOTE(review): the signature and the second arm of the final conditional are
// missing from this listing.
21492 bool has64BitAtomicLoad;
21493 if (Subtarget->isMClass())
21494 has64BitAtomicLoad = false;
21495 else if (Subtarget->isThumb())
21496 has64BitAtomicLoad = Subtarget->hasV7Ops();
21497 else
21498 has64BitAtomicLoad = Subtarget->hasV6Ops();
21499
21500 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21501 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21503}
21504
21505 // For the real atomic operations, we have ldrex/strex up to 32 bits,
21506 // and up to 64 bits on the non-M profiles
// Floating-point operations are handled by an early return. Otherwise native
// support requires v8-M baseline on M-class, v7 in Thumb mode or v6 in ARM
// mode, and a size of at most 32 bits on M-class or 64 bits elsewhere.
// NOTE(review): the signature and every return statement are missing from
// this listing, so the expansion kinds chosen cannot be read from here.
21509 if (AI->isFloatingPointOperation())
21511
21512 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21513 bool hasAtomicRMW;
21514 if (Subtarget->isMClass())
21515 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21516 else if (Subtarget->isThumb())
21517 hasAtomicRMW = Subtarget->hasV7Ops();
21518 else
21519 hasAtomicRMW = Subtarget->hasV6Ops();
21520 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21521 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21522 // implement atomicrmw without spilling. If the target address is also on
21523 // the stack and close enough to the spill slot, this can lead to a
21524 // situation where the monitor always gets cleared and the atomic operation
21525 // can never succeed. So at -O0 lower this operation to a CAS loop.
21526 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21529 }
21531}
21532
21533 // Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21534 // bits, and up to 64 bits on the non-M profiles.
// The size is taken from operand 1 of the cmpxchg. The guarded branch is
// taken only when optimization is enabled, the subtarget has the required
// architecture level (v8-M baseline on M-class, v7 in Thumb mode, v6 in ARM
// mode) and the size is within 32 bits on M-class or 64 bits elsewhere.
// NOTE(review): the opening of the signature and both return statements are
// missing from this listing, so the expansion kinds chosen cannot be read
// from here.
21537 const AtomicCmpXchgInst *AI) const {
21538 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21539 // implement cmpxchg without spilling. If the address being exchanged is also
21540 // on the stack and close enough to the spill slot, this can lead to a
21541 // situation where the monitor always gets cleared and the atomic operation
21542 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21543 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21544 bool HasAtomicCmpXchg;
21545 if (Subtarget->isMClass())
21546 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21547 else if (Subtarget->isThumb())
21548 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21549 else
21550 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21551 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21552 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21555}
21556
// Returns the InsertFencesForAtomic setting; the instruction argument is not
// consulted.
// NOTE(review): the opening line of the signature is missing from this listing.
21558 const Instruction *I) const {
21559 return InsertFencesForAtomic;
21560}
21561
// True only when the subtarget is neither ROPI nor RWPI.
// NOTE(review): the signature line is missing from this listing.
21563 // ROPI/RWPI are not supported currently.
21564 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21565}
21566
// When both the SECURITY_CHECK_COOKIE and STACK_CHECK_GUARD libcalls are
// supported, declares in M the cookie global (as a pointer) and the
// cookie-check function (void, taking one pointer), and marks that function's
// first parameter InReg.
// NOTE(review): the opening line of the signature and the statement after the
// `if` block are missing from this listing.
21568 Module &M, const LibcallLoweringInfo &Libcalls) const {
21569 // MSVC CRT provides functionalities for stack protection.
21570 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21571 Libcalls.getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21572
21573 RTLIB::LibcallImpl SecurityCookieVar =
21574 Libcalls.getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21575 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21576 SecurityCookieVar != RTLIB::Unsupported) {
21577 // MSVC CRT has a global variable holding security cookie.
21578 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21579 PointerType::getUnqual(M.getContext()));
21580
21581 // MSVC CRT has a function to validate security cookie.
21582 FunctionCallee SecurityCheckCookie =
21583 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21584 Type::getVoidTy(M.getContext()),
21585 PointerType::getUnqual(M.getContext()));
// The attribute is only added when the callee is a Function.
21586 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21587 F->addParamAttr(0, Attribute::AttrKind::InReg);
21588 }
21589
21591}
21592
// Reports whether a store can be combined with a vector element extract.
// Requires NEON, a non-floating-point vector type, a constant index and a
// total vector width of exactly 64 or 128 bits. Cost is written (as 0) only
// when the answer is true.
// NOTE(review): the opening line of the signature (declaring VectorTy and Idx)
// is missing from this listing.
21594 unsigned &Cost) const {
21595 // If we do not have NEON, vector types are not natively supported.
21596 if (!Subtarget->hasNEON())
21597 return false;
21598
21599 // Floating point values and vector values map to the same register file.
21600 // Therefore, although we could do a store extract of a vector type, this is
21601 // better to leave at float as we have more freedom in the addressing mode for
21602 // those.
21603 if (VectorTy->isFPOrFPVectorTy())
21604 return false;
21605
21606 // If the index is unknown at compile time, this is very expensive to lower
21607 // and it is not possible to combine the store with the extract.
21608 if (!isa<ConstantInt>(Idx))
21609 return false;
21610
21611 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21612 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21613 // We can do a store + vector extract on any vector that fits perfectly in a D
21614 // or Q register.
21615 if (BitWidth == 64 || BitWidth == 128) {
21616 Cost = 0;
21617 return true;
21618 }
21619 return false;
21620}
21621
// Answers false for VORRIMM and VBICIMM nodes. For every other opcode the
// query is forwarded, with the same arguments, to a call whose callee line is
// missing from this listing.
// NOTE(review): the opening line of the signature is missing as well.
21623 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21624 UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
21625 unsigned Opcode = Op.getOpcode();
21626 switch (Opcode) {
21627 case ARMISD::VORRIMM:
21628 case ARMISD::VBICIMM:
21629 return false;
21630 }
21632 Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
21633}
21634
// True when the subtarget has v5T operations and is not Thumb1-only.
// NOTE(review): the signature line is missing from this listing.
21636 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21637}
21638
21640 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21641}
21642
21644 const Instruction &AndI) const {
// Returns true only on subtargets with v7 ops, and only when the mask is at
// most 32 bits wide and is accepted by the immediate-encoding helper for the
// current instruction set.
// NOTE(review): the first signature line is missing from this listing.
21645 if (!Subtarget->hasV7Ops())
21646 return false;
21647
21648 // Sink the `and` instruction only if the mask would fit into a modified
21649 // immediate operand.
// NOTE(review): line 21650, which declares Mask, is missing from this listing;
// the null check below suggests it is the result of a dyn_cast -- confirm.
21651 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21652 return false;
// The width check above makes the narrowing to unsigned lossless.
21653 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
// Thumb2 and non-Thumb2 use different encoders; a result of -1 is treated as
// "not encodable".
21654 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21655 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21656}
21657
21660 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21661 if (Subtarget->hasMinSize() && !getTM().getTargetTriple().isOSWindows())
21664 ExpansionFactor);
21665}
21666
21668 Value *Addr,
21669 AtomicOrdering Ord) const {
// Emits an exclusive-load intrinsic call for Addr through Builder and returns
// a value of type ValueTy. Orderings that are acquire-or-stronger select the
// ldaex/ldaexd intrinsics, all others select ldrex/ldrexd.
// NOTE(review): the first signature line (function name, Builder and ValueTy
// parameters) is missing from this listing.
21670 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21671 bool IsAcquire = isAcquireOrStronger(Ord);
21672
21673 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21674 // intrinsic must return {i32, i32} and we have to recombine them into a
21675 // single i64 here.
21676 if (ValueTy->getPrimitiveSizeInBits() == 64) {
// NOTE(review): line 21677 (the declaration this initializer belongs to,
// used below as `Int`) is missing from this listing.
21678 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21679
21680 Value *LoHi =
21681 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21682
21683 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21684 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
// On big-endian subtargets the two halves are exchanged.
21685 if (!Subtarget->isLittle())
21686 std::swap (Lo, Hi);
// Rebuild the 64-bit value as zext(Lo) | (zext(Hi) << 32).
21687 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21688 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21689 return Builder.CreateOr(
21690 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21691 }
21692
// Non-64-bit case: the intrinsic is overloaded on the pointer type only.
21693 Type *Tys[] = { Addr->getType() };
21694 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21695 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21696
// Record the accessed type on the pointer argument via an elementtype
// attribute, then convert the intrinsic result to ValueTy.
21697 CI->addParamAttr(
21698 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21699 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21700}
21701
21703 IRBuilderBase &Builder) const {
// Emits a call to the arm_clrex intrinsic through Builder. Nothing is emitted
// on subtargets without v7 ops.
// NOTE(review): the line carrying the function name is missing from this
// listing.
21704 if (!Subtarget->hasV7Ops())
21705 return;
21706 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21707}
21708
21710 Value *Val, Value *Addr,
21711 AtomicOrdering Ord) const {
// Emits an exclusive-store intrinsic call that stores Val to Addr and returns
// the call. Orderings that are release-or-stronger select stlex/stlexd, all
// others select strex/strexd.
// NOTE(review): the first signature line (function name and Builder
// parameter) is missing from this listing.
21712 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21713 bool IsRelease = isReleaseOrStronger(Ord);
21714
21715 // Since the intrinsics must have legal type, the i64 intrinsics take two
21716 // parameters: "i32, i32". We must marshal Val into the appropriate form
21717 // before the call.
21718 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
// NOTE(review): line 21719 (the declaration this initializer belongs to,
// used below as `Int`) is missing from this listing.
21720 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21721 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21722
// Split Val into its low and high 32-bit halves.
21723 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21724 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
// On big-endian subtargets the two halves are exchanged.
21725 if (!Subtarget->isLittle())
21726 std::swap(Lo, Hi);
21727 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21728 }
21729
21730 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21731 Type *Tys[] = { Addr->getType() };
// NOTE(review): line 21732, which declares Strex, is missing from this
// listing; presumably it obtains the intrinsic declaration for Int/Tys.
21733
// Val is zero-extended or bitcast to the intrinsic's first parameter type.
21734 CallInst *CI = Builder.CreateCall(
21735 Strex, {Builder.CreateZExtOrBitCast(
21736 Val, Strex->getFunctionType()->getParamType(0)),
21737 Addr});
// Record the stored type on the pointer argument (parameter index 1).
21738 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21739 Val->getType()));
21740 return CI;
21741}
21742
21743
21745 return Subtarget->isMClass();
21746}
21747
21748/// A helper function for determining the number of interleaved accesses we
21749/// will generate when lowering accesses of the given type.
///
/// The result is the bit size of VecTy (per DL) divided by 128, rounded up.
21750unsigned
// NOTE(review): line 21751 (function name and the VecTy parameter) is missing
// from this listing.
21752 const DataLayout &DL) const {
// Adding 127 before the integer division rounds up to whole 128-bit chunks.
21753 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21754}
21755
21757 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21758 const DataLayout &DL) const {
// Returns true when an interleaved access with the given Factor, sub-vector
// type and alignment passes all of the checks below.
// NOTE(review): the line carrying the function name is missing from this
// listing.
21759
21760 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21761 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21762
// Either NEON or MVE integer ops is required.
21763 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21764 return false;
21765
21766 // Ensure the vector doesn't have f16 elements. Even though we could do an
21767 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21768 // f32.
21769 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21770 return false;
// A factor of 3 is rejected under MVE.
21771 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21772 return false;
21773
21774 // Ensure the number of vector elements is greater than 1.
21775 if (VecTy->getNumElements() < 2)
21776 return false;
21777
21778 // Ensure the element type is legal.
21779 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21780 return false;
21781 // And that the alignment is high enough under MVE.
21782 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21783 return false;
21784
21785 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21786 // 128 will be split into multiple interleaved accesses.
// The 64-bit case is accepted only when NEON is available.
21787 if (Subtarget->hasNEON() && VecSize == 64)
21788 return true;
21789 return VecSize % 128 == 0;
21790}
21791
21793 if (Subtarget->hasNEON())
21794 return 4;
21795 if (Subtarget->hasMVEIntegerOps())
21798}
21799
21800/// Lower an interleaved load into a vldN intrinsic.
21801///
21802/// E.g. Lower an interleaved load (Factor = 2):
21803/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21804/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21805/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21806///
21807/// Into:
21808/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21809/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21810/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
///
/// Returns false, before emitting anything, when Load is not a LoadInst or
/// when the shuffle result type is not a legal interleaved access type.
/// Returns true after every shufflevector in Shuffles has had its uses
/// replaced.
// NOTE(review): line 21811 (return type and function name) is missing from
// this listing.
21812 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21813 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21814 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21815 "Invalid interleave factor");
21816 assert(!Shuffles.empty() && "Empty shufflevector input");
21817 assert(Shuffles.size() == Indices.size() &&
21818 "Unmatched number of shufflevectors and indices");
21819
21820 auto *LI = dyn_cast<LoadInst>(Load);
21821 if (!LI)
21822 return false;
// A plain load must come without a lane mask and with all Factor gap bits set.
21823 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21824
21825 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21826 Type *EltTy = VecTy->getElementType();
21827
21828 const DataLayout &DL = LI->getDataLayout();
21829 Align Alignment = LI->getAlign();
21830
21831 // Skip if we do not have NEON and skip illegal vector types. We can
21832 // "legalize" wide vector types into multiple interleaved accesses as long as
21833 // the vector types are divisible by 128.
21834 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21835 return false;
21836
21837 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21838
21839 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21840 // load integer vectors first and then convert to pointer vectors.
21841 if (EltTy->isPointerTy())
21842 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21843
21844 IRBuilder<> Builder(LI);
21845
21846 // The base address of the load.
21847 Value *BaseAddr = LI->getPointerOperand();
21848
21849 if (NumLoads > 1) {
21850 // If we're going to generate more than one load, reset the sub-vector type
21851 // to something legal.
21852 VecTy = FixedVectorType::get(VecTy->getElementType(),
21853 VecTy->getNumElements() / NumLoads);
21854 }
21855
21856 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21857
// Builds one vldN call at the given address: the NEON intrinsics when NEON is
// available, the MVE vld2q/vld4q intrinsics otherwise.
21858 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21859 if (Subtarget->hasNEON()) {
21860 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21861 Type *Tys[] = {VecTy, PtrTy};
// Indexed by Factor - 2: entries are for factors 2, 3 and 4.
21862 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21863 Intrinsic::arm_neon_vld3,
21864 Intrinsic::arm_neon_vld4};
21865
// NOTE(review): line 21866, which declares Ops, is missing from this listing.
21867 Ops.push_back(BaseAddr);
// The NEON form takes the load's alignment as an extra i32 operand.
21868 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21869
21870 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21871 /*FMFSource=*/nullptr, "vldN");
21872 } else {
21873 assert((Factor == 2 || Factor == 4) &&
21874 "expected interleave factor of 2 or 4 for MVE");
21875 Intrinsic::ID LoadInts =
21876 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21877 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21878 Type *Tys[] = {VecTy, PtrTy};
21879
// NOTE(review): line 21880, which declares Ops, is missing from this listing.
21881 Ops.push_back(BaseAddr);
21882 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21883 "vldN");
21884 }
21885 };
21886
21887 // Holds sub-vectors extracted from the load intrinsic return values. The
21888 // sub-vectors are associated with the shufflevector instructions they will
21889 // replace.
// NOTE(review): line 21890, which declares SubVecs, is missing from this
// listing.
21891
21892 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21893 // If we're generating more than one load, compute the base address of
21894 // subsequent loads as an offset from the previous.
// Each vldN covers NumElements * Factor elements of the original wide vector.
21895 if (LoadCount > 0)
21896 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21897 VecTy->getNumElements() * Factor);
21898
21899 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21900
21901 // Replace uses of each shufflevector with the corresponding vector loaded
21902 // by ldN.
21903 for (unsigned i = 0; i < Shuffles.size(); i++) {
21904 ShuffleVectorInst *SV = Shuffles[i];
21905 unsigned Index = Indices[i];
21906
21907 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21908
21909 // Convert the integer vector to pointer vector if the element is pointer.
// NOTE(review): line 21913 (the destination type argument) is missing from
// this listing.
21910 if (EltTy->isPointerTy())
21911 SubVec = Builder.CreateIntToPtr(
21912 SubVec,
21914
21915 SubVecs[SV].push_back(SubVec);
21916 }
21917 }
21918
21919 // Replace uses of the shufflevector instructions with the sub-vectors
21920 // returned by the load intrinsic. If a shufflevector instruction is
21921 // associated with more than one sub-vector, those sub-vectors will be
21922 // concatenated into a single wide vector.
// Only the uses are redirected here; the shufflevectors and the original load
// are not erased in this function.
21923 for (ShuffleVectorInst *SVI : Shuffles) {
21924 auto &SubVec = SubVecs[SVI];
21925 auto *WideVec =
21926 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21927 SVI->replaceAllUsesWith(WideVec);
21928 }
21929
21930 return true;
21931}
21932
21933/// Lower an interleaved store into a vstN intrinsic.
21934///
21935/// E.g. Lower an interleaved store (Factor = 3):
21936/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21937/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21938/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21939///
21940/// Into:
21941/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21942/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21943/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21944/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21945///
21946/// Note that the new shufflevectors will be removed and we'll only generate one
21947/// vst3 instruction in CodeGen.
21948///
21949/// Example for a more general valid mask (Factor 3). Lower:
21950/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21951/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21952/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21953///
21954/// Into:
21955/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21956/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21957/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21958/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
///
/// Returns false, before emitting anything, when Store is not a StoreInst or
/// when the per-lane sub-vector type is not a legal interleaved access type.
/// Returns true once all vstN calls have been emitted.
// NOTE(review): line 21959 (return type, function name and the Store
// parameter) is missing from this listing.
21960 Value *LaneMask,
21961 ShuffleVectorInst *SVI,
21962 unsigned Factor,
21963 const APInt &GapMask) const {
21964 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21965 "Invalid interleave factor");
21966 auto *SI = dyn_cast<StoreInst>(Store);
21967 if (!SI)
21968 return false;
// A plain store must come without a lane mask and with all Factor gap bits
// set.
21969 assert(!LaneMask && GapMask.popcount() == Factor &&
21970 "Unexpected mask on store");
21971
21972 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21973 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21974
// LaneLen is the number of elements each of the Factor sub-vectors holds.
21975 unsigned LaneLen = VecTy->getNumElements() / Factor;
21976 Type *EltTy = VecTy->getElementType();
21977 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21978
21979 const DataLayout &DL = SI->getDataLayout();
21980 Align Alignment = SI->getAlign();
21981
21982 // Skip if we do not have NEON and skip illegal vector types. We can
21983 // "legalize" wide vector types into multiple interleaved accesses as long as
21984 // the vector types are divisible by 128.
21985 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21986 return false;
21987
21988 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21989
21990 Value *Op0 = SVI->getOperand(0);
21991 Value *Op1 = SVI->getOperand(1);
21992 IRBuilder<> Builder(SI);
21993
21994 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21995 // vectors to integer vectors.
21996 if (EltTy->isPointerTy()) {
21997 Type *IntTy = DL.getIntPtrType(EltTy);
21998
21999 // Convert to the corresponding integer vector.
// NOTE(review): line 22001 (the initializer of IntVecTy) is missing from this
// listing.
22000 auto *IntVecTy =
22002 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
22003 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
22004
22005 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
22006 }
22007
22008 // The base address of the store.
22009 Value *BaseAddr = SI->getPointerOperand();
22010
22011 if (NumStores > 1) {
22012 // If we're going to generate more than one store, reset the lane length
22013 // and sub-vector type to something legal.
22014 LaneLen /= NumStores;
22015 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
22016 }
22017
22018 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
22019
22020 auto Mask = SVI->getShuffleMask();
22021
// Emits the store intrinsic call(s) for one group of Factor sub-vectors at
// the given address.
22022 auto createStoreIntrinsic = [&](Value *BaseAddr,
22023 SmallVectorImpl<Value *> &Shuffles) {
22024 if (Subtarget->hasNEON()) {
// Indexed by Factor - 2: entries are for factors 2, 3 and 4.
22025 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
22026 Intrinsic::arm_neon_vst3,
22027 Intrinsic::arm_neon_vst4};
22028 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
22029 Type *Tys[] = {PtrTy, SubVecTy};
22030
// NOTE(review): line 22031, which declares Ops, is missing from this listing.
22032 Ops.push_back(BaseAddr);
22033 append_range(Ops, Shuffles);
// The NEON form takes the store's alignment as a trailing i32 operand.
22034 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
22035 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
22036 } else {
22037 assert((Factor == 2 || Factor == 4) &&
22038 "expected interleave factor of 2 or 4 for MVE");
22039 Intrinsic::ID StoreInts =
22040 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
22041 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
22042 Type *Tys[] = {PtrTy, SubVecTy};
22043
// NOTE(review): line 22044, which declares Ops, is missing from this listing.
22045 Ops.push_back(BaseAddr);
22046 append_range(Ops, Shuffles);
// The MVE form is emitted Factor times; the same operands are reused and only
// the trailing i32 index (0 .. Factor-1) differs between calls.
22047 for (unsigned F = 0; F < Factor; F++) {
22048 Ops.push_back(Builder.getInt32(F));
22049 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
22050 Ops.pop_back();
22051 }
22052 }
22053 };
22054
22055 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
22056 // If we're generating more than one store, we compute the base address of
22057 // subsequent stores as an offset from the previous.
22058 if (StoreCount > 0)
22059 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
22060 BaseAddr, LaneLen * Factor);
22061
22062 SmallVector<Value *, 4> Shuffles;
22063
22064 // Split the shufflevector operands into sub vectors for the new vstN call.
22065 for (unsigned i = 0; i < Factor; i++) {
// IdxI is the mask position of the first element of lane i in this store.
22066 unsigned IdxI = StoreCount * LaneLen * Factor + i;
22067 if (Mask[IdxI] >= 0) {
22068 Shuffles.push_back(Builder.CreateShuffleVector(
22069 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
22070 } else {
// The first element of this lane is undef: look for a later defined element
// of the lane and derive the lane's start index from it.
// NOTE(review): IdxI and IdxJ both already include the
// StoreCount * LaneLen * Factor offset, so for StoreCount > 0 the index
// IdxJ * Factor + IdxI looks larger than the intended
// "offset + j * Factor + i", and subtracting IdxJ rather than j shifts
// StartMask as well. Confirm against the mask length before relying on this
// path with NumStores > 1.
22071 unsigned StartMask = 0;
22072 for (unsigned j = 1; j < LaneLen; j++) {
22073 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
22074 if (Mask[IdxJ * Factor + IdxI] >= 0) {
22075 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
22076 break;
22077 }
22078 }
22079 // Note: If all elements in a chunk are undefs, StartMask=0!
22080 // Note: Filling undef gaps with random elements is ok, since
22081 // those elements were being written anyway (with undefs).
22082 // In the case of all undefs we're defaulting to using elems from 0
22083 // Note: StartMask cannot be negative, it's checked in
22084 // isReInterleaveMask
22085 Shuffles.push_back(Builder.CreateShuffleVector(
22086 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
22087 }
22088 }
22089
22090 createStoreIntrinsic(BaseAddr, Shuffles);
22091 }
22092 return true;
22093}
22094
22102
22104 uint64_t &Members) {
// Recursively classifies Ty. Base accumulates the single base kind seen so
// far (float, double, 64-bit vector or 128-bit vector) and Members the number
// of base-typed members. Returns false as soon as a second base kind or an
// unsupported vector size is found; otherwise, for non-vector types, the
// result is true only when Members ends up in the range 1..4.
// NOTE(review): the first signature line (function name, Ty and Base
// parameters) is missing from this listing.
22105 if (auto *ST = dyn_cast<StructType>(Ty)) {
// Struct: every field must itself qualify; member counts are summed.
22106 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
22107 uint64_t SubMembers = 0;
22108 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
22109 return false;
22110 Members += SubMembers;
22111 }
22112 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
// Array: classify the element type once and scale by the element count.
22113 uint64_t SubMembers = 0;
22114 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
22115 return false;
22116 Members += SubMembers * AT->getNumElements();
22117 } else if (Ty->isFloatTy()) {
22118 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
22119 return false;
22120 Members = 1;
22121 Base = HA_FLOAT;
22122 } else if (Ty->isDoubleTy()) {
22123 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
22124 return false;
22125 Members = 1;
22126 Base = HA_DOUBLE;
22127 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
// Vector: counts as one member and must match any previously seen vector
// size. Every case of the switch returns directly.
22128 Members = 1;
22129 switch (Base) {
22130 case HA_FLOAT:
22131 case HA_DOUBLE:
22132 return false;
22133 case HA_VECT64:
22134 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
22135 case HA_VECT128:
22136 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
22137 case HA_UNKNOWN:
22138 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
22139 case 64:
22140 Base = HA_VECT64;
22141 return true;
22142 case 128:
22143 Base = HA_VECT128;
22144 return true;
22145 default:
22146 return false;
22147 }
22148 }
22149 }
22150
// Any other type leaves Members untouched, so a caller that passed in 0 gets
// false here.
22151 return (Members > 0 && Members <= 4);
22152}
22153
22154/// Return the correct alignment for the current calling convention.
///
/// Non-vector types get the data layout's ABI alignment unchanged. Vector
/// types get the smaller of their ABI alignment and the stack alignment.
// NOTE(review): line 22155 (return type and function name) is missing from
// this listing.
22156 Type *ArgTy, const DataLayout &DL) const {
22157 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
22158 if (!ArgTy->isVectorTy())
22159 return ABITypeAlign;
22160
22161 // Avoid over-aligning vector parameters. It would require realigning the
22162 // stack and waste space for no real benefit.
22163 MaybeAlign StackAlign = DL.getStackAlignment();
// The data layout is required to specify a stack alignment on this path.
22164 assert(StackAlign && "data layout string is missing stack alignment");
22165 return std::min(ABITypeAlign, *StackAlign);
22166}
22167
22168/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
22169/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
22170/// passing according to AAPCS rules.
// NOTE(review): line 22171 (return type and function name) is missing from
// this listing.
22172 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
22173 const DataLayout &DL) const {
// Only one effective calling convention is eligible; all others get false.
// NOTE(review): line 22175 (the right-hand side of this comparison) is missing
// from this listing.
22174 if (getEffectiveCallingConv(CallConv, isVarArg) !=
22176 return false;
22177
// NOTE(review): line 22178, which declares Base, is missing from this listing.
22179 uint64_t Members = 0;
22180 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
22181 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
22182
// NOTE(review): this accepts arrays of any integer element width, not only the
// i32/i64 arrays named in the header comment -- confirm which is intended.
22183 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
22184 return IsHA || IsIntArray;
22185}
22186
22188 const Constant *PersonalityFn) const {
22189 // Platforms which do not use SjLj EH may return values in these registers
22190 // via the personality function.
22192 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
22193}
22194
22196 const Constant *PersonalityFn) const {
22197 // Platforms which do not use SjLj EH may return values in these registers
22198 // via the personality function.
22200 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
22201}
22202
22203void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
22204 // Update IsSplitCSR in ARMFunctionInfo.
22205 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
22206 AFI->setIsSplitCSR(true);
22207}
22208
22209void ARMTargetLowering::insertCopiesSplitCSR(
22210 MachineBasicBlock *Entry,
22211 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
22212 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
22213 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
22214 if (!IStart)
22215 return;
22216
22217 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22218 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
22219 MachineBasicBlock::iterator MBBI = Entry->begin();
22220 for (const MCPhysReg *I = IStart; *I; ++I) {
22221 const TargetRegisterClass *RC = nullptr;
22222 if (ARM::GPRRegClass.contains(*I))
22223 RC = &ARM::GPRRegClass;
22224 else if (ARM::DPRRegClass.contains(*I))
22225 RC = &ARM::DPRRegClass;
22226 else
22227 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
22228
22229 Register NewVR = MRI->createVirtualRegister(RC);
22230 // Create copy from CSR to a virtual register.
22231 // FIXME: this currently does not emit CFI pseudo-instructions, it works
22232 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
22233 // nounwind. If we want to generalize this later, we may need to emit
22234 // CFI pseudo-instructions.
22235 assert(Entry->getParent()->getFunction().hasFnAttribute(
22236 Attribute::NoUnwind) &&
22237 "Function should be nounwind in insertCopiesSplitCSR!");
22238 Entry->addLiveIn(*I);
22239 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
22240 .addReg(*I);
22241
22242 // Insert the copy-back instructions right before the terminator.
22243 for (auto *Exit : Exits)
22244 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
22245 TII->get(TargetOpcode::COPY), *I)
22246 .addReg(NewVR);
22247 }
22248}
22249
22254
22256 return Subtarget->hasMVEIntegerOps();
22257}
22258
// NOTE(review): the signature of this function (lines 22259-22260) is missing
// from this listing; the body starts below.
// Only fixed-width vector types qualify.
22261 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22262 if (!VTy)
22263 return false;
22264
22265 auto *ScalarTy = VTy->getScalarType();
22266 unsigned NumElements = VTy->getNumElements();
22267
// The total width must be a power of two and at least 128 bits.
22268 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22269 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22270 return false;
22271
22272 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22273 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22274 return Subtarget->hasMVEFloatOps();
22275
// NOTE(review): line 22276 (the condition guarding this return) is missing
// from this listing.
22277 return false;
22278
// Remaining case: 8-, 16- or 32-bit integer elements with MVE integer ops.
22279 return Subtarget->hasMVEIntegerOps() &&
22280 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22281 ScalarTy->isIntegerTy(32));
22282}
22283
22285 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
22286 return RCRegs;
22287}
22288
22291 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22292 Value *Accumulator) const {
// Emits the MVE complex-arithmetic intrinsic call for the requested operation
// and rotation through B. Vectors wider than 128 bits are split in half,
// lowered recursively and re-joined. Returns nullptr for an operation or
// rotation that is not handled below.
// NOTE(review): the first signature lines (22289-22290: function name, B and
// OperationType parameters) are missing from this listing.
22293
// NOTE(review): line 22294, which declares Ty, is missing from this listing.
22295
22296 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22297
22298 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22299
22300 if (TyWidth > 128) {
// SplitSeqVec holds 0 .. NumElements-1; its first and second halves serve as
// shuffle masks selecting the lower and upper half of each input.
22301 int Stride = Ty->getNumElements() / 2;
22302 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22303 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22304 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22305 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22306
22307 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22308 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22309 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22310 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22311 Value *LowerSplitAcc = nullptr;
22312 Value *UpperSplitAcc = nullptr;
22313
// The accumulator is optional and is split the same way when present.
22314 if (Accumulator) {
22315 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22316 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22317 }
22318
22319 auto *LowerSplitInt = createComplexDeinterleavingIR(
22320 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22321 auto *UpperSplitInt = createComplexDeinterleavingIR(
22322 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22323
// Concatenate the two half-width results back into a full-width vector.
// NOTE(review): the recursive calls can return nullptr (see the end of this
// function); that case is not checked before the shuffle -- confirm callers
// never reach it.
22324 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22325 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22326 }
22327
22328 auto *IntTy = Type::getInt32Ty(B.getContext());
22329
22330 ConstantInt *ConstRotation = nullptr;
22331 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
// The rotation enum value is passed to the intrinsic as an i32 constant.
22332 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22333
// With an accumulator use vcmlaq, otherwise vcmulq. Note the operand order:
// InputB is passed before InputA.
22334 if (Accumulator)
22335 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22336 {ConstRotation, Accumulator, InputB, InputA});
22337 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22338 {ConstRotation, InputB, InputA});
22339 }
22340
22341 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22342 // 1 means the value is not halved.
22343 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22344
// NOTE(review): lines 22345 and 22347 (the conditions selecting between the
// two assignments below) are missing from this listing.
22346 ConstRotation = ConstantInt::get(IntTy, 0);
22348 ConstRotation = ConstantInt::get(IntTy, 1);
22349
22350 if (!ConstRotation)
22351 return nullptr; // Invalid rotation for arm_mve_vcaddq
22352
22353 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22354 {ConstHalving, ConstRotation, InputA, InputB});
22355 }
22356
22357 return nullptr;
22358}
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static const MCPhysReg GPRArgRegs[]
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static SDValue matchCSET(unsigned &Opcode, bool &InvertCond, SDValue TrueVal, SDValue FalseVal, const ARMSubtarget *Subtarget)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getInvertedARMCondCode(SDValue ARMcc, SelectionDAG &DAG)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static int getNegationCost(SDValue Op)
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static bool isLegalLogicalImmediate(unsigned Imm, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformORCombineToShiftInsert(SelectionDAG &DAG, SDValue AndOp, SDValue ShiftOp, EVT VT, SDLoc dl)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
This file defines a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Function Alias Analysis false
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, dxil::ResourceTypeInfo &RTI)
static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI, dxil::ResourceTypeInfo &RTI)
This file defines the DenseMap class.
static bool isSigned(unsigned Opcode)
#define Check(C,...)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
nvptx lower args
uint64_t High
uint64_t IntrinsicInst * II
PowerPC Reduce CR logical Operation
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
SI Lower i1 Copies
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static bool isIntrinsic(const CallBase &Call, Intrinsic::ID ID)
The Input class is used to parse a YAML document into in-memory structs and vectors.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
LLVM_ABI bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5854
APInt bitcastToAPInt() const
Definition APFloat.h:1430
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1391
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1076
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
void setBit(unsigned BitPosition)
Set the bit whose position is given by "BitPosition" to 1.
Definition APInt.h:1353
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1208
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1118
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1621
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:652
unsigned logBase2() const
Definition APInt.h:1784
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:865
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:858
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1679
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
const ARMBaseRegisterInfo & getRegisterInfo() const
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode represented by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type SrcTy to type DstTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from Ty1 to Ty2 is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
bool hasAndNotCompare(SDValue V) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
An instruction that atomically checks whether a specified value is in a memory location,...
An instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:1082
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:872
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
MachineConstantPoolValue * getMachineCPVal() const
const Constant * getConstVal() const
LLVM_ABI Type * getType() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:217
bool isBigEndian() const
Definition DataLayout.h:218
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:250
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
StringRef getInternalSymbolPrefix() const
Definition DataLayout.h:308
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
unsigned size() const
Definition DenseMap.h:174
bool empty() const
Definition DenseMap.h:173
iterator begin()
Definition DenseMap.h:139
iterator end()
Definition DenseMap.h:143
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
arg_iterator arg_begin()
Definition Function.h:868
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:695
const Argument * const_arg_iterator
Definition Function.h:74
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:229
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:724
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
static bool isWeakForLinker(LinkageTypes Linkage)
Whether the definition of this global may be replaced at link time.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2868
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
Tracks which library functions to use for a particular subtarget.
CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const
Get the CallingConv that should be used for the specified libcall.
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Return the lowering's selection of implementation call for Call.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
LLVM_ABI unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there is any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
const LibcallLoweringInfo & getLibcalls() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
const unsigned char * bytes_end() const
Definition StringRef.h:125
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
constexpr const char * data() const
Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:138
const unsigned char * bytes_begin() const
Definition StringRef.h:122
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:479
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
virtual void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit algorithm.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return true if N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::LibcallImpl LibcallImpl, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
void setTypeIdForCallsiteInfo(const CallBase *CB, MachineFunction &MF, MachineFunction::CallSiteInfo &CSInfo) const
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:460
bool isOSWindows() const
Tests whether the OS is Windows.
Definition Triple.h:699
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:36
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering)
const unsigned FPReservedBits
const unsigned RoundingBitsPos
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:823
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKRESTORE
STACKRESTORE has two operands: an input chain and a pointer to restore to. It returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:511
@ SET_FPENV
Sets the current floating-point environment.
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:168
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:538
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:783
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ RESET_FPENV
Set floating-point environment to default state.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:857
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:172
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:997
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:438
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:792
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:979
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:715
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ BR
Control flow instructions. These all have token chains.
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:831
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:800
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:796
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ GET_ROUNDING
Returns current rounding mode: -1: Undefined; 0: Round to 0; 1: Round to nearest, ties to even; 2: Round to ...
Definition ISDOpcodes.h:974
@ STRICT_FP_TO_FP16
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ STRICT_FP16_TO_FP
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:139
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:815
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:892
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:809
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:464
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:735
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:710
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:427
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:791
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:458
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:949
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:162
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:837
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:722
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:558
@ Length
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2115
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Define
Register definition.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:315
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1529
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2018
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
UndefPoisonKind
Enumeration to track whether we are interested in Undef, Poison, or both.
Definition UndefPoison.h:20
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:90
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:323
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
ElementCount getVectorElementCount() const
Definition ValueTypes.h:373
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:494
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:382
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:501
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
EVT changeVectorElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:98
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:230
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404
bool isFixedLengthVector() const
Definition ValueTypes.h:199
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:55
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:484
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:225
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:325
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:136
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...