1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
122STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
123STATISTIC(NumConstpoolPromoted,
124 "Number of constants with their storage promoted into constant pools");
125
126static cl::opt<bool>
127ARMInterworking("arm-interworking", cl::Hidden,
128 cl::desc("Enable / disable ARM interworking (for debugging only)"),
129 cl::init(true));
130
132 "arm-promote-constant", cl::Hidden,
133 cl::desc("Enable / disable promotion of unnamed_addr constants into "
134 "constant pools"),
135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137 "arm-promote-constant-max-size", cl::Hidden,
138 cl::desc("Maximum size of constant to promote into a constant pool"),
139 cl::init(64));
141 "arm-promote-constant-max-total", cl::Hidden,
142 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143 cl::init(128));
144
146MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148 cl::init(2));
149
151 "arm-max-base-updates-to-check", cl::Hidden,
152 cl::desc("Maximum number of base-updates to check generating postindex."),
153 cl::init(64));
154
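// Note (illustrative, not part of the original source): these are backend
// cl::opt flags, so when driving the compiler through clang they are passed
// with -mllvm, e.g. a hypothetical invocation such as
//   clang --target=arm-none-eabi -O2 -mllvm -arm-promote-constant=true foo.c
// or directly via llc:
//   llc -mtriple=armv7a-none-eabi -mve-max-interleave-factor=4 foo.ll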
155/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
156constexpr MVT FlagsVT = MVT::i32;
157
158// The APCS parameter registers.
159static const MCPhysReg GPRArgRegs[] = {
160 ARM::R0, ARM::R1, ARM::R2, ARM::R3
161};
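// For example, under AAPCS an i32 argument in position 0..3 lands in R0..R3
// and later arguments go on the stack, while an 8-byte aligned i64/f64
// (soft-float) argument is placed in an even/odd pair such as R0+R1. The
// precise rules live in the CC_ARM_* tables generated from ARMCallingConv.td;
// this array is just the raw list of integer argument registers, used for
// things like spilling the varargs registers.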
162
163static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg,
164 SelectionDAG &DAG, const SDLoc &DL) {
165 assert(Arg.ArgVT.isScalarInteger());
166 assert(Arg.ArgVT.bitsLT(MVT::i32));
167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
168 SDValue Ext =
169 DAG.getNode(Arg.Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
170 MVT::i32, Trunc);
171 return Ext;
172}
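// Illustrative example of the transform above: for a CMSE non-secure call
// whose callee nominally returns a sign-extended i8 in r0, the caller cannot
// trust that the extension was actually performed, so the value is rebuilt as
//   t1: i8  = truncate t0
//   t2: i32 = sign_extend t1   (or zero_extend, per the argument flags)
// re-establishing the guaranteed extension on the caller's side.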
173
174void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
175 if (VT != PromotedLdStVT) {
176 setOperationAction(ISD::LOAD, VT, Promote);
177 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
178
179 setOperationAction(ISD::STORE, VT, Promote);
180 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
181 }
182
183 MVT ElemTy = VT.getVectorElementType();
184 if (ElemTy != MVT::f64)
188 if (ElemTy == MVT::i32) {
193 } else {
198 }
207 if (VT.isInteger()) {
211 }
212
213 // Neon does not support vector divide/remainder operations.
222
223 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
224 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
226 setOperationAction(Opcode, VT, Legal);
227 if (!VT.isFloatingPoint())
228 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
229 setOperationAction(Opcode, VT, Legal);
230}
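// Sketch of the promotion above: for D-register NEON types PromotedLdStVT is
// f64, so a load of, say, v8i8 is selected as an f64-typed load followed by a
// bitcast back to v8i8. This way only one 64-bit load/store pattern is needed
// per register width instead of one per element type.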
231
232void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
233 addRegisterClass(VT, &ARM::DPRRegClass);
234 addTypeForNEON(VT, MVT::f64);
235}
236
237void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
238 addRegisterClass(VT, &ARM::DPairRegClass);
239 addTypeForNEON(VT, MVT::v2f64);
240}
241
242void ARMTargetLowering::setAllExpand(MVT VT) {
243 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
244 setOperationAction(Opc, VT, Expand);
245
246 // We support these really simple operations even on types where all
247 // the actual arithmetic has to be broken down into simpler
248 // operations or turned into library calls.
249 setOperationAction(ISD::BITCAST, VT, Legal);
250 setOperationAction(ISD::LOAD, VT, Legal);
251 setOperationAction(ISD::STORE, VT, Legal);
253}
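// In effect, after setAllExpand(MVT::v2i64) (for example) every operation on
// v2i64 is broken down or turned into a libcall except plain bitcast, load and
// store, which is all that is needed to move such values through Q registers.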
254
255void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
256 LegalizeAction Action) {
257 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
258 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
260}
261
262void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
263 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
264
265 for (auto VT : IntTypes) {
266 addRegisterClass(VT, &ARM::MQPRRegClass);
280 setOperationAction(ISD::MLOAD, VT, Custom);
281 setOperationAction(ISD::MSTORE, VT, Legal);
296
297 // No native support for these.
307
308 // Vector reductions
309 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
310 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
311 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
312 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
313 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
314 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
315 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
316 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
317 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
318
319 if (!HasMVEFP) {
324 } else {
327 }
328
329 // Pre and Post inc are supported on loads and stores
330 for (unsigned im = (unsigned)ISD::PRE_INC;
331 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
336 }
337 }
338
339 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
340 for (auto VT : FloatTypes) {
341 addRegisterClass(VT, &ARM::MQPRRegClass);
342 if (!HasMVEFP)
343 setAllExpand(VT);
344
345 // These are legal or custom whether we have MVE.fp or not
354 setOperationAction(ISD::MLOAD, VT, Custom);
355 setOperationAction(ISD::MSTORE, VT, Legal);
358
359 // Pre and Post inc are supported on loads and stores
360 for (unsigned im = (unsigned)ISD::PRE_INC;
361 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
366 }
367
368 if (HasMVEFP) {
369 setOperationAction(ISD::FMINNUM, VT, Legal);
370 setOperationAction(ISD::FMAXNUM, VT, Legal);
371 setOperationAction(ISD::FROUND, VT, Legal);
372 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
373 setOperationAction(ISD::FRINT, VT, Legal);
374 setOperationAction(ISD::FTRUNC, VT, Legal);
375 setOperationAction(ISD::FFLOOR, VT, Legal);
376 setOperationAction(ISD::FCEIL, VT, Legal);
377 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
378 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
379 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
380 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
381
382 // No native support for these.
385 setOperationAction(ISD::FSQRT, VT, Expand);
386 setOperationAction(ISD::FSIN, VT, Expand);
387 setOperationAction(ISD::FCOS, VT, Expand);
388 setOperationAction(ISD::FTAN, VT, Expand);
389 setOperationAction(ISD::FPOW, VT, Expand);
390 setOperationAction(ISD::FLOG, VT, Expand);
391 setOperationAction(ISD::FLOG2, VT, Expand);
392 setOperationAction(ISD::FLOG10, VT, Expand);
393 setOperationAction(ISD::FEXP, VT, Expand);
394 setOperationAction(ISD::FEXP2, VT, Expand);
395 setOperationAction(ISD::FEXP10, VT, Expand);
396 setOperationAction(ISD::FNEARBYINT, VT, Expand);
397 }
398 }
399
400 // Custom-expand smaller-than-legal vector reductions to prevent false zero
401 // elements from being added.
402 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
403 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
404 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
405 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
406 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
407 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
408 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
409 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
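 // Rough example of the problem: a v4f16 fadd reduction would otherwise be
 // widened to v8f16, and the four padding lanes would take part in the
 // accumulation. The custom lowering instead reduces only the lanes that were
 // present in the original vector.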
410
411 // We 'support' these types up to bitcast/load/store level, regardless of
412 // MVE integer-only / float support. Only FP data processing on the FP
413 // vector types is inhibited when MVE is integer-only.
414 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
415 for (auto VT : LongTypes) {
416 addRegisterClass(VT, &ARM::MQPRRegClass);
417 setAllExpand(VT);
423 }
425
426 // We can do bitwise operations on v2i64 vectors
427 setOperationAction(ISD::AND, MVT::v2i64, Legal);
428 setOperationAction(ISD::OR, MVT::v2i64, Legal);
429 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
430
431 // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to v4i32.
432 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
433 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
435
436 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
442
443 // Some truncating stores are legal too.
444 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
445 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
446 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
447
448 // Pre and Post inc on these are legal, given the correct extends
449 for (unsigned im = (unsigned)ISD::PRE_INC;
450 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
451 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
456 }
457 }
458
459 // Predicate types
460 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
461 for (auto VT : pTypes) {
462 addRegisterClass(VT, &ARM::VCCRRegClass);
471 setOperationAction(ISD::LOAD, VT, Custom);
472 setOperationAction(ISD::STORE, VT, Custom);
477
478 if (!HasMVEFP) {
483 }
484 }
488 setOperationAction(ISD::OR, MVT::v2i1, Expand);
494
503}
504
505const ARMBaseTargetMachine &ARMTargetLowering::getTM() const {
506 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
507}
508
509ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
510 const ARMSubtarget &STI)
511 : TargetLowering(TM_), Subtarget(&STI),
512 RegInfo(Subtarget->getRegisterInfo()),
513 Itins(Subtarget->getInstrItineraryData()) {
514 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
515
518
519 const Triple &TT = TM.getTargetTriple();
520
521 if (TT.isOSBinFormatMachO()) {
522 // Uses VFP for Thumb libfuncs if available.
523 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
524 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
525 // clang-format off
526 static const struct {
527 const RTLIB::Libcall Op;
528 const RTLIB::LibcallImpl Impl;
529 } LibraryCalls[] = {
530 // Single-precision floating-point arithmetic.
531 { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
532 { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
533 { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
534 { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
535
536 // Double-precision floating-point arithmetic.
537 { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
538 { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
539 { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
540 { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
541
542 // Single-precision comparisons.
543 { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
544 { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
545 { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
546 { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
547 { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
548 { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
549 { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp },
550
551 // Double-precision comparisons.
552 { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
553 { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
554 { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
555 { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
556 { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
557 { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
558 { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp },
559
560 // Floating-point to integer conversions.
561 // i64 conversions are done via library routines even when generating VFP
562 // instructions, so use the same ones.
563 { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
564 { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
565 { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
566 { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
567
568 // Conversions between floating types.
569 { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
570 { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
571
572 // Integer to floating-point conversions.
573 // i64 conversions are done via library routines even when generating VFP
574 // instructions, so use the same ones.
575 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
576 // e.g., __floatunsidf vs. __floatunssidfvfp.
577 { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
578 { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
579 { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
580 { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
581 };
582 // clang-format on
583
584 for (const auto &LC : LibraryCalls)
585 setLibcallImpl(LC.Op, LC.Impl);
586 }
587 }
588
589 if (Subtarget->isThumb1Only())
590 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
591 else
592 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
593
594 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
595 Subtarget->hasFPRegs()) {
596 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
597 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
598
603
604 if (!Subtarget->hasVFP2Base()) {
605 setAllExpand(MVT::f32);
606 } else {
609 setOperationAction(Op, MVT::f32, Legal);
610 }
611 if (!Subtarget->hasFP64()) {
612 setAllExpand(MVT::f64);
613 } else {
616 setOperationAction(Op, MVT::f64, Legal);
617 }
618 }
619
620 if (Subtarget->hasFullFP16()) {
621 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
622 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
623 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
624
625 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
626 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
627 }
628
629 if (Subtarget->hasBF16()) {
630 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
631 setAllExpand(MVT::bf16);
632 if (!Subtarget->hasFullFP16())
633 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
634 } else {
635 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
636 setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand);
637 setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom);
638 setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom);
639 }
640
642 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
643 setTruncStoreAction(VT, InnerVT, Expand);
644 addAllExtLoads(VT, InnerVT, Expand);
645 }
646
649
651 }
652
653 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
655
656 if (!Subtarget->hasV8_1MMainlineOps())
658
659 if (!Subtarget->isThumb1Only())
661
664
667
668 if (Subtarget->hasMVEIntegerOps())
669 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
670
671 // Combine low-overhead loop intrinsics so that we can lower i1 types.
672 if (Subtarget->hasLOB()) {
673 setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
674 }
675
676 if (Subtarget->hasNEON()) {
677 addDRTypeForNEON(MVT::v2f32);
678 addDRTypeForNEON(MVT::v8i8);
679 addDRTypeForNEON(MVT::v4i16);
680 addDRTypeForNEON(MVT::v2i32);
681 addDRTypeForNEON(MVT::v1i64);
682
683 addQRTypeForNEON(MVT::v4f32);
684 addQRTypeForNEON(MVT::v2f64);
685 addQRTypeForNEON(MVT::v16i8);
686 addQRTypeForNEON(MVT::v8i16);
687 addQRTypeForNEON(MVT::v4i32);
688 addQRTypeForNEON(MVT::v2i64);
689
690 if (Subtarget->hasFullFP16()) {
691 addQRTypeForNEON(MVT::v8f16);
692 addDRTypeForNEON(MVT::v4f16);
693 }
694
695 if (Subtarget->hasBF16()) {
696 addQRTypeForNEON(MVT::v8bf16);
697 addDRTypeForNEON(MVT::v4bf16);
698 }
699 }
700
701 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
702 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
703 // none of Neon, MVE or VFP supports any arithmetic operations on it.
704 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
705 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
706 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
707 // FIXME: Code duplication: FDIV and FREM are always expanded, see
708 // ARMTargetLowering::addTypeForNEON method for details.
709 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
710 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
711 // FIXME: Create unittest.
712 // In other words, find a case where "copysign" appears in the DAG with vector
713 // operands.
715 // FIXME: Code duplication: SETCC has custom operation action, see
716 // ARMTargetLowering::addTypeForNEON method for details.
718 // FIXME: Create unittest for FNEG and for FABS.
719 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
720 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
721 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
722 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
723 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
724 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
725 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
726 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
727 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
728 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
729 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
730 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
731 setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);
732 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
733 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
734 setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
735 setOperationAction(ISD::FROUNDEVEN, MVT::v2f64, Expand);
736 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
737 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
738 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
739 }
740
741 if (Subtarget->hasNEON()) {
742 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are
743 // natively supported for v4f32.
744 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
745 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
746 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
747 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
748 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
749 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
750 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
751 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
752 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
753 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
754 setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);
755 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
756 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
757 setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
758 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Expand);
759 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
760 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
761
762 // Mark v2f32 intrinsics.
763 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
764 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
765 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
766 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
767 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
768 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
769 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
770 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
771 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
772 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
773 setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);
774 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
775 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
776 setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
777 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Expand);
778 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
779 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
780
781 for (ISD::NodeType Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
782 ISD::FRINT, ISD::FTRUNC, ISD::FROUNDEVEN}) {
783 setOperationAction(Op, MVT::v4f16, Expand);
784 setOperationAction(Op, MVT::v8f16, Expand);
785 }
786
787 // Neon does not support some operations on v1i64 and v2i64 types.
788 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
789 // Custom handling for some quad-vector types to detect VMULL.
790 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
791 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
792 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
793 // Custom handling for some vector types to avoid expensive expansions
794 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
796 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
798 // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
799 // a destination type that is wider than the source, nor does
800 // it have a FP_TO_[SU]INT instruction with a narrower destination than
801 // source.
810
812 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
813
814 // NEON does not have a single-instruction CTPOP for vectors with element
815 // types wider than 8 bits. However, custom lowering can leverage the
816 // v8i8/v16i8 vcnt instruction.
823
824 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
825 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
826
827 // NEON does not have a single-instruction CTTZ for vectors.
829 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
830 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
831 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
832
833 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
834 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
835 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
836 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
837
842
847
851 }
852
853 // NEON only has FMA instructions as of VFP4.
854 if (!Subtarget->hasVFP4Base()) {
855 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
856 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
857 }
858
860 ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
861
862 // It is legal to extload from v4i8 to v4i16 or v4i32.
863 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
864 MVT::v2i32}) {
869 }
870 }
871
872 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
873 MVT::v4i32}) {
874 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
875 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
876 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
877 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
878 }
879 }
880
881 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
887 ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
888 }
889 if (Subtarget->hasMVEIntegerOps()) {
891 ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
892 ISD::SETCC});
893 }
894 if (Subtarget->hasMVEFloatOps()) {
896 }
897
898 if (!Subtarget->hasFP64()) {
899 // When targeting a floating-point unit with only single-precision
900 // operations, f64 is legal for the few double-precision instructions which
901 // are present. However, no double-precision operations other than moves,
902 // loads and stores are provided by the hardware.
911 setOperationAction(ISD::FNEG, MVT::f64, Expand);
912 setOperationAction(ISD::FABS, MVT::f64, Expand);
913 setOperationAction(ISD::FSQRT, MVT::f64, Expand);
914 setOperationAction(ISD::FSIN, MVT::f64, Expand);
915 setOperationAction(ISD::FCOS, MVT::f64, Expand);
916 setOperationAction(ISD::FPOW, MVT::f64, Expand);
917 setOperationAction(ISD::FLOG, MVT::f64, Expand);
918 setOperationAction(ISD::FLOG2, MVT::f64, Expand);
919 setOperationAction(ISD::FLOG10, MVT::f64, Expand);
920 setOperationAction(ISD::FEXP, MVT::f64, Expand);
921 setOperationAction(ISD::FEXP2, MVT::f64, Expand);
922 setOperationAction(ISD::FEXP10, MVT::f64, Expand);
923 setOperationAction(ISD::FCEIL, MVT::f64, Expand);
924 setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
925 setOperationAction(ISD::FRINT, MVT::f64, Expand);
926 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Expand);
927 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
928 setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
941 }
942
943 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
944 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
946 if (Subtarget->hasFullFP16()) {
949 }
950 }
951
952 if (!Subtarget->hasFP16()) {
953 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
955 }
956
957 computeRegisterProperties(Subtarget->getRegisterInfo());
958
959 // ARM does not have floating-point extending loads.
960 for (MVT VT : MVT::fp_valuetypes()) {
961 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
962 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
963 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
964 }
965
966 // ... or truncating stores
967 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
968 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
969 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
970 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
971 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
972
973 // ARM does not have an i1 sign-extending load.
974 for (MVT VT : MVT::integer_valuetypes())
976
977 // ARM supports all 4 flavors of integer indexed load / store.
978 if (!Subtarget->isThumb1Only()) {
979 for (unsigned im = (unsigned)ISD::PRE_INC;
981 setIndexedLoadAction(im, MVT::i1, Legal);
982 setIndexedLoadAction(im, MVT::i8, Legal);
983 setIndexedLoadAction(im, MVT::i16, Legal);
984 setIndexedLoadAction(im, MVT::i32, Legal);
985 setIndexedStoreAction(im, MVT::i1, Legal);
986 setIndexedStoreAction(im, MVT::i8, Legal);
987 setIndexedStoreAction(im, MVT::i16, Legal);
988 setIndexedStoreAction(im, MVT::i32, Legal);
989 }
990 } else {
991 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
994 }
995
1000
1003 if (Subtarget->hasDSP()) {
1012 }
1013 if (Subtarget->hasBaseDSP()) {
1016 }
1017
1018 // i64 operation support.
1021 if (Subtarget->isThumb1Only()) {
1024 }
1025 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1026 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1028
1036 setOperationAction(ISD::LOAD, MVT::i64, Custom);
1037 setOperationAction(ISD::STORE, MVT::i64, Custom);
1038
1039 // MVE lowers 64-bit shifts to lsll and lsrl,
1040 // assuming that ISD::SRL and SRA of i64 are already marked custom.
1041 if (Subtarget->hasMVEIntegerOps())
1043
1044 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1045 if (Subtarget->isThumb1Only()) {
1049 }
1050
1051 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1053
1054 // ARM does not have ROTL.
1059 }
1061 // TODO: These two should be set to LibCall, but this currently breaks
1062 // the Linux kernel build. See #101786.
1065 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1068 }
1069
1070 // @llvm.readcyclecounter requires the Performance Monitors extension.
1071 // Default to the 0 expansion on unsupported platforms.
1072 // FIXME: Technically there are older ARM CPUs that have
1073 // implementation-specific ways of obtaining this information.
1074 if (Subtarget->hasPerfMon())
1075 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1076
1077 // Only ARMv6 has BSWAP.
1078 if (!Subtarget->hasV6Ops())
1080
1081 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1082 : Subtarget->hasDivideInARMMode();
1083 if (!hasDivide) {
1084 // These are expanded into libcalls if the CPU doesn't have a HW divider.
1087 }
1088
1089 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1092
1095 }
1096
1099
1100 // Register-based DivRem for AEABI (RTABI 4.2).
1101 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1102 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1105 HasStandaloneRem = false;
1106
1111 } else {
1114 }
1115
1120
1121 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1122 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1123
1124 // Use the default implementation.
1125 setOperationAction(ISD::VASTART, MVT::Other, Custom);
1126 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1127 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
1128 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1129 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
1130 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
1131
1132 if (TT.isOSWindows())
1133 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1134 else
1135 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1136
1137 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1138 // the default expansion.
1139 InsertFencesForAtomic = false;
1140 if (Subtarget->hasAnyDataBarrier() &&
1141 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1142 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1143 // to ldrex/strex loops already.
1144 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1145 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1146 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1147
1148 // On v8, we have particularly efficient implementations of atomic fences
1149 // if they can be combined with nearby atomic loads and stores.
1150 if (!Subtarget->hasAcquireRelease() ||
1151 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1152 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1153 InsertFencesForAtomic = true;
1154 }
1155 } else {
1156 // If there's anything we can use as a barrier, go through custom lowering
1157 // for ATOMIC_FENCE.
1158 // If the target has DMB in Thumb mode, fences can be inserted.
1159 if (Subtarget->hasDataBarrier())
1160 InsertFencesForAtomic = true;
1161
1162 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1163 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1164
1165 // Set them all to LibCall, which will force libcalls.
1166 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
1167 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
1168 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
1169 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
1170 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);
1171 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
1172 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
1173 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);
1174 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);
1175 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);
1176 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);
1177 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);
1178 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1179 // Unordered/Monotonic case.
1180 if (!InsertFencesForAtomic) {
1181 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1182 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1183 }
1184 }
1185
1186 // Compute supported atomic widths.
1187 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1188 // For targets where __sync_* routines are reliably available, we use them
1189 // if necessary.
1190 //
1191 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1192 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1193 //
1194 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1195 // such targets should provide __sync_* routines, which use the ARM mode
1196 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1197 // encoding; see ARMISD::MEMBARRIER_MCR.)
1199 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1200 Subtarget->hasForced32BitAtomics()) {
1201 // Cortex-M cores (besides Cortex-M0) have 32-bit atomics.
1203 } else {
1204 // We can't assume anything about other targets; just use libatomic
1205 // routines.
1207 }
1208
1210
1211 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1212
1213 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1214 if (!Subtarget->hasV6Ops()) {
1217 }
1219
1220 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1221 !Subtarget->isThumb1Only()) {
1222 // Turn f64 -> i64 into VMOVRRD and i64 -> f64 into VMOVDRR,
1223 // iff the target supports VFP2.
1224 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1226 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1227 setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);
1228 setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);
1229 setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);
1230 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
1231 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
1232 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
1233 }
1234
1235 // We want to custom lower some of our intrinsics.
1240
1250 if (Subtarget->hasFullFP16()) {
1254 }
1255
1257
1258 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1259 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1260 if (Subtarget->hasFullFP16())
1261 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1262 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1263 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1264 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1265
1266 // We don't support sin/cos/fmod/copysign/pow
1267 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1268 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1269 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1270 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1271 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1272 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1275 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1276 !Subtarget->isThumb1Only()) {
1279 }
1280 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1281 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1282
1283 if (!Subtarget->hasVFP4Base()) {
1286 }
1287
1288 // Various VFP goodness
1289 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1290 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1291 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1292 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1293 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1294 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall);
1295 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall);
1296 }
1297
1298 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1299 if (!Subtarget->hasFP16()) {
1300 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1301 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1302 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall);
1303 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall);
1304 }
1305
1306 // Strict floating-point comparisons need custom lowering.
1313 }
1314
1315 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1316 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1317
1318 // FP-ARMv8 implements a lot of rounding-like FP operations.
1319 if (Subtarget->hasFPARMv8Base()) {
1320 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1321 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1322 setOperationAction(ISD::FROUND, MVT::f32, Legal);
1323 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1324 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1325 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1326 setOperationAction(ISD::FROUNDEVEN, MVT::f32, Legal);
1327 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1328 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1329 if (Subtarget->hasNEON()) {
1330 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1331 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1332 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1333 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1334 }
1335
1336 if (Subtarget->hasFP64()) {
1337 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1338 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1339 setOperationAction(ISD::FROUND, MVT::f64, Legal);
1340 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1341 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1342 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1343 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Legal);
1344 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1345 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1346 }
1347 }
1348
1349 // FP16 operations often need to be promoted to call library functions.
1350 // clang-format off
1351 if (Subtarget->hasFullFP16()) {
1352 setOperationAction(ISD::LRINT, MVT::f16, Expand);
1353 setOperationAction(ISD::LROUND, MVT::f16, Expand);
1355
1356 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1357 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
1358 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
1359 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
1360 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
1361 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
1362 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
1363 ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW,
1370 setOperationAction(Op, MVT::f16, Promote);
1371 }
1372
1373 // Round-to-integer operations need custom lowering for fp16, as Promote
1374 // doesn't work because the result type is integer.
1376 setOperationAction(Op, MVT::f16, Custom);
1377
1378 for (auto Op : {ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC,
1379 ISD::FNEARBYINT, ISD::FRINT, ISD::FFLOOR,
1383 setOperationAction(Op, MVT::f16, Legal);
1384 }
1385 // clang-format on
1386 }
1387
1388 if (Subtarget->hasNEON()) {
1389 // vmin and vmax aren't available in a scalar form, so we can use
1390 // a NEON instruction with an undef lane instead.
1391 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1392 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1393 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1394 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1395 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1396 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1397 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1398 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1399
1400 if (Subtarget->hasV8Ops()) {
1401 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1402 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1403 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1404 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1405 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Legal);
1406 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Legal);
1407 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1408 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1409 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1410 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1411 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1412 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1413 }
1414
1415 if (Subtarget->hasFullFP16()) {
1416 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1417 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1418 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1419 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1420
1421 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1422 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1423 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1424 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1425
1426 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1427 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1428 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1429 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1430 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Legal);
1431 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Legal);
1432 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1433 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1434 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1435 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1436 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1437 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1438 }
1439 }
1440
1441 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1442 // it, but it's just a wrapper around ldexp.
1443 if (TT.isOSWindows()) {
1444 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1445 if (isOperationExpand(Op, MVT::f32))
1446 setOperationAction(Op, MVT::f32, Promote);
1447 }
1448
1449 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1450 // isn't legal.
1451 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1452 if (isOperationExpand(Op, MVT::f16))
1453 setOperationAction(Op, MVT::f16, Promote);
1454
1455 // We have target-specific DAG combine patterns for the following nodes:
1456 // ARMISD::VMOVRRD - no need to call setTargetDAGCombine.
1459
1460 if (Subtarget->hasMVEIntegerOps())
1462
1463 if (Subtarget->hasV6Ops())
1465 if (Subtarget->isThumb1Only())
1467 // Attempt to lower smin/smax to ssat/usat
1468 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1469 Subtarget->isThumb2()) {
1471 }
1472
1474
1475 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1476 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1478 else
1480
1481 //// temporary - rewrite interface to use type
1484 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1486 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1488
1489 // On ARM, arguments smaller than 4 bytes are extended, so all arguments
1490 // are at least 4-byte aligned.
1492
1493 // Prefer likely predicted branches to selects on out-of-order cores.
1494 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1495
1496 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1498 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1499
1500 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1501}
1502
1504 return Subtarget->useSoftFloat();
1505}
1506
1508 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1509}
1510
1511// FIXME: It might make sense to define the representative register class as the
1512// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1513// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1514// SPR's representative would be DPR_VFP2. This should work well if register
1515// pressure tracking were modified such that a register use would increment the
1516// pressure of the register class's representative and all of its super
1517// classes' representatives transitively. We have not implemented this because
1518// of the difficulty, prior to coalescing, of modeling operand register classes
1519// due to the common occurrence of cross-class copies and subregister insertions
1520// and extractions.
1521std::pair<const TargetRegisterClass *, uint8_t>
1523 MVT VT) const {
1524 const TargetRegisterClass *RRC = nullptr;
1525 uint8_t Cost = 1;
1526 switch (VT.SimpleTy) {
1527 default:
1529 // Use DPR as the representative register class for all floating-point
1530 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1531 // the cost is 1 for both f32 and f64.
1532 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1533 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1534 RRC = &ARM::DPRRegClass;
1535 // When NEON is used for SP, only half of the register file is available
1536 // because operations that define both SP and DP results will be constrained
1537 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1538 // coalescing by double-counting the SP regs. See the FIXME above.
1539 if (Subtarget->useNEONForSinglePrecisionFP())
1540 Cost = 2;
1541 break;
1542 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1543 case MVT::v4f32: case MVT::v2f64:
1544 RRC = &ARM::DPRRegClass;
1545 Cost = 2;
1546 break;
1547 case MVT::v4i64:
1548 RRC = &ARM::DPRRegClass;
1549 Cost = 4;
1550 break;
1551 case MVT::v8i64:
1552 RRC = &ARM::DPRRegClass;
1553 Cost = 8;
1554 break;
1555 }
1556 return std::make_pair(RRC, Cost);
1557}
1558
1559const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1560#define MAKE_CASE(V) \
1561 case V: \
1562 return #V;
1563 switch ((ARMISD::NodeType)Opcode) {
1564 case ARMISD::FIRST_NUMBER:
1565 break;
1768#undef MAKE_CASE
1769 }
1770 return nullptr;
1771}
1772
1774 EVT VT) const {
1775 if (!VT.isVector())
1776 return getPointerTy(DL);
1777
1778 // MVE has a predicate register.
1779 if ((Subtarget->hasMVEIntegerOps() &&
1780 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1781 VT == MVT::v16i8)) ||
1782 (Subtarget->hasMVEFloatOps() &&
1783 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1784 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1786}
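// For example, with MVE a setcc on MVT::v4i32 yields an MVT::v4i1 predicate
// (one bit per lane, held in VPR); scalar compares fall back to the
// pointer-sized integer returned above, and other vector types use the
// default integer-vector result type.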
1787
1788/// getRegClassFor - Return the register class that should be used for the
1789/// specified value type.
1790const TargetRegisterClass *
1791ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1792 (void)isDivergent;
1793 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1794 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1795 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1796 // MVE Q registers.
1797 if (Subtarget->hasNEON()) {
1798 if (VT == MVT::v4i64)
1799 return &ARM::QQPRRegClass;
1800 if (VT == MVT::v8i64)
1801 return &ARM::QQQQPRRegClass;
1802 }
1803 if (Subtarget->hasMVEIntegerOps()) {
1804 if (VT == MVT::v4i64)
1805 return &ARM::MQQPRRegClass;
1806 if (VT == MVT::v8i64)
1807 return &ARM::MQQQQPRRegClass;
1808 }
1810}
1811
1812// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1813// source/dest is aligned and the copy size is large enough. We therefore want
1814// to align such objects passed to memory intrinsics.
1815bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1816 Align &PrefAlign) const {
1817 if (!isa<MemIntrinsic>(CI))
1818 return false;
1819 MinSize = 8;
1820 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1821 // cycle faster than 4-byte aligned LDM.
1822 PrefAlign =
1823 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1824 return true;
1825}
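// Usage sketch: for a call like
//   call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 64, i1 false)
// this hook reports MinSize = 8 and, on v6+ non-M-class cores, PrefAlign = 8,
// allowing the caller of the hook to raise the alignment of the underlying
// objects so the backend can use 8-byte aligned LDM/STM sequences.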
1826
1827// Create a fast isel object.
1828FastISel *
1830 const TargetLibraryInfo *libInfo) const {
1831 return ARM::createFastISel(funcInfo, libInfo);
1832}
1833
1835 unsigned NumVals = N->getNumValues();
1836 if (!NumVals)
1837 return Sched::RegPressure;
1838
1839 for (unsigned i = 0; i != NumVals; ++i) {
1840 EVT VT = N->getValueType(i);
1841 if (VT == MVT::Glue || VT == MVT::Other)
1842 continue;
1843 if (VT.isFloatingPoint() || VT.isVector())
1844 return Sched::ILP;
1845 }
1846
1847 if (!N->isMachineOpcode())
1848 return Sched::RegPressure;
1849
1850 // Loads are scheduled for latency even if the instruction itinerary
1851 // is not available.
1852 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1853 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1854
1855 if (MCID.getNumDefs() == 0)
1856 return Sched::RegPressure;
1857 if (!Itins->isEmpty() &&
1858 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1859 return Sched::ILP;
1860
1861 return Sched::RegPressure;
1862}
1863
1864//===----------------------------------------------------------------------===//
1865// Lowering Code
1866//===----------------------------------------------------------------------===//
1867
1868static bool isSRL16(const SDValue &Op) {
1869 if (Op.getOpcode() != ISD::SRL)
1870 return false;
1871 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1872 return Const->getZExtValue() == 16;
1873 return false;
1874}
1875
1876static bool isSRA16(const SDValue &Op) {
1877 if (Op.getOpcode() != ISD::SRA)
1878 return false;
1879 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1880 return Const->getZExtValue() == 16;
1881 return false;
1882}
1883
1884static bool isSHL16(const SDValue &Op) {
1885 if (Op.getOpcode() != ISD::SHL)
1886 return false;
1887 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1888 return Const->getZExtValue() == 16;
1889 return false;
1890}
1891
1892// Check for a signed 16-bit value. We special-case SRA because it makes
1893// things simpler when also looking for SRAs that aren't sign-extending a
1894// smaller value. Without the check, we'd need to take extra care with
1895// checking order for some operations.
1896static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1897 if (isSRA16(Op))
1898 return isSHL16(Op.getOperand(0));
1899 return DAG.ComputeNumSignBits(Op) == 17;
1900}
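// Example: (sra (shl x, 16), 16) is matched structurally by isSRA16/isSHL16;
// otherwise the helper requires ComputeNumSignBits(Op) == 17, i.e. the
// analysis proves exactly 17 leading sign bits, which is the signature of a
// 16-bit value sign-extended to 32 bits, as expected by the signed 16x16
// multiply (SMULBB/SMLABB-style) patterns that use this helper.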
1901
1902/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
1903static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1904 switch (CC) {
1905 default: llvm_unreachable("Unknown condition code!");
1906 case ISD::SETNE: return ARMCC::NE;
1907 case ISD::SETEQ: return ARMCC::EQ;
1908 case ISD::SETGT: return ARMCC::GT;
1909 case ISD::SETGE: return ARMCC::GE;
1910 case ISD::SETLT: return ARMCC::LT;
1911 case ISD::SETLE: return ARMCC::LE;
1912 case ISD::SETUGT: return ARMCC::HI;
1913 case ISD::SETUGE: return ARMCC::HS;
1914 case ISD::SETULT: return ARMCC::LO;
1915 case ISD::SETULE: return ARMCC::LS;
1916 }
1917}
1918
1919/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1920static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1921 ARMCC::CondCodes &CondCode2) {
1922 CondCode2 = ARMCC::AL;
1923 switch (CC) {
1924 default: llvm_unreachable("Unknown FP condition!");
1925 case ISD::SETEQ:
1926 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1927 case ISD::SETGT:
1928 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1929 case ISD::SETGE:
1930 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1931 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1932 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1933 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1934 case ISD::SETO: CondCode = ARMCC::VC; break;
1935 case ISD::SETUO: CondCode = ARMCC::VS; break;
1936 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1937 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1938 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1939 case ISD::SETLT:
1940 case ISD::SETULT: CondCode = ARMCC::LT; break;
1941 case ISD::SETLE:
1942 case ISD::SETULE: CondCode = ARMCC::LE; break;
1943 case ISD::SETNE:
1944 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1945 }
1946}
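// Example of the two-condition case: an ordered not-equal (SETONE) has no
// single ARM condition after a VCMP/FMSTAT, so it is encoded as MI (ordered
// less-than) with CondCode2 = GT (ordered greater-than); callers that see
// CondCode2 != ARMCC::AL emit a second conditional instruction.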
1947
1948//===----------------------------------------------------------------------===//
1949// Calling Convention Implementation
1950//===----------------------------------------------------------------------===//
1951
1952/// getEffectiveCallingConv - Get the effective calling convention, taking into
1953/// account the presence of floating-point hardware and calling-convention
1954/// limitations, such as support for variadic functions.
1955CallingConv::ID
1956ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1957 bool isVarArg) const {
1958 switch (CC) {
1959 default:
1960 report_fatal_error("Unsupported calling convention");
1963 case CallingConv::GHC:
1965 return CC;
1971 case CallingConv::Swift:
1974 case CallingConv::C:
1975 case CallingConv::Tail:
1976 if (!getTM().isAAPCS_ABI())
1977 return CallingConv::ARM_APCS;
1978 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1979 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1980 !isVarArg)
1982 else
1984 case CallingConv::Fast:
1986 if (!getTM().isAAPCS_ABI()) {
1987 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1988 return CallingConv::Fast;
1989 return CallingConv::ARM_APCS;
1990 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1991 !isVarArg)
1993 else
1995 }
1996}
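// Illustrative mapping: a plain C call on an AAPCS target with FP registers,
// a hard-float ABI and no varargs becomes ARM_AAPCS_VFP; the same call with
// varargs, or on a soft-float / Thumb1-only configuration, becomes ARM_AAPCS,
// and pre-AAPCS targets fall back to ARM_APCS.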
1997
1999 bool isVarArg) const {
2000 return CCAssignFnForNode(CC, false, isVarArg);
2001}
2002
2004 bool isVarArg) const {
2005 return CCAssignFnForNode(CC, true, isVarArg);
2006}
2007
2008/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2009/// CallingConvention.
2010CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2011 bool Return,
2012 bool isVarArg) const {
2013 switch (getEffectiveCallingConv(CC, isVarArg)) {
2014 default:
2015 report_fatal_error("Unsupported calling convention");
2017 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2019 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2021 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2022 case CallingConv::Fast:
2023 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2024 case CallingConv::GHC:
2025 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2027 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2029 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2031 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2032 }
2033}
2034
2035SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2036 MVT LocVT, MVT ValVT, SDValue Val) const {
2037 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2038 Val);
2039 if (Subtarget->hasFullFP16()) {
2040 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2041 } else {
2042 Val = DAG.getNode(ISD::TRUNCATE, dl,
2043 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2044 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2045 }
2046 return Val;
2047}
2048
2049SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2050 MVT LocVT, MVT ValVT,
2051 SDValue Val) const {
2052 if (Subtarget->hasFullFP16()) {
2053 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2054 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2055 } else {
2056 Val = DAG.getNode(ISD::BITCAST, dl,
2057 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2058 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2059 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2060 }
2061 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2062}
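// Together MoveToHPR/MoveFromHPR model the rule that f16/bf16 values travel in
// the low 16 bits of a 32-bit location. A rough sketch of the no-FullFP16
// path: MoveToHPR bitcasts the incoming 32-bit value to i32, truncates to i16
// and bitcasts to the half type; MoveFromHPR does the reverse, zero-extending
// back to i32 before the final bitcast to the 32-bit location type.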
2063
2064/// LowerCallResult - Lower the result values of a call into the
2065/// appropriate copies out of appropriate physical registers.
2066SDValue ARMTargetLowering::LowerCallResult(
2067 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2068 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2069 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2070 SDValue ThisVal, bool isCmseNSCall) const {
2071 // Assign locations to each value returned by this call.
2073 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2074 *DAG.getContext());
2075 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2076
2077 // Copy all of the result registers out of their specified physreg.
2078 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2079 CCValAssign VA = RVLocs[i];
2080
2081 // Pass 'this' value directly from the argument to return value, to avoid
2082 // reg unit interference
2083 if (i == 0 && isThisReturn) {
2084 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2085 "unexpected return calling convention register assignment");
2086 InVals.push_back(ThisVal);
2087 continue;
2088 }
2089
2090 SDValue Val;
2091 if (VA.needsCustom() &&
2092 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2093 // Handle f64 or half of a v2f64.
2094 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2095 InGlue);
2096 Chain = Lo.getValue(1);
2097 InGlue = Lo.getValue(2);
2098 VA = RVLocs[++i]; // skip ahead to next loc
2099 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2100 InGlue);
2101 Chain = Hi.getValue(1);
2102 InGlue = Hi.getValue(2);
2103 if (!Subtarget->isLittle())
2104 std::swap (Lo, Hi);
2105 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2106
2107 if (VA.getLocVT() == MVT::v2f64) {
2108 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2109 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2110 DAG.getConstant(0, dl, MVT::i32));
2111
2112 VA = RVLocs[++i]; // skip ahead to next loc
2113 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2114 Chain = Lo.getValue(1);
2115 InGlue = Lo.getValue(2);
2116 VA = RVLocs[++i]; // skip ahead to next loc
2117 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2118 Chain = Hi.getValue(1);
2119 InGlue = Hi.getValue(2);
2120 if (!Subtarget->isLittle())
2121 std::swap (Lo, Hi);
2122 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2123 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2124 DAG.getConstant(1, dl, MVT::i32));
2125 }
2126 } else {
2127 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2128 InGlue);
2129 Chain = Val.getValue(1);
2130 InGlue = Val.getValue(2);
2131 }
2132
2133 switch (VA.getLocInfo()) {
2134 default: llvm_unreachable("Unknown loc info!");
2135 case CCValAssign::Full: break;
2136 case CCValAssign::BCvt:
2137 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2138 break;
2139 }
2140
2141 // f16 arguments have their size extended to 4 bytes and passed as if they
2142 // had been copied to the LSBs of a 32-bit register.
2143 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
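// (Illustrative, not from the original source: the f16 value 1.0 has bit
// pattern 0x3C00; under the soft-float ABI it arrives as the i32 0x00003C00
// in a GPR, while under the hard-float ABI it sits in the low 16 bits of an
// S register, and MoveToHPR recovers the f16 value from either form.)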
2144 if (VA.needsCustom() &&
2145 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2146 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2147
2148 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
2149 // is less than 32 bits must be sign- or zero-extended after the call for
2150 // security reasons. Although the ABI mandates an extension done by the
2151 // callee, the latter cannot be trusted to follow the rules of the ABI.
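// (Sketch of the hazard, for illustration: a non-secure callee returning an
// i8 could leave stale or attacker-chosen data in bits 8..31 of r0, so the
// secure-side caller re-extends the value itself via handleCMSEValue below.)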
2152 const ISD::InputArg &Arg = Ins[VA.getValNo()];
2153 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
2154 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
2155 Val = handleCMSEValue(Val, Arg, DAG, dl);
2156
2157 InVals.push_back(Val);
2158 }
2159
2160 return Chain;
2161}
2162
2163std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2164 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2165 bool IsTailCall, int SPDiff) const {
2166 SDValue DstAddr;
2167 MachinePointerInfo DstInfo;
2168 int32_t Offset = VA.getLocMemOffset();
2169 MachineFunction &MF = DAG.getMachineFunction();
2170
2171 if (IsTailCall) {
2172 Offset += SPDiff;
2173 auto PtrVT = getPointerTy(DAG.getDataLayout());
2174 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2175 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2176 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2177 DstInfo =
2178 MachinePointerInfo::getFixedStack(MF, FI);
2179 } else {
2180 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2181 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2182 StackPtr, PtrOff);
2183 DstInfo =
2184 MachinePointerInfo::getStack(MF, Offset);
2185 }
2186
2187 return std::make_pair(DstAddr, DstInfo);
2188}
2189
2190// Returns the type of copying which is required to set up a byval argument to
2191// a tail-called function. This isn't needed for non-tail calls, because they
2192// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
2193// avoid clobbering another argument (CopyViaTemp), and sometimes can be
2194// optimised to zero copies when forwarding an argument from the caller's
2195// caller (NoCopy).
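// Illustrative scenarios (not exhaustive): forwarding our own incoming byval
// argument to the identical stack slot of the tail-callee needs no copy
// (NoCopy); a byval object in our local frame can be copied straight to the
// argument area (CopyOnce); a byval source that itself lives in the outgoing
// argument area could be clobbered mid-setup, so it is staged through a
// temporary stack object first (CopyViaTemp).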
2196ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
2197 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2198 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2199 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2200
2201 // Globals are always safe to copy from.
2202 if (isa<GlobalAddressSDNode>(Src))
2203 return CopyOnce;
2204
2205 // Can only analyse frame index nodes, conservatively assume we need a
2206 // temporary.
2207 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
2208 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
2209 if (!SrcFrameIdxNode || !DstFrameIdxNode)
2210 return CopyViaTemp;
2211
2212 int SrcFI = SrcFrameIdxNode->getIndex();
2213 int DstFI = DstFrameIdxNode->getIndex();
2214 assert(MFI.isFixedObjectIndex(DstFI) &&
2215 "byval passed in non-fixed stack slot");
2216
2217 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
2218 int64_t DstOffset = MFI.getObjectOffset(DstFI);
2219
2220 // If the source is in the local frame, then the copy to the argument memory
2221 // is always valid.
2222 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
2223 if (!FixedSrc ||
2224 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
2225 return CopyOnce;
2226
2227 // In the case of byval arguments split between registers and the stack,
2228 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
2229 // stack portion, but the Src SDValue will refer to the full value, including
2230 // the local stack memory that the register portion gets stored into. We only
2231 // need to compare them for equality, so normalise on the full value version.
2232 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
2233 DstOffset -= RegSize;
2234
2235 // If the value is already in the correct location, then no copying is
2236 // needed. If not, then we need to copy via a temporary.
2237 if (SrcOffset == DstOffset)
2238 return NoCopy;
2239 else
2240 return CopyViaTemp;
2241}
2242
2243void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2244 SDValue Chain, SDValue &Arg,
2245 RegsToPassVector &RegsToPass,
2246 CCValAssign &VA, CCValAssign &NextVA,
2247 SDValue &StackPtr,
2248 SmallVectorImpl<SDValue> &MemOpChains,
2249 bool IsTailCall,
2250 int SPDiff) const {
2251 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2252 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2253 unsigned id = Subtarget->isLittle() ? 0 : 1;
2254 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2255
2256 if (NextVA.isRegLoc())
2257 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2258 else {
2259 assert(NextVA.isMemLoc());
2260 if (!StackPtr.getNode())
2261 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2262 getPointerTy(DAG.getDataLayout()));
2263
2264 SDValue DstAddr;
2265 MachinePointerInfo DstInfo;
2266 std::tie(DstAddr, DstInfo) =
2267 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2268 MemOpChains.push_back(
2269 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2270 }
2271}
2272
2273static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2274 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2275 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
2276}
2277
2278/// LowerCall - Lowering a call into a callseq_start <-
2279/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2280/// nodes.
2281SDValue
2282ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2283 SmallVectorImpl<SDValue> &InVals) const {
2284 SelectionDAG &DAG = CLI.DAG;
2285 SDLoc &dl = CLI.DL;
2286 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2287 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2288 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2289 SDValue Chain = CLI.Chain;
2290 SDValue Callee = CLI.Callee;
2291 bool &isTailCall = CLI.IsTailCall;
2292 CallingConv::ID CallConv = CLI.CallConv;
2293 bool doesNotRet = CLI.DoesNotReturn;
2294 bool isVarArg = CLI.IsVarArg;
2295 const CallBase *CB = CLI.CB;
2296
2297 MachineFunction &MF = DAG.getMachineFunction();
2298 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2299 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2300 MachineFunction::CallSiteInfo CSInfo;
2301 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2302 bool isThisReturn = false;
2303 bool isCmseNSCall = false;
2304 bool isSibCall = false;
2305 bool PreferIndirect = false;
2306 bool GuardWithBTI = false;
2307
2308 // Analyze operands of the call, assigning locations to each operand.
2309 SmallVector<CCValAssign, 16> ArgLocs;
2310 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2311 *DAG.getContext());
2312 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2313
2314 // Lower 'returns_twice' calls to a pseudo-instruction.
2315 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2316 !Subtarget->noBTIAtReturnTwice())
2317 GuardWithBTI = AFI->branchTargetEnforcement();
2318
2319 // Set type id for call site info.
2320 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2321 CSInfo = MachineFunction::CallSiteInfo(*CB);
2322
2323 // Determine whether this is a non-secure function call.
2324 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2325 isCmseNSCall = true;
2326
2327 // Disable tail calls if they're not supported.
2328 if (!Subtarget->supportsTailCall())
2329 isTailCall = false;
2330
2331 // For both the non-secure calls and the returns from a CMSE entry function,
2332 // the function needs to do some extra work after the call, or before the
2333 // return, respectively, thus it cannot end with a tail call
2334 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2335 isTailCall = false;
2336
2337 if (isa<GlobalAddressSDNode>(Callee)) {
2338 // If we're optimizing for minimum size and the function is called three or
2339 // more times in this block, we can improve codesize by calling indirectly
2340 // as BLXr has a 16-bit encoding.
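// (Rough size intuition, for illustration: each BL rewritten as a 16-bit BLXr
// saves two bytes, which must pay for materialising the callee address once,
// so the indirect form only wins when the same callee is called several times
// within one block.)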
2341 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2342 if (CLI.CB) {
2343 auto *BB = CLI.CB->getParent();
2344 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2345 count_if(GV->users(), [&BB](const User *U) {
2346 return isa<Instruction>(U) &&
2347 cast<Instruction>(U)->getParent() == BB;
2348 }) > 2;
2349 }
2350 }
2351 if (isTailCall) {
2352 // Check if it's really possible to do a tail call.
2353 isTailCall =
2354 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2355
2356 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2357 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2358 isSibCall = true;
2359
2360 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2361 // detected sibcalls.
2362 if (isTailCall)
2363 ++NumTailCalls;
2364 }
2365
2366 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2367 report_fatal_error("failed to perform tail call elimination on a call "
2368 "site marked musttail");
2369
2370 // Get a count of how many bytes are to be pushed on the stack.
2371 unsigned NumBytes = CCInfo.getStackSize();
2372
2373 // SPDiff is the byte offset of the call's argument area from the callee's.
2374 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2375 // by this amount for a tail call. In a sibling call it must be 0 because the
2376 // caller will deallocate the entire stack and the callee still expects its
2377 // arguments to begin at SP+0. Completely unused for non-tail calls.
2378 int SPDiff = 0;
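// (Worked example with hypothetical sizes: if the caller received 16 bytes of
// stack arguments but this tail call needs 24 bytes, aligned up to 32, then
// SPDiff = 16 - 32 = -16 and extra space must be reserved; if the tail call
// passes everything in registers, SPDiff = 16 - 0 = +16 and the existing
// slots more than suffice.)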
2379
2380 if (isTailCall && !isSibCall) {
2381 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2382 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2383
2384 // Since callee will pop argument stack as a tail call, we must keep the
2385 // popped size 16-byte aligned.
2386 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2387 assert(StackAlign && "data layout string is missing stack alignment");
2388 NumBytes = alignTo(NumBytes, *StackAlign);
2389
2390 // SPDiff will be negative if this tail call requires more space than we
2391 // would automatically have in our incoming argument space. Positive if we
2392 // can actually shrink the stack.
2393 SPDiff = NumReusableBytes - NumBytes;
2394
2395 // If this call requires more stack than we have available from
2396 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2397 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2398 AFI->setArgRegsSaveSize(-SPDiff);
2399 }
2400
2401 if (isSibCall) {
2402 // For sibling tail calls, memory operands are available in our caller's stack.
2403 NumBytes = 0;
2404 } else {
2405 // Adjust the stack pointer for the new arguments...
2406 // These operations are automatically eliminated by the prolog/epilog pass
2407 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2408 }
2409
2410 SDValue StackPtr =
2411 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2412
2413 RegsToPassVector RegsToPass;
2414 SmallVector<SDValue, 8> MemOpChains;
2415
2416 // If we are doing a tail-call, any byval arguments will be written to stack
2417 // space which was used for incoming arguments. If any of the values being used
2418 // are incoming byval arguments to this function, then they might be
2419 // overwritten by the stores of the outgoing arguments. To avoid this, we
2420 // need to make a temporary copy of them in local stack space, then copy back
2421 // to the argument area.
2422 DenseMap<unsigned, SDValue> ByValTemporaries;
2423 SDValue ByValTempChain;
2424 if (isTailCall) {
2425 SmallVector<SDValue, 8> ByValCopyChains;
2426 for (const CCValAssign &VA : ArgLocs) {
2427 unsigned ArgIdx = VA.getValNo();
2428 SDValue Src = OutVals[ArgIdx];
2429 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2430
2431 if (!Flags.isByVal())
2432 continue;
2433
2434 SDValue Dst;
2435 MachinePointerInfo DstInfo;
2436 std::tie(Dst, DstInfo) =
2437 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2438 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2439
2440 if (Copy == NoCopy) {
2441 // If the argument is already at the correct offset on the stack
2442 // (because we are forwarding a byval argument from our caller), we
2443 // don't need any copying.
2444 continue;
2445 } else if (Copy == CopyOnce) {
2446 // If the argument is in our local stack frame, no other argument
2447 // preparation can clobber it, so we can copy it to the final location
2448 // later.
2449 ByValTemporaries[ArgIdx] = Src;
2450 } else {
2451 assert(Copy == CopyViaTemp && "unexpected enum value");
2452 // If we might be copying this argument from the outgoing argument
2453 // stack area, we need to copy via a temporary in the local stack
2454 // frame.
2455 int TempFrameIdx = MFI.CreateStackObject(
2456 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2457 SDValue Temp =
2458 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2459
2460 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2461 SDValue AlignNode =
2462 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2463
2464 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2465 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2466 ByValCopyChains.push_back(
2467 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2468 ByValTemporaries[ArgIdx] = Temp;
2469 }
2470 }
2471 if (!ByValCopyChains.empty())
2472 ByValTempChain =
2473 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2474 }
2475
2476 // During a tail call, stores to the argument area must happen after all of
2477 // the function's incoming arguments have been loaded because they may alias.
2478 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2479 // there's no point in doing so repeatedly so this tracks whether that's
2480 // happened yet.
2481 bool AfterFormalArgLoads = false;
2482
2483 // Walk the register/memloc assignments, inserting copies/loads. In the case
2484 // of tail call optimization, arguments are handled later.
2485 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2486 i != e;
2487 ++i, ++realArgIdx) {
2488 CCValAssign &VA = ArgLocs[i];
2489 SDValue Arg = OutVals[realArgIdx];
2490 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2491 bool isByVal = Flags.isByVal();
2492
2493 // Promote the value if needed.
2494 switch (VA.getLocInfo()) {
2495 default: llvm_unreachable("Unknown loc info!");
2496 case CCValAssign::Full: break;
2497 case CCValAssign::SExt:
2498 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2499 break;
2500 case CCValAssign::ZExt:
2501 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2502 break;
2503 case CCValAssign::AExt:
2504 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2505 break;
2506 case CCValAssign::BCvt:
2507 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2508 break;
2509 }
2510
2511 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2512 Chain = DAG.getStackArgumentTokenFactor(Chain);
2513 if (ByValTempChain)
2514 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2515 ByValTempChain);
2516 AfterFormalArgLoads = true;
2517 }
2518
2519 // f16 arguments have their size extended to 4 bytes and passed as if they
2520 // had been copied to the LSBs of a 32-bit register.
2521 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2522 if (VA.needsCustom() &&
2523 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2524 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2525 } else {
2526 // f16 arguments could have been extended prior to argument lowering.
2527 // Mask those arguments if this is a CMSE nonsecure call.
2528 auto ArgVT = Outs[realArgIdx].ArgVT;
2529 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2530 auto LocBits = VA.getLocVT().getSizeInBits();
2531 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2532 SDValue Mask =
2533 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2534 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2535 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2536 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2537 }
2538 }
2539
2540 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2541 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2542 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2543 DAG.getConstant(0, dl, MVT::i32));
2544 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2545 DAG.getConstant(1, dl, MVT::i32));
2546
2547 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2548 StackPtr, MemOpChains, isTailCall, SPDiff);
2549
2550 VA = ArgLocs[++i]; // skip ahead to next loc
2551 if (VA.isRegLoc()) {
2552 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2553 StackPtr, MemOpChains, isTailCall, SPDiff);
2554 } else {
2555 assert(VA.isMemLoc());
2556 SDValue DstAddr;
2557 MachinePointerInfo DstInfo;
2558 std::tie(DstAddr, DstInfo) =
2559 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2560 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2561 }
2562 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2563 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2564 StackPtr, MemOpChains, isTailCall, SPDiff);
2565 } else if (VA.isRegLoc()) {
2566 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2567 Outs[0].VT == MVT::i32) {
2568 assert(VA.getLocVT() == MVT::i32 &&
2569 "unexpected calling convention register assignment");
2570 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2571 "unexpected use of 'returned'");
2572 isThisReturn = true;
2573 }
2574 const TargetOptions &Options = DAG.getTarget().Options;
2575 if (Options.EmitCallSiteInfo)
2576 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2577 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2578 } else if (isByVal) {
2579 assert(VA.isMemLoc());
2580 unsigned offset = 0;
2581
2582 // True if this byval aggregate will be split between registers
2583 // and memory.
2584 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2585 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2586
2587 SDValue ByValSrc;
2588 bool NeedsStackCopy;
2589 if (auto It = ByValTemporaries.find(realArgIdx);
2590 It != ByValTemporaries.end()) {
2591 ByValSrc = It->second;
2592 NeedsStackCopy = true;
2593 } else {
2594 ByValSrc = Arg;
2595 NeedsStackCopy = !isTailCall;
2596 }
2597
2598 // If part of the argument is in registers, load them.
2599 if (CurByValIdx < ByValArgsCount) {
2600 unsigned RegBegin, RegEnd;
2601 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2602
2603 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2604 unsigned int i, j;
2605 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2606 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2607 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2608 SDValue Load =
2609 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2610 DAG.InferPtrAlign(AddArg));
2611 MemOpChains.push_back(Load.getValue(1));
2612 RegsToPass.push_back(std::make_pair(j, Load));
2613 }
2614
2615 // If the parameter size exceeds the register area, the "offset" value
2616 // helps us to calculate the stack slot for the remaining part properly.
2617 offset = RegEnd - RegBegin;
2618
2619 CCInfo.nextInRegsParam();
2620 }
2621
2622 // If the memory part of the argument isn't already in the correct place
2623 // (which can happen with tail calls), copy it into the argument area.
2624 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2625 auto PtrVT = getPointerTy(DAG.getDataLayout());
2626 SDValue Dst;
2627 MachinePointerInfo DstInfo;
2628 std::tie(Dst, DstInfo) =
2629 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2630 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2631 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2632 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2633 MVT::i32);
2634 SDValue AlignNode =
2635 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2636
2637 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2638 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2639 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2640 Ops));
2641 }
2642 } else {
2643 assert(VA.isMemLoc());
2644 SDValue DstAddr;
2645 MachinePointerInfo DstInfo;
2646 std::tie(DstAddr, DstInfo) =
2647 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2648
2649 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2650 MemOpChains.push_back(Store);
2651 }
2652 }
2653
2654 if (!MemOpChains.empty())
2655 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2656
2657 // Build a sequence of copy-to-reg nodes chained together with token chain
2658 // and flag operands which copy the outgoing args into the appropriate regs.
2659 SDValue InGlue;
2660 for (const auto &[Reg, N] : RegsToPass) {
2661 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2662 InGlue = Chain.getValue(1);
2663 }
2664
2665 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2666 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2667 // node so that legalize doesn't hack it.
2668 bool isDirect = false;
2669
2670 const TargetMachine &TM = getTargetMachine();
2671 const GlobalValue *GVal = nullptr;
2672 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2673 GVal = G->getGlobal();
2674 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2675
2676 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2677 bool isLocalARMFunc = false;
2678 auto PtrVt = getPointerTy(DAG.getDataLayout());
2679
2680 if (Subtarget->genLongCalls()) {
2681 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2682 "long-calls codegen is not position independent!");
2683 // Handle a global address or an external symbol. If it's not one of
2684 // those, the target's already in a register, so we don't need to do
2685 // anything extra.
2686 if (isa<GlobalAddressSDNode>(Callee)) {
2687 if (Subtarget->genExecuteOnly()) {
2688 if (Subtarget->useMovt())
2689 ++NumMovwMovt;
2690 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2691 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2692 } else {
2693 // Create a constant pool entry for the callee address
2694 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2695 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2696 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2697
2698 // Get the address of the callee into a register
2699 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2700 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2701 Callee = DAG.getLoad(
2702 PtrVt, dl, DAG.getEntryNode(), Addr,
2703 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2704 }
2705 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2706 const char *Sym = S->getSymbol();
2707
2708 if (Subtarget->genExecuteOnly()) {
2709 if (Subtarget->useMovt())
2710 ++NumMovwMovt;
2711 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2712 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2713 } else {
2714 // Create a constant pool entry for the callee address
2715 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2716 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2717 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2718
2719 // Get the address of the callee into a register
2720 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2721 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2722 Callee = DAG.getLoad(
2723 PtrVt, dl, DAG.getEntryNode(), Addr,
2724 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2725 }
2726 }
2727 } else if (isa<GlobalAddressSDNode>(Callee)) {
2728 if (!PreferIndirect) {
2729 isDirect = true;
2730 bool isDef = GVal->isStrongDefinitionForLinker();
2731
2732 // ARM call to a local ARM function is predicable.
2733 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2734 // tBX takes a register source operand.
2735 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2736 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2737 Callee = DAG.getNode(
2738 ARMISD::WrapperPIC, dl, PtrVt,
2739 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2740 Callee = DAG.getLoad(
2741 PtrVt, dl, DAG.getEntryNode(), Callee,
2745 } else if (Subtarget->isTargetCOFF()) {
2746 assert(Subtarget->isTargetWindows() &&
2747 "Windows is the only supported COFF target");
2748 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2749 if (GVal->hasDLLImportStorageClass())
2750 TargetFlags = ARMII::MO_DLLIMPORT;
2751 else if (!TM.shouldAssumeDSOLocal(GVal))
2752 TargetFlags = ARMII::MO_COFFSTUB;
2753 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2754 TargetFlags);
2755 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2756 Callee =
2757 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2758 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2759 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2760 } else {
2761 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2762 }
2763 }
2764 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2765 isDirect = true;
2766 // tBX takes a register source operand.
2767 const char *Sym = S->getSymbol();
2768 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2769 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2770 ARMConstantPoolValue *CPV =
2771 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2772 ARMPCLabelIndex, 4);
2773 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2774 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2775 Callee = DAG.getLoad(
2776 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2777 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2778 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2779 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2780 } else {
2781 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2782 }
2783 }
2784
2785 if (isCmseNSCall) {
2786 assert(!isARMFunc && !isDirect &&
2787 "Cannot handle call to ARM function or direct call");
2788 if (NumBytes > 0) {
2789 DAG.getContext()->diagnose(
2790 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2791 "call to non-secure function would require "
2792 "passing arguments on stack",
2793 dl.getDebugLoc()));
2794 }
2795 if (isStructRet) {
2796 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2798 "call to non-secure function would return value through pointer",
2799 dl.getDebugLoc()));
2800 }
2801 }
2802
2803 // FIXME: handle tail calls differently.
2804 unsigned CallOpc;
2805 if (Subtarget->isThumb()) {
2806 if (GuardWithBTI)
2807 CallOpc = ARMISD::t2CALL_BTI;
2808 else if (isCmseNSCall)
2809 CallOpc = ARMISD::tSECALL;
2810 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2811 CallOpc = ARMISD::CALL_NOLINK;
2812 else
2813 CallOpc = ARMISD::CALL;
2814 } else {
2815 if (!isDirect && !Subtarget->hasV5TOps())
2816 CallOpc = ARMISD::CALL_NOLINK;
2817 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2818 // Emit regular call when code size is the priority
2819 !Subtarget->hasMinSize())
2820 // "mov lr, pc; b _foo" to avoid confusing the RSP
2821 CallOpc = ARMISD::CALL_NOLINK;
2822 else
2823 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2824 }
2825
2826 // We don't usually want to end the call-sequence here because we would tidy
2827 // the frame up *after* the call, however in the ABI-changing tail-call case
2828 // we've carefully laid out the parameters so that when sp is reset they'll be
2829 // in the correct location.
2830 if (isTailCall && !isSibCall) {
2831 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2832 InGlue = Chain.getValue(1);
2833 }
2834
2835 std::vector<SDValue> Ops;
2836 Ops.push_back(Chain);
2837 Ops.push_back(Callee);
2838
2839 if (isTailCall) {
2840 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2841 }
2842
2843 // Add argument registers to the end of the list so that they are known live
2844 // into the call.
2845 for (const auto &[Reg, N] : RegsToPass)
2846 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2847
2848 // Add a register mask operand representing the call-preserved registers.
2849 const uint32_t *Mask;
2850 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2851 if (isThisReturn) {
2852 // For 'this' returns, use the R0-preserving mask if applicable
2853 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2854 if (!Mask) {
2855 // Set isThisReturn to false if the calling convention is not one that
2856 // allows 'returned' to be modeled in this way, so LowerCallResult does
2857 // not try to pass 'this' straight through
2858 isThisReturn = false;
2859 Mask = ARI->getCallPreservedMask(MF, CallConv);
2860 }
2861 } else
2862 Mask = ARI->getCallPreservedMask(MF, CallConv);
2863
2864 assert(Mask && "Missing call preserved mask for calling convention");
2865 Ops.push_back(DAG.getRegisterMask(Mask));
2866
2867 if (InGlue.getNode())
2868 Ops.push_back(InGlue);
2869
2870 if (isTailCall) {
2871 MF.getFrameInfo().setHasTailCall();
2872 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2873 if (CLI.CFIType)
2874 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2875 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2876 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2877 return Ret;
2878 }
2879
2880 // Returns a chain and a flag for retval copy to use.
2881 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2882 if (CLI.CFIType)
2883 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2884 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2885 InGlue = Chain.getValue(1);
2886 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2887
2888 // If we're guaranteeing tail-calls will be honoured, the callee must
2889 // pop its own argument stack on return. But this call is *not* a tail call so
2890 // we need to undo that after it returns to restore the status-quo.
2891 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2892 uint64_t CalleePopBytes =
2893 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
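// (Worked example: under fastcc with GuaranteedTailCallOpt, 20 bytes of stack
// arguments become alignTo(20, 16) = 32 callee-popped bytes; in all other
// cases the sentinel -1U is used, since the callee leaves the stack alone.)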
2894
2895 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2896 if (!Ins.empty())
2897 InGlue = Chain.getValue(1);
2898
2899 // Handle result values, copying them out of physregs into vregs that we
2900 // return.
2901 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2902 InVals, isThisReturn,
2903 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2904}
2905
2906/// HandleByVal - Every parameter *after* a byval parameter is passed
2907/// on the stack. Remember the next parameter register to allocate,
2908 /// and then confiscate the rest of the parameter registers to ensure
2909/// this.
2910void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2911 Align Alignment) const {
2912 // Byval (as with any stack) slots are always at least 4 byte aligned.
2913 Alignment = std::max(Alignment, Align(4));
2914
2915 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2916 if (!Reg)
2917 return;
2918
2919 unsigned AlignInRegs = Alignment.value() / 4;
2920 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2921 for (unsigned i = 0; i < Waste; ++i)
2922 Reg = State->AllocateReg(GPRArgRegs);
2923
2924 if (!Reg)
2925 return;
2926
2927 unsigned Excess = 4 * (ARM::R4 - Reg);
2928
2929 // Special case when NSAA != SP and the parameter size is greater than the
2930 // size of all remaining GPR regs. In that case we can't split the parameter;
2931 // we must send it to the stack. We also must set the NCRN to R4, so that all
2932 // remaining registers are wasted.
2933 const unsigned NSAAOffset = State->getStackSize();
2934 if (NSAAOffset != 0 && Size > Excess) {
2935 while (State->AllocateReg(GPRArgRegs))
2936 ;
2937 return;
2938 }
2939
2940 // The first register for the byval parameter is the first register that
2941 // wasn't allocated before this method call, so it would be "reg".
2942 // If the parameter is small enough to be saved in the range [reg, r4), then
2943 // the end (first after last) register would be reg + param-size-in-regs;
2944 // otherwise the parameter is split between registers and the stack, and the
2945 // end register would be r4 in this case.
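// (Worked example, assuming no stack arguments have been allocated yet: for a
// 12-byte byval with r1 as the first free register, ByValRegBegin/End become
// r1/r4, the whole object travels in r1-r3 and Size drops to 0; a 20-byte
// byval in the same position uses r1-r3 plus 8 bytes of stack, so Size
// becomes 8.)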
2946 unsigned ByValRegBegin = Reg;
2947 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2948 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2949 // Note: the first register was already allocated at the beginning of this
2950 // function, so allocate the remaining registers we need here.
2951 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2952 State->AllocateReg(GPRArgRegs);
2953 // A byval parameter that is split between registers and memory needs its
2954 // size truncated here.
2955 // In the case where the entire structure fits in registers, we set the
2956 // size in memory to zero.
2957 Size = std::max<int>(Size - Excess, 0);
2958}
2959
2960/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2961/// for tail call optimization. Targets which want to do tail call
2962/// optimization should implement this function. Note that this function also
2963/// processes musttail calls, so when this function returns false on a valid
2964/// musttail call, a fatal backend error occurs.
2965bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2966 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2967 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2968 CallingConv::ID CalleeCC = CLI.CallConv;
2969 SDValue Callee = CLI.Callee;
2970 bool isVarArg = CLI.IsVarArg;
2971 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2972 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2973 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2974 const SelectionDAG &DAG = CLI.DAG;
2975 MachineFunction &MF = DAG.getMachineFunction();
2976 const Function &CallerF = MF.getFunction();
2977 CallingConv::ID CallerCC = CallerF.getCallingConv();
2978
2979 assert(Subtarget->supportsTailCall());
2980
2981 // Indirect tail-calls require a register to hold the target address. That
2982 // register must be:
2983 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2984 // * Not callee-saved, so must be one of r0-r3 or r12.
2985 // * Not used to hold an argument to the tail-called function, which might be
2986 // in r0-r3.
2987 // * Not used to hold the return address authentication code, which is in r12
2988 // if enabled.
2989 // Sometimes, no register matches all of these conditions, so we can't do a
2990 // tail-call.
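// (Concrete case: a Thumb1 indirect tail call that passes four integer
// arguments occupies r0-r3, and r12 is not usable for the address on Thumb1,
// so no register remains for the target and the tail call is rejected.)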
2991 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2992 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2993 ARM::R3};
2994 if (!(Subtarget->isThumb1Only() ||
2995 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2996 AddressRegisters.insert(ARM::R12);
2997 for (const CCValAssign &AL : ArgLocs)
2998 if (AL.isRegLoc())
2999 AddressRegisters.erase(AL.getLocReg());
3000 if (AddressRegisters.empty()) {
3001 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
3002 return false;
3003 }
3004 }
3005
3006 // Look for obvious safe cases to perform tail call optimization that do not
3007 // require ABI changes. This is what gcc calls sibcall.
3008
3009 // Exception-handling functions need a special set of instructions to indicate
3010 // a return to the hardware. Tail-calling another function would probably
3011 // break this.
3012 if (CallerF.hasFnAttribute("interrupt")) {
3013 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
3014 return false;
3015 }
3016
3017 if (canGuaranteeTCO(CalleeCC,
3018 getTargetMachine().Options.GuaranteedTailCallOpt)) {
3019 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
3020 << " (guaranteed tail-call CC)\n");
3021 return CalleeCC == CallerCC;
3022 }
3023
3024 // Also avoid sibcall optimization if either caller or callee uses struct
3025 // return semantics.
3026 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
3027 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
3028 if (isCalleeStructRet != isCallerStructRet) {
3029 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
3030 return false;
3031 }
3032
3033 // Externally-defined functions with weak linkage should not be
3034 // tail-called on ARM when the OS does not support dynamic
3035 // pre-emption of symbols, as the AAELF spec requires normal calls
3036 // to undefined weak functions to be replaced with a NOP or jump to the
3037 // next instruction. The behaviour of branch instructions in this
3038 // situation (as used for tail calls) is implementation-defined, so we
3039 // cannot rely on the linker replacing the tail call with a return.
3040 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3041 const GlobalValue *GV = G->getGlobal();
3042 const Triple &TT = getTargetMachine().getTargetTriple();
3043 if (GV->hasExternalWeakLinkage() &&
3044 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
3045 TT.isOSBinFormatMachO())) {
3046 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
3047 return false;
3048 }
3049 }
3050
3051 // Check that the call results are passed in the same way.
3052 LLVMContext &C = *DAG.getContext();
3053 if (!CCState::resultsCompatible(
3054 getEffectiveCallingConv(CalleeCC, isVarArg),
3055 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3056 CCAssignFnForReturn(CalleeCC, isVarArg),
3057 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
3058 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
3059 return false;
3060 }
3061 // The callee has to preserve all registers the caller needs to preserve.
3062 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3063 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3064 if (CalleeCC != CallerCC) {
3065 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3066 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
3067 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
3068 return false;
3069 }
3070 }
3071
3072 // If Caller's vararg argument has been split between registers and stack, do
3073 // not perform tail call, since part of the argument is in caller's local
3074 // frame.
3075 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3076 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
3077 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
3078 return false;
3079 }
3080
3081 // If the callee takes no arguments then go on to check the results of the
3082 // call.
3083 const MachineRegisterInfo &MRI = MF.getRegInfo();
3084 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
3085 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
3086 return false;
3087 }
3088
3089 // If the stack arguments for this call do not fit into our own save area then
3090 // the call cannot be made tail.
3091 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
3092 return false;
3093
3094 LLVM_DEBUG(dbgs() << "true\n");
3095 return true;
3096}
3097
3098bool
3099ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3100 MachineFunction &MF, bool isVarArg,
3101 const SmallVectorImpl<ISD::OutputArg> &Outs,
3102 LLVMContext &Context, const Type *RetTy) const {
3103 SmallVector<CCValAssign, 16> RVLocs;
3104 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3105 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3106}
3107
3108 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3109 const SDLoc &DL, SelectionDAG &DAG) {
3110 const MachineFunction &MF = DAG.getMachineFunction();
3111 const Function &F = MF.getFunction();
3112
3113 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3114
3115 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3116 // version of the "preferred return address". These offsets affect the return
3117 // instruction if this is a return from PL1 without hypervisor extensions.
3118 // IRQ/FIQ: +4 "subs pc, lr, #4"
3119 // SWI: 0 "subs pc, lr, #0"
3120 // ABORT: +4 "subs pc, lr, #4"
3121 // UNDEF: +4/+2 "subs pc, lr, #0"
3122 // UNDEF varies depending on where the exception came from ARM or Thumb
3123 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3124
3125 int64_t LROffset;
3126 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3127 IntKind == "ABORT")
3128 LROffset = 4;
3129 else if (IntKind == "SWI" || IntKind == "UNDEF")
3130 LROffset = 0;
3131 else
3132 report_fatal_error("Unsupported interrupt attribute. If present, value "
3133 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3134
3135 RetOps.insert(RetOps.begin() + 1,
3136 DAG.getConstant(LROffset, DL, MVT::i32, false));
3137
3138 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3139}
3140
3141SDValue
3142ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3143 bool isVarArg,
3144 const SmallVectorImpl<ISD::OutputArg> &Outs,
3145 const SmallVectorImpl<SDValue> &OutVals,
3146 const SDLoc &dl, SelectionDAG &DAG) const {
3147 // CCValAssign - represent the assignment of the return value to a location.
3148 SmallVector<CCValAssign, 16> RVLocs;
3149
3150 // CCState - Info about the registers and stack slots.
3151 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3152 *DAG.getContext());
3153
3154 // Analyze outgoing return values.
3155 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3156
3157 SDValue Glue;
3158 SmallVector<SDValue, 4> RetOps;
3159 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3160 bool isLittleEndian = Subtarget->isLittle();
3161
3162 MachineFunction &MF = DAG.getMachineFunction();
3163 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3164 AFI->setReturnRegsCount(RVLocs.size());
3165
3166 // Report error if cmse entry function returns structure through first ptr arg.
3167 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3168 // Note: using an empty SDLoc(), as the first line of the function is a
3169 // better place to report than the last line.
3170 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3172 "secure entry function would return value through pointer",
3173 SDLoc().getDebugLoc()));
3174 }
3175
3176 // Copy the result values into the output registers.
3177 for (unsigned i = 0, realRVLocIdx = 0;
3178 i != RVLocs.size();
3179 ++i, ++realRVLocIdx) {
3180 CCValAssign &VA = RVLocs[i];
3181 assert(VA.isRegLoc() && "Can only return in registers!");
3182
3183 SDValue Arg = OutVals[realRVLocIdx];
3184 bool ReturnF16 = false;
3185
3186 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
3187 // Half-precision return values can be returned like this:
3188 //
3189 // t11 f16 = fadd ...
3190 // t12: i16 = bitcast t11
3191 // t13: i32 = zero_extend t12
3192 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3193 //
3194 // to avoid code generation for bitcasts, we simply set Arg to the node
3195 // that produces the f16 value, t11 in this case.
3196 //
3197 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3198 SDValue ZE = Arg.getOperand(0);
3199 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3200 SDValue BC = ZE.getOperand(0);
3201 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3202 Arg = BC.getOperand(0);
3203 ReturnF16 = true;
3204 }
3205 }
3206 }
3207 }
3208
3209 switch (VA.getLocInfo()) {
3210 default: llvm_unreachable("Unknown loc info!");
3211 case CCValAssign::Full: break;
3212 case CCValAssign::BCvt:
3213 if (!ReturnF16)
3214 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3215 break;
3216 }
3217
3218 // Mask f16 arguments if this is a CMSE nonsecure entry.
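// (Example of the hazard: an f16 return value occupies only the low 16 bits
// of the return register, so without masking the upper bits could carry stale
// secure-state data back to the non-secure caller.)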
3219 auto RetVT = Outs[realRVLocIdx].ArgVT;
3220 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3221 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3222 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3223 } else {
3224 auto LocBits = VA.getLocVT().getSizeInBits();
3225 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3226 SDValue Mask =
3227 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3228 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3229 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3230 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3231 }
3232 }
3233
3234 if (VA.needsCustom() &&
3235 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3236 if (VA.getLocVT() == MVT::v2f64) {
3237 // Extract the first half and return it in two registers.
3238 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3239 DAG.getConstant(0, dl, MVT::i32));
3240 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3241 DAG.getVTList(MVT::i32, MVT::i32), Half);
3242
3243 Chain =
3244 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3245 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3246 Glue = Chain.getValue(1);
3247 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3248 VA = RVLocs[++i]; // skip ahead to next loc
3249 Chain =
3250 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3251 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3252 Glue = Chain.getValue(1);
3253 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3254 VA = RVLocs[++i]; // skip ahead to next loc
3255
3256 // Extract the 2nd half and fall through to handle it as an f64 value.
3257 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3258 DAG.getConstant(1, dl, MVT::i32));
3259 }
3260 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3261 // available.
3262 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3263 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3264 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3265 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3266 Glue = Chain.getValue(1);
3267 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3268 VA = RVLocs[++i]; // skip ahead to next loc
3269 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3270 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3271 } else
3272 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3273
3274 // Guarantee that all emitted copies are stuck together, avoiding
3275 // interleaving with anything that could clobber the return registers.
3276 Glue = Chain.getValue(1);
3277 RetOps.push_back(DAG.getRegister(
3278 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3279 }
3280 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3281 const MCPhysReg *I =
3282 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3283 if (I) {
3284 for (; *I; ++I) {
3285 if (ARM::GPRRegClass.contains(*I))
3286 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3287 else if (ARM::DPRRegClass.contains(*I))
3288 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3289 else
3290 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3291 }
3292 }
3293
3294 // Update chain and glue.
3295 RetOps[0] = Chain;
3296 if (Glue.getNode())
3297 RetOps.push_back(Glue);
3298
3299 // CPUs which aren't M-class use a special sequence to return from
3300 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3301 // though we use "subs pc, lr, #N").
3302 //
3303 // M-class CPUs actually use a normal return sequence with a special
3304 // (hardware-provided) value in LR, so the normal code path works.
3305 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3306 !Subtarget->isMClass()) {
3307 if (Subtarget->isThumb1Only())
3308 report_fatal_error("interrupt attribute is not supported in Thumb1");
3309 return LowerInterruptReturn(RetOps, dl, DAG);
3310 }
3311
3312 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE
3313 : ARMISD::RET_GLUE;
3314 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3315}
3316
3317bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3318 if (N->getNumValues() != 1)
3319 return false;
3320 if (!N->hasNUsesOfValue(1, 0))
3321 return false;
3322
3323 SDValue TCChain = Chain;
3324 SDNode *Copy = *N->user_begin();
3325 if (Copy->getOpcode() == ISD::CopyToReg) {
3326 // If the copy has a glue operand, we conservatively assume it isn't safe to
3327 // perform a tail call.
3328 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3329 return false;
3330 TCChain = Copy->getOperand(0);
3331 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3332 SDNode *VMov = Copy;
3333 // f64 returned in a pair of GPRs.
3334 SmallPtrSet<SDNode*, 2> Copies;
3335 for (SDNode *U : VMov->users()) {
3336 if (U->getOpcode() != ISD::CopyToReg)
3337 return false;
3338 Copies.insert(U);
3339 }
3340 if (Copies.size() > 2)
3341 return false;
3342
3343 for (SDNode *U : VMov->users()) {
3344 SDValue UseChain = U->getOperand(0);
3345 if (Copies.count(UseChain.getNode()))
3346 // Second CopyToReg
3347 Copy = U;
3348 else {
3349 // We are at the top of this chain.
3350 // If the copy has a glue operand, we conservatively assume it
3351 // isn't safe to perform a tail call.
3352 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3353 return false;
3354 // First CopyToReg
3355 TCChain = UseChain;
3356 }
3357 }
3358 } else if (Copy->getOpcode() == ISD::BITCAST) {
3359 // f32 returned in a single GPR.
3360 if (!Copy->hasOneUse())
3361 return false;
3362 Copy = *Copy->user_begin();
3363 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3364 return false;
3365 // If the copy has a glue operand, we conservatively assume it isn't safe to
3366 // perform a tail call.
3367 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3368 return false;
3369 TCChain = Copy->getOperand(0);
3370 } else {
3371 return false;
3372 }
3373
3374 bool HasRet = false;
3375 for (const SDNode *U : Copy->users()) {
3376 if (U->getOpcode() != ARMISD::RET_GLUE &&
3377 U->getOpcode() != ARMISD::INTRET_GLUE)
3378 return false;
3379 HasRet = true;
3380 }
3381
3382 if (!HasRet)
3383 return false;
3384
3385 Chain = TCChain;
3386 return true;
3387}
3388
3389bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3390 if (!Subtarget->supportsTailCall())
3391 return false;
3392
3393 if (!CI->isTailCall())
3394 return false;
3395
3396 return true;
3397}
3398
3399 // Trying to write a 64-bit value, so we need to split it into two 32-bit
3400 // values first, and pass the low and high parts through.
3401 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3402 SDLoc DL(Op);
3403 SDValue WriteValue = Op->getOperand(2);
3404
3405 // This function is only supposed to be called for i64 type argument.
3406 assert(WriteValue.getValueType() == MVT::i64
3407 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3408
3409 SDValue Lo, Hi;
3410 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3411 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3412 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3413}
3414
3415// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3416// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3417 // one of the above-mentioned nodes. It has to be wrapped because otherwise
3418 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3419 // be used to form an addressing mode. These wrapped nodes will be selected
3420// into MOVi.
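// (Illustrative flow, roughly: a ConstantPool node becomes
// ARMISD::Wrapper(TargetConstantPool), which is then selected into a move of
// the address or a literal-pool load such as "ldr r0, .LCPI0_0", depending on
// the subtarget.)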
3421SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3422 SelectionDAG &DAG) const {
3423 EVT PtrVT = Op.getValueType();
3424 // FIXME there is no actual debug info here
3425 SDLoc dl(Op);
3426 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3427 SDValue Res;
3428
3429 // When generating execute-only code Constant Pools must be promoted to the
3430 // global data section. It's a bit ugly that we can't share them across basic
3431 // blocks, but this way we guarantee that execute-only behaves correctly with
3432 // position-independent addressing modes.
3433 if (Subtarget->genExecuteOnly()) {
3434 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3435 auto *T = CP->getType();
3436 auto C = const_cast<Constant*>(CP->getConstVal());
3437 auto M = DAG.getMachineFunction().getFunction().getParent();
3438 auto GV = new GlobalVariable(
3439 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3440 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3441 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3442 Twine(AFI->createPICLabelUId())
3443 );
3444 SDValue GA = DAG.getGlobalAddress(GV,
3445 dl, PtrVT);
3446 return LowerGlobalAddress(GA, DAG);
3447 }
3448
3449 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3450 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3451 Align CPAlign = CP->getAlign();
3452 if (Subtarget->isThumb1Only())
3453 CPAlign = std::max(CPAlign, Align(4));
3454 if (CP->isMachineConstantPoolEntry())
3455 Res =
3456 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3457 else
3458 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3459 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3460}
3461
3462 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3463 // If we don't have a 32-bit pc-relative branch instruction then the jump
3464 // table consists of block addresses. Usually this is inline, but for
3465 // execute-only it must be placed out-of-line.
3466 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3467 return MachineJumpTableInfo::EK_BlockAddress;
3468 return MachineJumpTableInfo::EK_Inline;
3469}
3470
3471SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3472 SelectionDAG &DAG) const {
3473 MachineFunction &MF = DAG.getMachineFunction();
3474 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3475 unsigned ARMPCLabelIndex = 0;
3476 SDLoc DL(Op);
3477 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3478 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3479 SDValue CPAddr;
3480 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3481 if (!IsPositionIndependent) {
3482 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3483 } else {
3484 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3485 ARMPCLabelIndex = AFI->createPICLabelUId();
3486 ARMConstantPoolValue *CPV =
3487 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3488 ARMCP::CPBlockAddress, PCAdj);
3489 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3490 }
3491 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3492 SDValue Result = DAG.getLoad(
3493 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3494 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3495 if (!IsPositionIndependent)
3496 return Result;
3497 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3498 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3499}
3500
3501/// Convert a TLS address reference into the correct sequence of loads
3502/// and calls to compute the variable's address for Darwin, and return an
3503/// SDValue containing the final node.
3504
3505/// Darwin only has one TLS scheme which must be capable of dealing with the
3506/// fully general situation, in the worst case. This means:
3507/// + "extern __thread" declaration.
3508/// + Defined in a possibly unknown dynamic library.
3509///
3510/// The general system is that each __thread variable has a [3 x i32] descriptor
3511/// which contains information used by the runtime to calculate the address. The
3512/// only part of this the compiler needs to know about is the first word, which
3513/// contains a function pointer that must be called with the address of the
3514/// entire descriptor in "r0".
3515///
3516/// Since this descriptor may be in a different unit, in general access must
3517/// proceed along the usual ARM rules. A common sequence to produce is:
3518///
3519/// movw rT1, :lower16:_var$non_lazy_ptr
3520/// movt rT1, :upper16:_var$non_lazy_ptr
3521/// ldr r0, [rT1]
3522/// ldr rT2, [r0]
3523/// blx rT2
3524/// [...address now in r0...]
3525SDValue
3526ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3527 SelectionDAG &DAG) const {
3528 assert(Subtarget->isTargetDarwin() &&
3529 "This function expects a Darwin target");
3530 SDLoc DL(Op);
3531
3532 // First step is to get the address of the actual global symbol. This is where
3533 // the TLS descriptor lives.
3534 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3535
3536 // The first entry in the descriptor is a function pointer that we must call
3537 // to obtain the address of the variable.
3538 SDValue Chain = DAG.getEntryNode();
3539 SDValue FuncTLVGet = DAG.getLoad(
3540 MVT::i32, DL, Chain, DescAddr,
3544 Chain = FuncTLVGet.getValue(1);
3545
3546 MachineFunction &F = DAG.getMachineFunction();
3547 MachineFrameInfo &MFI = F.getFrameInfo();
3548 MFI.setAdjustsStack(true);
3549
3550 // TLS calls preserve all registers except those that absolutely must be
3551 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3552 // silly).
3553 auto TRI =
3554 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3555 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3556 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3557
3558 // Finally, we can make the call. This is just a degenerate version of a
3559 // normal ARM call node: r0 takes the address of the descriptor, and
3560 // returns the address of the variable in this thread.
3561 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3562 Chain =
3563 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3564 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3565 DAG.getRegisterMask(Mask), Chain.getValue(1));
3566 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3567}
3568
3569SDValue
3570ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3571 SelectionDAG &DAG) const {
3572 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3573
3574 SDValue Chain = DAG.getEntryNode();
3575 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3576 SDLoc DL(Op);
3577
3578 // Load the current TEB (thread environment block)
3579 SDValue Ops[] = {Chain,
3580 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3581 DAG.getTargetConstant(15, DL, MVT::i32),
3582 DAG.getTargetConstant(0, DL, MVT::i32),
3583 DAG.getTargetConstant(13, DL, MVT::i32),
3584 DAG.getTargetConstant(0, DL, MVT::i32),
3585 DAG.getTargetConstant(2, DL, MVT::i32)};
3586 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3587 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3588
3589 SDValue TEB = CurrentTEB.getValue(0);
3590 Chain = CurrentTEB.getValue(1);
3591
3592 // Load the ThreadLocalStoragePointer from the TEB
3593 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3594 SDValue TLSArray =
3595 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3596 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3597
3598 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3599 // offset into the TLSArray.
3600
3601 // Load the TLS index from the C runtime
3602 SDValue TLSIndex =
3603 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3604 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3605 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3606
3607 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3608 DAG.getConstant(2, DL, MVT::i32));
3609 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3610 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3611 MachinePointerInfo());
3612
3613 // Get the offset of the start of the .tls section (section base)
3614 const auto *GA = cast<GlobalAddressSDNode>(Op);
3615 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3616 SDValue Offset = DAG.getLoad(
3617 PtrVT, DL, Chain,
3618 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3619 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3620 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3621
3622 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3623}
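// For illustration, the DAG built above corresponds roughly to this sequence
// (register names arbitrary; the exact code depends on the subtarget):
//   mrc   p15, 0, rT, c13, c0, 2    @ TEB
//   ldr   rA, [rT, #0x2c]           @ ThreadLocalStoragePointer
//   movw/movt rI, _tls_index ; ldr rI, [rI]
//   ldr   rS, [rA, rI, lsl #2]      @ this module's TLS block
//   add   rD, rS, rOff              @ rOff = SECREL offset from constant pool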
3624
3625// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3626SDValue
3627ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3628 SelectionDAG &DAG) const {
3629 SDLoc dl(GA);
3630 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3631 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3632 MachineFunction &MF = DAG.getMachineFunction();
3633 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3634 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3635 ARMConstantPoolValue *CPV =
3636 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3637 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3638 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3639 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3640 Argument = DAG.getLoad(
3641 PtrVT, dl, DAG.getEntryNode(), Argument,
3642 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3643 SDValue Chain = Argument.getValue(1);
3644
3645 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3646 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3647
3648 // call __tls_get_addr.
3649 ArgListTy Args;
3650 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3651
3652 // FIXME: is there useful debug info available here?
3653 TargetLowering::CallLoweringInfo CLI(DAG);
3654 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3655 CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
3656 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3657
3658 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3659 return CallResult.first;
3660}
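// For illustration, the general-dynamic lowering above becomes a call such as:
//   ldr r0, .LCPI_n          @ TLSGD constant-pool entry (PC-adjusted)
// .LPC_m:
//   add r0, pc, r0
//   bl  __tls_get_addr       @ variable's address returned in r0
// Label names and scheduling here are only an example.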
3661
3662// Lower ISD::GlobalTLSAddress using the "initial exec" or
3663// "local exec" model.
3664SDValue
3665ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3666 SelectionDAG &DAG,
3667 TLSModel::Model model) const {
3668 const GlobalValue *GV = GA->getGlobal();
3669 SDLoc dl(GA);
3670 SDValue Offset;
3671 SDValue Chain = DAG.getEntryNode();
3672 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3673 // Get the Thread Pointer
3674 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3675
3676 if (model == TLSModel::InitialExec) {
3677 MachineFunction &MF = DAG.getMachineFunction();
3678 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3679 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3680 // Initial exec model.
3681 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3682 ARMConstantPoolValue *CPV =
3683 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3684 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
3685 true);
3686 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3687 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3688 Offset = DAG.getLoad(
3689 PtrVT, dl, Chain, Offset,
3690 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3691 Chain = Offset.getValue(1);
3692
3693 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3694 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3695
3696 Offset = DAG.getLoad(
3697 PtrVT, dl, Chain, Offset,
3698 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3699 } else {
3700 // local exec model
3701 assert(model == TLSModel::LocalExec);
3702 ARMConstantPoolValue *CPV =
3703 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
3704 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3705 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3706 Offset = DAG.getLoad(
3707 PtrVT, dl, Chain, Offset,
3708 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3709 }
3710
3711 // The address of the thread local variable is the add of the thread
3712 // pointer with the offset of the variable.
3713 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3714}
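// For illustration, the two exec models above reduce to
//   initial-exec: addr = thread-pointer + load(GOTTPOFF constant-pool entry)
//   local-exec:   addr = thread-pointer + TPOFF constant
// where the thread pointer is read via ARMISD::THREAD_POINTER (typically an
// MRC of TPIDRURO on Linux targets). This is only a sketch of the effect.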
3715
3716SDValue
3717ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3718 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3719 if (DAG.getTarget().useEmulatedTLS())
3720 return LowerToTLSEmulatedModel(GA, DAG);
3721
3722 if (Subtarget->isTargetDarwin())
3723 return LowerGlobalTLSAddressDarwin(Op, DAG);
3724
3725 if (Subtarget->isTargetWindows())
3726 return LowerGlobalTLSAddressWindows(Op, DAG);
3727
3728 // TODO: implement the "local dynamic" model
3729 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3730 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3731
3732 switch (model) {
3733 case TLSModel::GeneralDynamic:
3734 case TLSModel::LocalDynamic:
3735 return LowerToTLSGeneralDynamicModel(GA, DAG);
3736 case TLSModel::InitialExec:
3737 case TLSModel::LocalExec:
3738 return LowerToTLSExecModels(GA, DAG, model);
3739 }
3740 llvm_unreachable("bogus TLS model");
3741}
3742
3743/// Return true if all users of V are within function F, looking through
3744/// ConstantExprs.
3745static bool allUsersAreInFunction(const Value *V, const Function *F) {
3746 SmallVector<const User*,4> Worklist(V->users());
3747 while (!Worklist.empty()) {
3748 auto *U = Worklist.pop_back_val();
3749 if (isa<ConstantExpr>(U)) {
3750 append_range(Worklist, U->users());
3751 continue;
3752 }
3753
3754 auto *I = dyn_cast<Instruction>(U);
3755 if (!I || I->getParent()->getParent() != F)
3756 return false;
3757 }
3758 return true;
3759}
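// For illustration, this walks through ConstantExprs, so a use such as
//   getelementptr ([6 x i8], [6 x i8]* @g, i32 0, i32 0)
// inside F still counts as being in F, while a user that is not an
// Instruction at all (e.g. another global's initializer) makes it return false.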
3760
3761 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3762 const GlobalValue *GV, SelectionDAG &DAG,
3763 EVT PtrVT, const SDLoc &dl) {
3764 // If we're creating a pool entry for a constant global with unnamed address,
3765 // and the global is small enough, we can emit it inline into the constant pool
3766 // to save ourselves an indirection.
3767 //
3768 // This is a win if the constant is only used in one function (so it doesn't
3769 // need to be duplicated) or duplicating the constant wouldn't increase code
3770 // size (implying the constant is no larger than 4 bytes).
3771 const Function &F = DAG.getMachineFunction().getFunction();
3772
3773 // We rely on this decision to inline being idempotent and unrelated to the
3774 // use-site. We know that if we inline a variable at one use site, we'll
3775 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3776 // doesn't know about this optimization, so bail out if it's enabled, else
3777 // we could decide to inline here (and thus never emit the GV) but require
3778 // the GV from fast-isel generated code.
3781 return SDValue();
3782
3783 auto *GVar = dyn_cast<GlobalVariable>(GV);
3784 if (!GVar || !GVar->hasInitializer() ||
3785 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3786 !GVar->hasLocalLinkage())
3787 return SDValue();
3788
3789 // If we inline a value that contains relocations, we move the relocations
3790 // from .data to .text. This is not allowed in position-independent code.
3791 auto *Init = GVar->getInitializer();
3792 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3793 Init->needsDynamicRelocation())
3794 return SDValue();
3795
3796 // The constant islands pass can only really deal with alignment requests
3797 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3798 // any type wanting greater alignment requirements than 4 bytes. We also
3799 // can only promote constants that are multiples of 4 bytes in size or
3800 // are paddable to a multiple of 4. Currently we only try and pad constants
3801 // that are strings for simplicity.
3802 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3803 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3804 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3805 unsigned RequiredPadding = 4 - (Size % 4);
3806 bool PaddingPossible =
3807 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3808 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3809 Size == 0)
3810 return SDValue();
3811
3812 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3813 MachineFunction &MF = DAG.getMachineFunction();
3814 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3815
3816 // We can't bloat the constant pool too much, else the ConstantIslands pass
3817 // may fail to converge. If we haven't promoted this global yet (it may have
3818 // multiple uses), and promoting it would increase the constant pool size (Size
3819 // > 4), ensure we have space to do so up to MaxTotal.
3820 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3821 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3822 ConstpoolPromotionMaxTotal)
3823 return SDValue();
3824
3825 // This is only valid if all users are in a single function; we can't clone
3826 // the constant in general. The LLVM IR unnamed_addr allows merging
3827 // constants, but not cloning them.
3828 //
3829 // We could potentially allow cloning if we could prove all uses of the
3830 // constant in the current function don't care about the address, like
3831 // printf format strings. But that isn't implemented for now.
3832 if (!allUsersAreInFunction(GVar, &F))
3833 return SDValue();
3834
3835 // We're going to inline this global. Pad it out if needed.
3836 if (RequiredPadding != 4) {
3837 StringRef S = CDAInit->getAsString();
3838
3839 SmallVector<uint8_t,16> V(S.size());
3840 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3841 while (RequiredPadding--)
3842 V.push_back(0);
3843 Init = ConstantDataArray::get(*DAG.getContext(), V);
3844 }
3845
3846 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3847 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3848 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3849 AFI->markGlobalAsPromotedToConstantPool(GVar);
3850 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3851 PaddedSize - 4);
3852 }
3853 ++NumConstpoolPromoted;
3854 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3855}
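// As a small worked example of the promotion above (illustrative only): a
// local string such as
//   static const char msg[6] = "hello";   // 6 bytes, one user, unnamed_addr
// gets padded to 8 bytes and emitted straight into the constant pool, so its
// user reads it with a single pc-relative load instead of loading the address
// of a separate global first.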
3856
3857 static bool isReadOnly(const GlobalValue *GV) {
3858 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3859 if (!(GV = GA->getAliaseeObject()))
3860 return false;
3861 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3862 return V->isConstant();
3863 return isa<Function>(GV);
3864}
3865
3866SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3867 SelectionDAG &DAG) const {
3868 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3869 default: llvm_unreachable("unknown object format");
3870 case Triple::COFF:
3871 return LowerGlobalAddressWindows(Op, DAG);
3872 case Triple::ELF:
3873 return LowerGlobalAddressELF(Op, DAG);
3874 case Triple::MachO:
3875 return LowerGlobalAddressDarwin(Op, DAG);
3876 }
3877}
3878
3879SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3880 SelectionDAG &DAG) const {
3881 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3882 SDLoc dl(Op);
3883 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3884 bool IsRO = isReadOnly(GV);
3885
3886 // promoteToConstantPool only if not generating XO text section
3887 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3888 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3889 return V;
3890
3891 if (isPositionIndependent()) {
3892 SDValue G = DAG.getTargetGlobalAddress(
3893 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3894 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3895 if (!GV->isDSOLocal())
3896 Result =
3897 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3898 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3899 return Result;
3900 } else if (Subtarget->isROPI() && IsRO) {
3901 // PC-relative.
3902 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3903 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3904 return Result;
3905 } else if (Subtarget->isRWPI() && !IsRO) {
3906 // SB-relative.
3907 SDValue RelAddr;
3908 if (Subtarget->useMovt()) {
3909 ++NumMovwMovt;
3910 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3911 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3912 } else { // use literal pool for address constant
3913 ARMConstantPoolValue *CPV =
3914 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3915 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3916 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3917 RelAddr = DAG.getLoad(
3918 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3919 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3920 }
3921 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3922 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3923 return Result;
3924 }
3925
3926 // If we have T2 ops, we can materialize the address directly via movt/movw
3927 // pair. This is always cheaper. If we need to generate execute-only code, and we
3928 // only have Thumb1 available, we can't use a constant pool and are forced to
3929 // use immediate relocations.
3930 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3931 if (Subtarget->useMovt())
3932 ++NumMovwMovt;
3933 // FIXME: Once remat is capable of dealing with instructions with register
3934 // operands, expand this into two nodes.
3935 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3936 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3937 } else {
3938 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3939 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3940 return DAG.getLoad(
3941 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3942 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3943 }
3944}
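// For illustration, the main cases above correspond roughly to:
//   movw/movt:     movw rX, :lower16:g ; movt rX, :upper16:g
//   literal pool:  ldr rX, .LCPI_n        @ .LCPI_n: .long g
//   PIC non-local: pc-relative address of the GOT slot plus a GOT load.
// Registers and labels are arbitrary examples.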
3945
3946SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3947 SelectionDAG &DAG) const {
3948 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3949 "ROPI/RWPI not currently supported for Darwin");
3950 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3951 SDLoc dl(Op);
3952 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3953
3954 if (Subtarget->useMovt())
3955 ++NumMovwMovt;
3956
3957 // FIXME: Once remat is capable of dealing with instructions with register
3958 // operands, expand this into multiple nodes
3959 unsigned Wrapper =
3960 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3961
3962 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3963 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3964
3965 if (Subtarget->isGVIndirectSymbol(GV))
3966 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3967 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3968 return Result;
3969}
3970
3971SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3972 SelectionDAG &DAG) const {
3973 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3974 assert(Subtarget->useMovt() &&
3975 "Windows on ARM expects to use movw/movt");
3976 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3977 "ROPI/RWPI not currently supported for Windows");
3978
3979 const TargetMachine &TM = getTargetMachine();
3980 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3981 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3982 if (GV->hasDLLImportStorageClass())
3983 TargetFlags = ARMII::MO_DLLIMPORT;
3984 else if (!TM.shouldAssumeDSOLocal(GV))
3985 TargetFlags = ARMII::MO_COFFSTUB;
3986 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3987 SDValue Result;
3988 SDLoc DL(Op);
3989
3990 ++NumMovwMovt;
3991
3992 // FIXME: Once remat is capable of dealing with instructions with register
3993 // operands, expand this into two nodes.
3994 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3995 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3996 TargetFlags));
3997 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3998 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3999 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
4000 return Result;
4001}
4002
4003SDValue
4004ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
4005 SDLoc dl(Op);
4006 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
4007 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
4008 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
4009 Op.getOperand(1), Val);
4010}
4011
4012SDValue
4013ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
4014 SDLoc dl(Op);
4015 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
4016 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
4017}
4018
4019SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
4020 SelectionDAG &DAG) const {
4021 SDLoc dl(Op);
4022 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
4023 Op.getOperand(0));
4024}
4025
4026SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
4027 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
4028 unsigned IntNo =
4029 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4030 switch (IntNo) {
4031 default:
4032 return SDValue(); // Don't custom lower most intrinsics.
4033 case Intrinsic::arm_gnu_eabi_mcount: {
4034 MachineFunction &MF = DAG.getMachineFunction();
4035 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4036 SDLoc dl(Op);
4037 SDValue Chain = Op.getOperand(0);
4038 // call "\01__gnu_mcount_nc"
4039 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4040 const uint32_t *Mask =
4041 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
4042 assert(Mask && "Missing call preserved mask for calling convention");
4043 // Mark LR an implicit live-in.
4044 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4045 SDValue ReturnAddress =
4046 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
4047 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4048 SDValue Callee =
4049 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
4050 SDValue RegisterMask = DAG.getRegisterMask(Mask);
4051 if (Subtarget->isThumb())
4052 return SDValue(
4053 DAG.getMachineNode(
4054 ARM::tBL_PUSHLR, dl, ResultTys,
4055 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4056 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4057 0);
4058 return SDValue(
4059 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4060 {ReturnAddress, Callee, RegisterMask, Chain}),
4061 0);
4062 }
4063 }
4064}
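// For illustration, the mcount lowering above models the "push the return
// address, then call" convention, roughly:
//   push {lr}
//   bl   __gnu_mcount_nc
// which is the effect the BL_PUSHLR / tBL_PUSHLR pseudo-instructions represent.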
4065
4066SDValue
4067ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4068 const ARMSubtarget *Subtarget) const {
4069 unsigned IntNo = Op.getConstantOperandVal(0);
4070 SDLoc dl(Op);
4071 switch (IntNo) {
4072 default: return SDValue(); // Don't custom lower most intrinsics.
4073 case Intrinsic::thread_pointer: {
4074 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4075 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
4076 }
4077 case Intrinsic::arm_cls: {
4078 const SDValue &Operand = Op.getOperand(1);
4079 const EVT VTy = Op.getValueType();
4080 SDValue SRA =
4081 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
4082 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
4083 SDValue SHL =
4084 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
4085 SDValue OR =
4086 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
4087 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
4088 return Result;
4089 }
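// Worked example for the expansion above (illustrative): for x = 0xFFFFFFF0,
// x >> 31 (arithmetic) is 0xFFFFFFFF, the xor gives 0x0000000F, shifting left
// by one and or'ing in 1 gives 0x0000001F, and ctlz(0x1F) = 27 = cls(x).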
4090 case Intrinsic::arm_cls64: {
4091 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4092 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
4093 const SDValue &Operand = Op.getOperand(1);
4094 const EVT VTy = Op.getValueType();
4095 SDValue Lo, Hi;
4096 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
4097 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
4098 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
4099 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
4100 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
4101 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
4102 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
4103 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
4104 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
4105 SDValue CheckLo =
4106 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4107 SDValue HiIsZero =
4108 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4109 SDValue AdjustedLo =
4110 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
4111 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
4112 SDValue Result =
4113 DAG.getSelect(dl, VTy, CheckLo,
4114 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
4115 return Result;
4116 }
4117 case Intrinsic::eh_sjlj_lsda: {
4118 MachineFunction &MF = DAG.getMachineFunction();
4119 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4120 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4121 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4122 SDValue CPAddr;
4123 bool IsPositionIndependent = isPositionIndependent();
4124 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4125 ARMConstantPoolValue *CPV =
4126 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
4127 ARMCP::CPLSDA, PCAdj);
4128 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
4129 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4130 SDValue Result = DAG.getLoad(
4131 PtrVT, dl, DAG.getEntryNode(), CPAddr,
4132 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
4133
4134 if (IsPositionIndependent) {
4135 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4136 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
4137 }
4138 return Result;
4139 }
4140 case Intrinsic::arm_neon_vabs:
4141 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
4142 Op.getOperand(1));
4143 case Intrinsic::arm_neon_vabds:
4144 if (Op.getValueType().isInteger())
4145 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
4146 Op.getOperand(1), Op.getOperand(2));
4147 return SDValue();
4148 case Intrinsic::arm_neon_vabdu:
4149 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
4150 Op.getOperand(1), Op.getOperand(2));
4151 case Intrinsic::arm_neon_vmulls:
4152 case Intrinsic::arm_neon_vmullu: {
4153 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4154 ? ARMISD::VMULLs : ARMISD::VMULLu;
4155 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4156 Op.getOperand(1), Op.getOperand(2));
4157 }
4158 case Intrinsic::arm_neon_vminnm:
4159 case Intrinsic::arm_neon_vmaxnm: {
4160 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4161 ? ISD::FMINNUM : ISD::FMAXNUM;
4162 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4163 Op.getOperand(1), Op.getOperand(2));
4164 }
4165 case Intrinsic::arm_neon_vminu:
4166 case Intrinsic::arm_neon_vmaxu: {
4167 if (Op.getValueType().isFloatingPoint())
4168 return SDValue();
4169 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4170 ? ISD::UMIN : ISD::UMAX;
4171 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4172 Op.getOperand(1), Op.getOperand(2));
4173 }
4174 case Intrinsic::arm_neon_vmins:
4175 case Intrinsic::arm_neon_vmaxs: {
4176 // v{min,max}s is overloaded between signed integers and floats.
4177 if (!Op.getValueType().isFloatingPoint()) {
4178 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4179 ? ISD::SMIN : ISD::SMAX;
4180 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4181 Op.getOperand(1), Op.getOperand(2));
4182 }
4183 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4184 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4185 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4186 Op.getOperand(1), Op.getOperand(2));
4187 }
4188 case Intrinsic::arm_neon_vtbl1:
4189 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4190 Op.getOperand(1), Op.getOperand(2));
4191 case Intrinsic::arm_neon_vtbl2:
4192 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4193 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4194 case Intrinsic::arm_mve_pred_i2v:
4195 case Intrinsic::arm_mve_pred_v2i:
4196 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4197 Op.getOperand(1));
4198 case Intrinsic::arm_mve_vreinterpretq:
4199 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4200 Op.getOperand(1));
4201 case Intrinsic::arm_mve_lsll:
4202 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4203 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4204 case Intrinsic::arm_mve_asrl:
4205 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4206 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4207 }
4208}
4209
4210 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
4211 const ARMSubtarget *Subtarget) {
4212 SDLoc dl(Op);
4213 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4214 if (SSID == SyncScope::SingleThread)
4215 return Op;
4216
4217 if (!Subtarget->hasDataBarrier()) {
4218 // Some ARMv6 cpus can support data barriers with an mcr instruction.
4219 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4220 // here.
4221 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4222 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4223 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4224 DAG.getConstant(0, dl, MVT::i32));
4225 }
4226
4227 AtomicOrdering Ord =
4228 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4229 ARM_MB::MemBOpt Domain = ARM_MB::ISH;
4230 if (Subtarget->isMClass()) {
4231 // Only a full system barrier exists in the M-class architectures.
4232 Domain = ARM_MB::SY;
4233 } else if (Subtarget->preferISHSTBarriers() &&
4234 Ord == AtomicOrdering::Release) {
4235 // Swift happens to implement ISHST barriers in a way that's compatible with
4236 // Release semantics but weaker than ISH so we'd be fools not to use
4237 // it. Beware: other processors probably don't!
4238 Domain = ARM_MB::ISHST;
4239 }
4240
4241 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4242 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4243 DAG.getConstant(Domain, dl, MVT::i32));
4244}
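// For illustration (exact encoding depends on the subtarget), this typically
// becomes a single barrier instruction:
//   fence seq_cst                 -> dmb ish   (dmb sy on M-class)
//   fence release on Swift cores  -> dmb ishst
// Single-threaded fences are returned unchanged above and need no barrier.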
4245
4246 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
4247 const ARMSubtarget *Subtarget) {
4248 // ARM before v5TE and Thumb1 do not have preload instructions.
4249 if (!(Subtarget->isThumb2() ||
4250 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4251 // Just preserve the chain.
4252 return Op.getOperand(0);
4253
4254 SDLoc dl(Op);
4255 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4256 if (!isRead &&
4257 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4258 // ARMv7 with MP extension has PLDW.
4259 return Op.getOperand(0);
4260
4261 unsigned isData = Op.getConstantOperandVal(4);
4262 if (Subtarget->isThumb()) {
4263 // Invert the bits.
4264 isRead = ~isRead & 1;
4265 isData = ~isData & 1;
4266 }
4267
4268 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4269 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4270 DAG.getConstant(isData, dl, MVT::i32));
4271}
4272
4273 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4274 MachineFunction &MF = DAG.getMachineFunction();
4275 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4276
4277 // vastart just stores the address of the VarArgsFrameIndex slot into the
4278 // memory location argument.
4279 SDLoc dl(Op);
4280 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
4281 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4282 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4283 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4284 MachinePointerInfo(SV));
4285}
4286
4287SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4288 CCValAssign &NextVA,
4289 SDValue &Root,
4290 SelectionDAG &DAG,
4291 const SDLoc &dl) const {
4292 MachineFunction &MF = DAG.getMachineFunction();
4293 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4294
4295 const TargetRegisterClass *RC;
4296 if (AFI->isThumb1OnlyFunction())
4297 RC = &ARM::tGPRRegClass;
4298 else
4299 RC = &ARM::GPRRegClass;
4300
4301 // Transform the arguments stored in physical registers into virtual ones.
4302 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4303 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4304
4305 SDValue ArgValue2;
4306 if (NextVA.isMemLoc()) {
4307 MachineFrameInfo &MFI = MF.getFrameInfo();
4308 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4309
4310 // Create load node to retrieve arguments from the stack.
4311 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4312 ArgValue2 = DAG.getLoad(
4313 MVT::i32, dl, Root, FIN,
4314 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4315 } else {
4316 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4317 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4318 }
4319 if (!Subtarget->isLittle())
4320 std::swap (ArgValue, ArgValue2);
4321 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4322}
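// For illustration, with the soft-float AAPCS an f64 formal argument split as
// r2+r3 (or r3 plus one stack word) is rebuilt here with ARMISD::VMOVDRR, with
// the two halves swapped first on big-endian targets. The register assignment
// is only an example; it is whatever the calling convention picked.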
4323
4324// The remaining GPRs hold either the beginning of variable-argument
4325// data, or the beginning of an aggregate passed by value (usually
4326// byval). Either way, we allocate stack slots adjacent to the data
4327// provided by our caller, and store the unallocated registers there.
4328// If this is a variadic function, the va_list pointer will begin with
4329// these values; otherwise, this reassembles a (byval) structure that
4330// was split between registers and memory.
4331// Return: The frame index registers were stored into.
4332int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4333 const SDLoc &dl, SDValue &Chain,
4334 const Value *OrigArg,
4335 unsigned InRegsParamRecordIdx,
4336 int ArgOffset, unsigned ArgSize) const {
4337 // Currently, two use-cases are possible:
4338 // Case #1. Non-var-args function, and we meet first byval parameter.
4339 // Setup first unallocated register as first byval register;
4340 // eat all remaining registers
4341 // (these two actions are performed by HandleByVal method).
4342 // Then, here, we initialize stack frame with
4343 // "store-reg" instructions.
4344 // Case #2. Var-args function, that doesn't contain byval parameters.
4345 // The same: eat all remaining unallocated registers,
4346 // initialize stack frame.
4347
4348 MachineFunction &MF = DAG.getMachineFunction();
4349 MachineFrameInfo &MFI = MF.getFrameInfo();
4350 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4351 unsigned RBegin, REnd;
4352 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4353 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4354 } else {
4355 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4356 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4357 REnd = ARM::R4;
4358 }
4359
4360 if (REnd != RBegin)
4361 ArgOffset = -4 * (ARM::R4 - RBegin);
4362
4363 auto PtrVT = getPointerTy(DAG.getDataLayout());
4364 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4365 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4366
4367 SmallVector<SDValue, 4> MemOps;
4368 const TargetRegisterClass *RC =
4369 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4370
4371 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4372 Register VReg = MF.addLiveIn(Reg, RC);
4373 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4374 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4375 MachinePointerInfo(OrigArg, 4 * i));
4376 MemOps.push_back(Store);
4377 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4378 }
4379
4380 if (!MemOps.empty())
4381 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4382 return FrameIndex;
4383}
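// For illustration (register choice depends on the calling convention): in a
// variadic function whose last named argument was assigned r1, the unallocated
// GPRs r2 and r3 are stored to a fixed stack object created just below the
// incoming stack arguments, so va_arg can walk r2, r3 and then the caller's
// stack area as one contiguous block.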
4384
4385 // Set up the stack frame that the va_list pointer will start from.
4386void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4387 const SDLoc &dl, SDValue &Chain,
4388 unsigned ArgOffset,
4389 unsigned TotalArgRegsSaveSize,
4390 bool ForceMutable) const {
4391 MachineFunction &MF = DAG.getMachineFunction();
4392 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4393
4394 // Try to store any remaining integer argument regs
4395 // to their spots on the stack so that they may be loaded by dereferencing
4396 // the result of va_next.
4397 // If there are no regs to be stored, just point the address after the last
4398 // argument passed via stack.
4399 int FrameIndex = StoreByValRegs(
4400 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4401 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4402 AFI->setVarArgsFrameIndex(FrameIndex);
4403}
4404
4405bool ARMTargetLowering::splitValueIntoRegisterParts(
4406 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4407 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4408 EVT ValueVT = Val.getValueType();
4409 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4410 unsigned ValueBits = ValueVT.getSizeInBits();
4411 unsigned PartBits = PartVT.getSizeInBits();
4412 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4413 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4414 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4415 Parts[0] = Val;
4416 return true;
4417 }
4418 return false;
4419}
4420
4421SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4422 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4423 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4424 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4425 unsigned ValueBits = ValueVT.getSizeInBits();
4426 unsigned PartBits = PartVT.getSizeInBits();
4427 SDValue Val = Parts[0];
4428
4429 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4430 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4431 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4432 return Val;
4433 }
4434 return SDValue();
4435}
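// For illustration, these two hooks implement the "f16 passed in an f32
// register" ABI: the half value is bitcast to i16, any-extended to i32 and
// bitcast to f32 when split into parts, and the reverse (f32 -> i32 ->
// truncate to i16 -> f16) when the parts are joined; only the low 16 bits of
// the f32 part carry the value.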
4436
4437SDValue ARMTargetLowering::LowerFormalArguments(
4438 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4439 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4440 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4441 MachineFunction &MF = DAG.getMachineFunction();
4442 MachineFrameInfo &MFI = MF.getFrameInfo();
4443
4444 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4445
4446 // Assign locations to all of the incoming arguments.
4447 SmallVector<CCValAssign, 16> ArgLocs;
4448 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4449 *DAG.getContext());
4450 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4451
4452 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
4453 unsigned CurArgIdx = 0;
4454
4455 // Initially ArgRegsSaveSize is zero.
4456 // Then we increase this value each time we meet byval parameter.
4457 // We also increase this value in case of varargs function.
4458 AFI->setArgRegsSaveSize(0);
4459
4460 // Calculate the amount of stack space that we need to allocate to store
4461 // byval and variadic arguments that are passed in registers.
4462 // We need to know this before we allocate the first byval or variadic
4463 // argument, as they will be allocated a stack slot below the CFA (Canonical
4464 // Frame Address, the stack pointer at entry to the function).
4465 unsigned ArgRegBegin = ARM::R4;
4466 for (const CCValAssign &VA : ArgLocs) {
4467 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4468 break;
4469
4470 unsigned Index = VA.getValNo();
4471 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4472 if (!Flags.isByVal())
4473 continue;
4474
4475 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4476 unsigned RBegin, REnd;
4477 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4478 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4479
4480 CCInfo.nextInRegsParam();
4481 }
4482 CCInfo.rewindByValRegsInfo();
4483
4484 int lastInsIndex = -1;
4485 if (isVarArg && MFI.hasVAStart()) {
4486 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4487 if (RegIdx != std::size(GPRArgRegs))
4488 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4489 }
4490
4491 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4492 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4493 auto PtrVT = getPointerTy(DAG.getDataLayout());
4494
4495 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4496 CCValAssign &VA = ArgLocs[i];
4497 if (Ins[VA.getValNo()].isOrigArg()) {
4498 std::advance(CurOrigArg,
4499 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4500 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4501 }
4502 // Arguments stored in registers.
4503 if (VA.isRegLoc()) {
4504 EVT RegVT = VA.getLocVT();
4505 SDValue ArgValue;
4506
4507 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4508 // f64 and vector types are split up into multiple registers or
4509 // combinations of registers and stack slots.
4510 SDValue ArgValue1 =
4511 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4512 VA = ArgLocs[++i]; // skip ahead to next loc
4513 SDValue ArgValue2;
4514 if (VA.isMemLoc()) {
4515 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4516 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4517 ArgValue2 = DAG.getLoad(
4518 MVT::f64, dl, Chain, FIN,
4519 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4520 } else {
4521 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4522 }
4523 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4524 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4525 ArgValue1, DAG.getIntPtrConstant(0, dl));
4526 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4527 ArgValue2, DAG.getIntPtrConstant(1, dl));
4528 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4529 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4530 } else {
4531 const TargetRegisterClass *RC;
4532
4533 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4534 RC = &ARM::HPRRegClass;
4535 else if (RegVT == MVT::f32)
4536 RC = &ARM::SPRRegClass;
4537 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4538 RegVT == MVT::v4bf16)
4539 RC = &ARM::DPRRegClass;
4540 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4541 RegVT == MVT::v8bf16)
4542 RC = &ARM::QPRRegClass;
4543 else if (RegVT == MVT::i32)
4544 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4545 : &ARM::GPRRegClass;
4546 else
4547 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4548
4549 // Transform the arguments in physical registers into virtual ones.
4550 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4551 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4552
4553 // If this value is passed in r0 and has the returned attribute (e.g.
4554 // C++ 'structors), record this fact for later use.
4555 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4556 AFI->setPreservesR0();
4557 }
4558 }
4559
4560 // If this is an 8 or 16-bit value, it is really passed promoted
4561 // to 32 bits. Insert an assert[sz]ext to capture this, then
4562 // truncate to the right size.
4563 switch (VA.getLocInfo()) {
4564 default: llvm_unreachable("Unknown loc info!");
4565 case CCValAssign::Full: break;
4566 case CCValAssign::BCvt:
4567 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4568 break;
4569 }
4570
4571 // f16 arguments have their size extended to 4 bytes and are passed as if they
4572 // had been copied to the LSBs of a 32-bit register.
4573 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
4574 if (VA.needsCustom() &&
4575 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4576 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4577
4578 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4579 // less than 32 bits must be sign- or zero-extended in the callee for
4580 // security reasons. Although the ABI mandates an extension done by the
4581 // caller, the latter cannot be trusted to follow the rules of the ABI.
4582 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4583 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4584 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4585 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4586
4587 InVals.push_back(ArgValue);
4588 } else { // VA.isRegLoc()
4589 // Only arguments passed on the stack should make it here.
4590 assert(VA.isMemLoc());
4591 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4592
4593 int index = VA.getValNo();
4594
4595 // Some Ins[] entries become multiple ArgLoc[] entries.
4596 // Process them only once.
4597 if (index != lastInsIndex)
4598 {
4599 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4600 // FIXME: For now, all byval parameter objects are marked mutable.
4601 // This can be changed with more analysis.
4602 // In case of tail call optimization, mark all arguments mutable, since
4603 // they could be overwritten by the lowering of arguments in case of a
4604 // tail call.
4605 if (Flags.isByVal()) {
4606 assert(Ins[index].isOrigArg() &&
4607 "Byval arguments cannot be implicit");
4608 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4609
4610 int FrameIndex = StoreByValRegs(
4611 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4612 VA.getLocMemOffset(), Flags.getByValSize());
4613 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4614 CCInfo.nextInRegsParam();
4615 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4616 VA.getValVT() == MVT::bf16)) {
4617 // f16 and bf16 values are passed in the least-significant half of
4618 // a 4-byte stack slot. This is done as if the extension was done
4619 // in a 32-bit register, so the actual bytes used for the value
4620 // differ between little and big endian.
4621 assert(VA.getLocVT().getSizeInBits() == 32);
4622 unsigned FIOffset = VA.getLocMemOffset();
4623 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4624 FIOffset, true);
4625
4626 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4627 if (DAG.getDataLayout().isBigEndian())
4628 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4629
4630 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4631 MachinePointerInfo::getFixedStack(
4632 DAG.getMachineFunction(), FI)));
4633
4634 } else {
4635 unsigned FIOffset = VA.getLocMemOffset();
4636 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4637 FIOffset, true);
4638
4639 // Create load nodes to retrieve arguments from the stack.
4640 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4641 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4642 MachinePointerInfo::getFixedStack(
4643 DAG.getMachineFunction(), FI)));
4644 }
4645 lastInsIndex = index;
4646 }
4647 }
4648 }
4649
4650 // varargs
4651 if (isVarArg && MFI.hasVAStart()) {
4652 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4653 TotalArgRegsSaveSize);
4654 if (AFI->isCmseNSEntryFunction()) {
4655 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4656 DAG.getMachineFunction().getFunction(),
4657 "secure entry function must not be variadic", dl.getDebugLoc()));
4658 }
4659 }
4660
4661 unsigned StackArgSize = CCInfo.getStackSize();
4662 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4663 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4664 // The only way to guarantee a tail call is if the callee restores its
4665 // argument area, but it must also keep the stack aligned when doing so.
4666 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4667 assert(StackAlign && "data layout string is missing stack alignment");
4668 StackArgSize = alignTo(StackArgSize, *StackAlign);
4669
4670 AFI->setArgumentStackToRestore(StackArgSize);
4671 }
4672 AFI->setArgumentStackSize(StackArgSize);
4673
4674 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4675 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4676 DAG.getMachineFunction().getFunction(),
4677 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4678 }
4679
4680 return Chain;
4681}
4682
4683/// isFloatingPointZero - Return true if this is +0.0.
4684 static bool isFloatingPointZero(SDValue Op) {
4685 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Op))
4686 return CFP->getValueAPF().isPosZero();
4687 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4688 // Maybe this has already been legalized into the constant pool?
4689 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4690 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4691 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4692 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4693 return CFP->getValueAPF().isPosZero();
4694 }
4695 } else if (Op->getOpcode() == ISD::BITCAST &&
4696 Op->getValueType(0) == MVT::f64) {
4697 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4698 // created by LowerConstantFP().
4699 SDValue BitcastOp = Op->getOperand(0);
4700 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4701 isNullConstant(BitcastOp->getOperand(0)))
4702 return true;
4703 }
4704 return false;
4705}
4706
4707 /// Returns an appropriate ARM CMP (cmp) and the corresponding condition code for
4708/// the given operands.
4709SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4710 SDValue &ARMcc, SelectionDAG &DAG,
4711 const SDLoc &dl) const {
4712 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4713 unsigned C = RHSC->getZExtValue();
4714 if (!isLegalICmpImmediate((int32_t)C)) {
4715 // Constant does not fit, try adjusting it by one.
4716 switch (CC) {
4717 default: break;
4718 case ISD::SETLT:
4719 case ISD::SETGE:
4720 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4721 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4722 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4723 }
4724 break;
4725 case ISD::SETULT:
4726 case ISD::SETUGE:
4727 if (C != 0 && isLegalICmpImmediate(C-1)) {
4728 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4729 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4730 }
4731 break;
4732 case ISD::SETLE:
4733 case ISD::SETGT:
4734 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4735 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4736 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4737 }
4738 break;
4739 case ISD::SETULE:
4740 case ISD::SETUGT:
4741 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4742 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4743 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4744 }
4745 break;
4746 }
4747 }
4748 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4749 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4750 // In ARM and Thumb-2, the compare instructions can shift their second
4751 // operand.
4752 CC = ISD::getSetCCSwappedOperands(CC);
4753 std::swap(LHS, RHS);
4754 }
4755
4756 // Thumb1 has very limited immediate modes, so turning an "and" into a
4757 // shift can save multiple instructions.
4758 //
4759 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4760 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4761 // own. If it's the operand to an unsigned comparison with an immediate,
4762 // we can eliminate one of the shifts: we transform
4763 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4764 //
4765 // We avoid transforming cases which aren't profitable due to encoding
4766 // details:
4767 //
4768 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4769 // would not; in that case, we're essentially trading one immediate load for
4770 // another.
4771 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4772 // 3. C2 is zero; we have other code for this special case.
4773 //
4774 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4775 // instruction, since the AND is always one instruction anyway, but we could
4776 // use narrow instructions in some cases.
4777 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4778 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4779 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4780 !isSignedIntSetCC(CC)) {
4781 unsigned Mask = LHS.getConstantOperandVal(1);
4782 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4783 uint64_t RHSV = RHSC->getZExtValue();
4784 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4785 unsigned ShiftBits = llvm::countl_zero(Mask);
4786 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4787 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4788 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4789 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4790 }
4791 }
4792 }
4793
4794 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4795 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4796 // way a cmp would.
4797 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4798 // some tweaks to the heuristics for the previous and->shift transform.
4799 // FIXME: Optimize cases where the LHS isn't a shift.
4800 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4801 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4802 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4803 LHS.getConstantOperandVal(1) < 31) {
4804 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4805 SDValue Shift =
4806 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4807 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4808 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4809 return Shift.getValue(1);
4810 }
4811
4812 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4813
4814 // If the RHS is a constant zero then the V (overflow) flag will never be
4815 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4816 // simpler for other passes (like the peephole optimiser) to deal with.
4817 if (isNullConstant(RHS)) {
4818 switch (CondCode) {
4819 default: break;
4820 case ARMCC::GE:
4821 CondCode = ARMCC::PL;
4822 break;
4823 case ARMCC::LT:
4824 CondCode = ARMCC::MI;
4825 break;
4826 }
4827 }
4828
4829 ARMISD::NodeType CompareType;
4830 switch (CondCode) {
4831 default:
4832 CompareType = ARMISD::CMP;
4833 break;
4834 case ARMCC::EQ:
4835 case ARMCC::NE:
4836 // Uses only Z Flag
4837 CompareType = ARMISD::CMPZ;
4838 break;
4839 }
4840 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4841 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4842}
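// For illustration, the immediate adjustment at the top of this function turns
//   x <u 256  (SETULT)  into  x <=u 255 (SETULE)
//   x >s -1   (SETGT)   into  x >=s 0   (SETGE)
// when the original constant cannot be encoded in a cmp but the neighbouring
// one can; both forms are equivalent because the constants differ by one.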
4843
4844 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4845SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4846 SelectionDAG &DAG, const SDLoc &dl,
4847 bool Signaling) const {
4848 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4849 SDValue Flags;
4850 if (!isFloatingPointZero(RHS))
4851 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4852 LHS, RHS);
4853 else
4854 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4855 FlagsVT, LHS);
4856 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4857}
4858
4859// This function returns three things: the arithmetic computation itself
4860// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4861// comparison and the condition code define the case in which the arithmetic
4862// computation *does not* overflow.
4863std::pair<SDValue, SDValue>
4864ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4865 SDValue &ARMcc) const {
4866 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4867
4868 SDValue Value, OverflowCmp;
4869 SDValue LHS = Op.getOperand(0);
4870 SDValue RHS = Op.getOperand(1);
4871 SDLoc dl(Op);
4872
4873 // FIXME: We are currently always generating CMPs because we don't support
4874 // generating CMN through the backend. This is not as good as the natural
4875 // CMP case because it causes a register dependency and cannot be folded
4876 // later.
4877
4878 switch (Op.getOpcode()) {
4879 default:
4880 llvm_unreachable("Unknown overflow instruction!");
4881 case ISD::SADDO:
4882 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4883 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4884 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4885 break;
4886 case ISD::UADDO:
4887 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4888 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4889 // We do not use it in the USUBO case as Value may not be used.
4890 Value = DAG.getNode(ARMISD::ADDC, dl,
4891 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4892 .getValue(0);
4893 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4894 break;
4895 case ISD::SSUBO:
4896 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4897 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4898 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4899 break;
4900 case ISD::USUBO:
4901 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4902 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4903 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4904 break;
4905 case ISD::UMULO:
4906 // We generate a UMUL_LOHI and then check if the high word is 0.
4907 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4908 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4909 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4910 LHS, RHS);
4911 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4912 DAG.getConstant(0, dl, MVT::i32));
4913 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4914 break;
4915 case ISD::SMULO:
4916 // We generate a SMUL_LOHI and then check if all the bits of the high word
4917 // are the same as the sign bit of the low word.
4918 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4919 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4920 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4921 LHS, RHS);
4922 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4923 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4924 Value.getValue(0),
4925 DAG.getConstant(31, dl, MVT::i32)));
4926 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4927 break;
4928 } // switch (...)
4929
4930 return std::make_pair(Value, OverflowCmp);
4931}
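// For illustration, for an i32 saddo the pair produced above is roughly
//   Value       = add a, b
//   OverflowCmp = cmp Value, a     @ ARMcc = VC, i.e. "no signed overflow"
// so users such as SELECT or BRCOND can test ARMcc directly on the flags.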
4932
4933SDValue
4934ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4935 // Let legalize expand this if it isn't a legal type yet.
4936 if (!isTypeLegal(Op.getValueType()))
4937 return SDValue();
4938
4939 SDValue Value, OverflowCmp;
4940 SDValue ARMcc;
4941 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4942 SDLoc dl(Op);
4943 // We use 0 and 1 as false and true values.
4944 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4945 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4946 EVT VT = Op.getValueType();
4947
4948 SDValue Overflow =
4949 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4950
4951 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4952 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4953}
4954
4955 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4956 SelectionDAG &DAG) {
4957 SDLoc DL(BoolCarry);
4958 EVT CarryVT = BoolCarry.getValueType();
4959
4960 // This converts the boolean value carry into the carry flag by doing
4961 // ARMISD::SUBC Carry, 1
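// For example: BoolCarry == 1 computes 1 - 1 = 0 with no borrow, so the ARM
// carry flag ends up set; BoolCarry == 0 computes 0 - 1, which borrows and
// leaves the carry flag clear (ARM treats carry as "no borrow" for subtracts).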
4962 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4963 DAG.getVTList(CarryVT, MVT::i32),
4964 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4965 return Carry.getValue(1);
4966}
4967
4968 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4969 SelectionDAG &DAG) {
4970 SDLoc DL(Flags);
4971
4972 // Now convert the carry flag into a boolean carry. We do this
4973 // using ARMISD::ADDE 0, 0, Carry.
4974 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4975 DAG.getConstant(0, DL, MVT::i32),
4976 DAG.getConstant(0, DL, MVT::i32), Flags);
4977}
4978
4979SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4980 SelectionDAG &DAG) const {
4981 // Let legalize expand this if it isn't a legal type yet.
4982 if (!isTypeLegal(Op.getValueType()))
4983 return SDValue();
4984
4985 SDValue LHS = Op.getOperand(0);
4986 SDValue RHS = Op.getOperand(1);
4987 SDLoc dl(Op);
4988
4989 EVT VT = Op.getValueType();
4990 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4991 SDValue Value;
4992 SDValue Overflow;
4993 switch (Op.getOpcode()) {
4994 default:
4995 llvm_unreachable("Unknown overflow instruction!");
4996 case ISD::UADDO:
4997 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4998 // Convert the carry flag into a boolean value.
4999 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
5000 break;
5001 case ISD::USUBO: {
5002 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
5003 // Convert the carry flag into a boolean value.
5004 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
5005 // ARMISD::SUBC returns 0 when we have to borrow, so turn it into an overflow
5006 // value by computing 1 - C.
5007 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
5008 DAG.getConstant(1, dl, MVT::i32), Overflow);
5009 break;
5010 }
5011 }
5012
5013 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
5014}
5015
5016 static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
5017 const ARMSubtarget *Subtarget) {
5018 EVT VT = Op.getValueType();
5019 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
5020 return SDValue();
5021 if (!VT.isSimple())
5022 return SDValue();
5023
5024 unsigned NewOpcode;
5025 switch (VT.getSimpleVT().SimpleTy) {
5026 default:
5027 return SDValue();
5028 case MVT::i8:
5029 switch (Op->getOpcode()) {
5030 case ISD::UADDSAT:
5031 NewOpcode = ARMISD::UQADD8b;
5032 break;
5033 case ISD::SADDSAT:
5034 NewOpcode = ARMISD::QADD8b;
5035 break;
5036 case ISD::USUBSAT:
5037 NewOpcode = ARMISD::UQSUB8b;
5038 break;
5039 case ISD::SSUBSAT:
5040 NewOpcode = ARMISD::QSUB8b;
5041 break;
5042 }
5043 break;
5044 case MVT::i16:
5045 switch (Op->getOpcode()) {
5046 case ISD::UADDSAT:
5047 NewOpcode = ARMISD::UQADD16b;
5048 break;
5049 case ISD::SADDSAT:
5050 NewOpcode = ARMISD::QADD16b;
5051 break;
5052 case ISD::USUBSAT:
5053 NewOpcode = ARMISD::UQSUB16b;
5054 break;
5055 case ISD::SSUBSAT:
5056 NewOpcode = ARMISD::QSUB16b;
5057 break;
5058 }
5059 break;
5060 }
5061
5062 SDLoc dl(Op);
5063 SDValue Add =
5064 DAG.getNode(NewOpcode, dl, MVT::i32,
5065 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5066 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5067 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
5068}
5069
5070SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5071 SDValue Cond = Op.getOperand(0);
5072 SDValue SelectTrue = Op.getOperand(1);
5073 SDValue SelectFalse = Op.getOperand(2);
5074 SDLoc dl(Op);
5075 unsigned Opc = Cond.getOpcode();
5076
5077 if (Cond.getResNo() == 1 &&
5078 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5079 Opc == ISD::USUBO)) {
5080 if (!isTypeLegal(Cond->getValueType(0)))
5081 return SDValue();
5082
5083 SDValue Value, OverflowCmp;
5084 SDValue ARMcc;
5085 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5086 EVT VT = Op.getValueType();
5087
5088 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
5089 }
5090
5091 // Convert:
5092 //
5093 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5094 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5095 //
5096 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5097 const ConstantSDNode *CMOVTrue =
5098 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
5099 const ConstantSDNode *CMOVFalse =
5100 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
5101
5102 if (CMOVTrue && CMOVFalse) {
5103 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5104 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5105
5106 SDValue True;
5107 SDValue False;
5108 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5109 True = SelectTrue;
5110 False = SelectFalse;
5111 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5112 True = SelectFalse;
5113 False = SelectTrue;
5114 }
5115
5116 if (True.getNode() && False.getNode())
5117 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
5118 Cond.getOperand(3), DAG);
5119 }
5120 }
5121
5122 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5123 // undefined bits before doing a full-word comparison with zero.
5124 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
5125 DAG.getConstant(1, dl, Cond.getValueType()));
5126
5127 return DAG.getSelectCC(dl, Cond,
5128 DAG.getConstant(0, dl, Cond.getValueType()),
5129 SelectTrue, SelectFalse, ISD::SETNE);
5130}
5131
5132 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
5133 bool &swpCmpOps, bool &swpVselOps) {
5134 // Start by selecting the GE condition code for opcodes that return true for
5135 // 'equality'
5136 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5137 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5138 CondCode = ARMCC::GE;
5139
5140 // and GT for opcodes that return false for 'equality'.
5141 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5142 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5143 CondCode = ARMCC::GT;
5144
5145 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5146 // to swap the compare operands.
5147 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5148 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5149 swpCmpOps = true;
5150
5151 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5152 // If we have an unordered opcode, we need to swap the operands to the VSEL
5153 // instruction (effectively negating the condition).
5154 //
5155 // This also has the effect of swapping which one of 'less' or 'greater'
5156 // returns true, so we also swap the compare operands. It also switches
5157 // whether we return true for 'equality', so we compensate by picking the
5158 // opposite condition code to our original choice.
5159 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5160 CC == ISD::SETUGT) {
5161 swpCmpOps = !swpCmpOps;
5162 swpVselOps = !swpVselOps;
5163 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5164 }
5165
5166 // 'ordered' is 'anything but unordered', so use the VS condition code and
5167 // swap the VSEL operands.
5168 if (CC == ISD::SETO) {
5169 CondCode = ARMCC::VS;
5170 swpVselOps = true;
5171 }
5172
5173 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5174 // code and swap the VSEL operands. Also do this if we don't care about the
5175 // unordered case.
5176 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5177 CondCode = ARMCC::EQ;
5178 swpVselOps = true;
5179 }
5180}
5181
5182SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5183 SDValue TrueVal, SDValue ARMcc,
5184 SDValue Flags, SelectionDAG &DAG) const {
5185 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5186 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5187 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5188 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5189 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5190
5191 SDValue TrueLow = TrueVal.getValue(0);
5192 SDValue TrueHigh = TrueVal.getValue(1);
5193 SDValue FalseLow = FalseVal.getValue(0);
5194 SDValue FalseHigh = FalseVal.getValue(1);
5195
5196 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5197 ARMcc, Flags);
5198 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5199 ARMcc, Flags);
5200
5201 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5202 }
5203 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5204}
5205
5206static bool isGTorGE(ISD::CondCode CC) {
5207 return CC == ISD::SETGT || CC == ISD::SETGE;
5208}
5209
5210static bool isLTorLE(ISD::CondCode CC) {
5211 return CC == ISD::SETLT || CC == ISD::SETLE;
5212}
5213
5214// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5215// All of these conditions (and their <= and >= counterparts) will do:
5216// x < k ? k : x
5217// x > k ? x : k
5218// k < x ? x : k
5219// k > x ? k : x
5220static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5221 const SDValue TrueVal, const SDValue FalseVal,
5222 const ISD::CondCode CC, const SDValue K) {
5223 return (isGTorGE(CC) &&
5224 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5225 (isLTorLE(CC) &&
5226 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5227}
5228
5229// Check if two chained conditionals could be converted into SSAT or USAT.
5230//
5231 // SSAT can replace a set of two conditional selects that bound a number to the
5232 // interval [~k, k] when k + 1 is a power of 2. Here are some examples:
5233//
5234// x < -k ? -k : (x > k ? k : x)
5235// x < -k ? -k : (x < k ? x : k)
5236// x > -k ? (x > k ? k : x) : -k
5237// x < k ? (x < -k ? -k : x) : k
5238// etc.
5239//
5240// LLVM canonicalizes these to either a min(max()) or a max(min())
5241// pattern. This function tries to match one of these and will return a SSAT
5242// node if successful.
5243//
5244 // USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
5245// is a power of 2.
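// For example (illustrative): x < -128 ? -128 : (x > 127 ? 127 : x) clamps x
// to [-128, 127], which is exactly what an SSAT #8 instruction does, while
// clamping x to [0, 255] maps to USAT #8.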
5246 static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5247 EVT VT = Op.getValueType();
5248 SDValue V1 = Op.getOperand(0);
5249 SDValue K1 = Op.getOperand(1);
5250 SDValue TrueVal1 = Op.getOperand(2);
5251 SDValue FalseVal1 = Op.getOperand(3);
5252 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5253
5254 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5255 if (Op2.getOpcode() != ISD::SELECT_CC)
5256 return SDValue();
5257
5258 SDValue V2 = Op2.getOperand(0);
5259 SDValue K2 = Op2.getOperand(1);
5260 SDValue TrueVal2 = Op2.getOperand(2);
5261 SDValue FalseVal2 = Op2.getOperand(3);
5262 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5263
5264 SDValue V1Tmp = V1;
5265 SDValue V2Tmp = V2;
5266
5267 // Check that the registers and the constants match a max(min()) or min(max())
5268 // pattern
5269 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5270 K2 != FalseVal2 ||
5271 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5272 return SDValue();
5273
5274 // Check that the constant in the lower-bound check is
5275 // the opposite of the constant in the upper-bound check
5276 // in 1's complement.
5277 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5278 return SDValue();
5279
5280 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5281 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5282 int64_t PosVal = std::max(Val1, Val2);
5283 int64_t NegVal = std::min(Val1, Val2);
5284
5285 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5286 !isPowerOf2_64(PosVal + 1))
5287 return SDValue();
5288
5289 // Handle the difference between USAT (unsigned) and SSAT (signed)
5290 // saturation
5291 // At this point, PosVal is guaranteed to be positive
5292 uint64_t K = PosVal;
5293 SDLoc dl(Op);
5294 if (Val1 == ~Val2)
5295 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5296 DAG.getConstant(llvm::countr_one(K), dl, VT));
5297 if (NegVal == 0)
5298 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5299 DAG.getConstant(llvm::countr_one(K), dl, VT));
5300
5301 return SDValue();
5302}
5303
5304// Check if a condition of the type x < k ? k : x can be converted into a
5305// bit operation instead of conditional moves.
5306// Currently this is allowed given:
5307// - The conditions and values match up
5308// - k is 0 or -1 (all ones)
5309 // This function will not check the last condition; that's up to the caller.
5310 // It returns true if the transformation can be made, and in that case
5311// returns x in V, and k in SatK.
5312 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5313 SDValue &SatK)
5314{
5315 SDValue LHS = Op.getOperand(0);
5316 SDValue RHS = Op.getOperand(1);
5317 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5318 SDValue TrueVal = Op.getOperand(2);
5319 SDValue FalseVal = Op.getOperand(3);
5320
5321 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5322 ? &RHS
5323 : nullptr;
5324
5325 // No constant operation in comparison, early out
5326 if (!K)
5327 return false;
5328
5329 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5330 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5331 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5332
5333 // If the constant in the comparison does not match the constant in the
5334 // select, or the variable does not match, early out
5335 if (*K != KTmp || V != VTmp)
5336 return false;
5337
5338 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5339 SatK = *K;
5340 return true;
5341 }
5342
5343 return false;
5344}
5345
5346bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5347 if (VT == MVT::f32)
5348 return !Subtarget->hasVFP2Base();
5349 if (VT == MVT::f64)
5350 return !Subtarget->hasFP64();
5351 if (VT == MVT::f16)
5352 return !Subtarget->hasFullFP16();
5353 return false;
5354}
5355
5356SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5357 EVT VT = Op.getValueType();
5358 SDLoc dl(Op);
5359
5360 // Try to convert two saturating conditional selects into a single SSAT
5361 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5362 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5363 return SatValue;
5364
5365 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5366 // into more efficient bit operations, which is possible when k is 0 or -1
5367 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5368 // single instructions. On Thumb the shift and the bit operation will be two
5369 // instructions.
5370 // Only allow this transformation on full-width (32-bit) operations
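// For example, for 32-bit x (arithmetic shift):
//   max(x, 0)  == (x < 0 ? 0 : x)    ->  x & ~(x >> 31)
//   max(x, -1) == (x < -1 ? -1 : x)  ->  x | (x >> 31)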
5371 SDValue LowerSatConstant;
5372 SDValue SatValue;
5373 if (VT == MVT::i32 &&
5374 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5375 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5376 DAG.getConstant(31, dl, VT));
5377 if (isNullConstant(LowerSatConstant)) {
5378 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5379 DAG.getAllOnesConstant(dl, VT));
5380 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5381 } else if (isAllOnesConstant(LowerSatConstant))
5382 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5383 }
5384
5385 SDValue LHS = Op.getOperand(0);
5386 SDValue RHS = Op.getOperand(1);
5387 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5388 SDValue TrueVal = Op.getOperand(2);
5389 SDValue FalseVal = Op.getOperand(3);
5390 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5391 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5392 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5393 if (Op.getValueType().isInteger()) {
5394
5395 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5396 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5397 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5398 // Both require less instructions than compare and conditional select.
5399 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5400 RHSC->isZero() && CFVal && CFVal->isZero() &&
5401 LHS.getValueType() == RHS.getValueType()) {
5402 EVT VT = LHS.getValueType();
5403 SDValue Shift =
5404 DAG.getNode(ISD::SRA, dl, VT, LHS,
5405 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5406
5407 if (CC == ISD::SETGT)
5408 Shift = DAG.getNOT(dl, Shift, VT);
5409
5410 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5411 }
5412 }
5413
5414 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5415 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5416 unsigned TVal = CTVal->getZExtValue();
5417 unsigned FVal = CFVal->getZExtValue();
5418 unsigned Opcode = 0;
5419
5420 if (TVal == ~FVal) {
5421 Opcode = ARMISD::CSINV;
5422 } else if (TVal == ~FVal + 1) {
5423 Opcode = ARMISD::CSNEG;
5424 } else if (TVal + 1 == FVal) {
5425 Opcode = ARMISD::CSINC;
5426 } else if (TVal == FVal + 1) {
5427 Opcode = ARMISD::CSINC;
5428 std::swap(TrueVal, FalseVal);
5429 std::swap(TVal, FVal);
5430 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5431 }
5432
5433 if (Opcode) {
5434 // If one of the constants is cheaper than another, materialise the
5435 // cheaper one and let the csel generate the other.
5436 if (Opcode != ARMISD::CSINC &&
5437 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5438 std::swap(TrueVal, FalseVal);
5439 std::swap(TVal, FVal);
5440 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5441 }
5442
5443 // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5444 // to get there. CSINC is not invertible like the other two (~(~a) == a,
5445 // -(-a) == a, but (a+1)+1 != a).
5446 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5447 std::swap(TrueVal, FalseVal);
5448 std::swap(TVal, FVal);
5449 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5450 }
5451
5452 // Drops F's value because we can get it by inverting/negating TVal.
5453 FalseVal = TrueVal;
5454
5455 SDValue ARMcc;
5456 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5457 EVT VT = TrueVal.getValueType();
5458 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5459 }
5460 }
5461
5462 if (isUnsupportedFloatingType(LHS.getValueType())) {
5463 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5464
5465 // If softenSetCCOperands only returned one value, we should compare it to
5466 // zero.
5467 if (!RHS.getNode()) {
5468 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5469 CC = ISD::SETNE;
5470 }
5471 }
5472
5473 if (LHS.getValueType() == MVT::i32) {
5474 // Try to generate VSEL on ARMv8.
5475 // The VSEL instruction can't use all the usual ARM condition
5476 // codes: it only has two bits to select the condition code, so it's
5477 // constrained to use only GE, GT, VS and EQ.
5478 //
5479 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5480 // swap the operands of the previous compare instruction (effectively
5481 // inverting the compare condition, swapping 'less' and 'greater') and
5482 // sometimes need to swap the operands to the VSEL (which inverts the
5483 // condition in the sense of firing whenever the previous condition didn't)
5484 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5485 TrueVal.getValueType() == MVT::f32 ||
5486 TrueVal.getValueType() == MVT::f64)) {
5487 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5488 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5489 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5490 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5491 std::swap(TrueVal, FalseVal);
5492 }
5493 }
5494
5495 SDValue ARMcc;
5496 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5497 // Choose GE over PL, which vsel does not support.
5498 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5499 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5500 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5501 }
5502
5503 ARMCC::CondCodes CondCode, CondCode2;
5504 FPCCToARMCC(CC, CondCode, CondCode2);
5505
5506 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5507 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5508 // must use VSEL (limited condition codes), due to not having conditional f16
5509 // moves.
5510 if (Subtarget->hasFPARMv8Base() &&
5511 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5512 (TrueVal.getValueType() == MVT::f16 ||
5513 TrueVal.getValueType() == MVT::f32 ||
5514 TrueVal.getValueType() == MVT::f64)) {
5515 bool swpCmpOps = false;
5516 bool swpVselOps = false;
5517 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5518
5519 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5520 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5521 if (swpCmpOps)
5522 std::swap(LHS, RHS);
5523 if (swpVselOps)
5524 std::swap(TrueVal, FalseVal);
5525 }
5526 }
5527
5528 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5529 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5530 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5531 if (CondCode2 != ARMCC::AL) {
5532 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5533 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5534 }
5535 return Result;
5536}
5537
5538/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5539/// to morph to an integer compare sequence.
5540static bool canChangeToInt(SDValue Op, bool &SeenZero,
5541 const ARMSubtarget *Subtarget) {
5542 SDNode *N = Op.getNode();
5543 if (!N->hasOneUse())
5544 // Otherwise it requires moving the value from fp to integer registers.
5545 return false;
5546 if (!N->getNumValues())
5547 return false;
5548 EVT VT = Op.getValueType();
5549 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5550 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5551 // vmrs are very slow, e.g. cortex-a8.
5552 return false;
5553
5554 if (isFloatingPointZero(Op)) {
5555 SeenZero = true;
5556 return true;
5557 }
5558 return ISD::isNormalLoad(N);
5559}
5560
5561 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5562 if (isFloatingPointZero(Op))
5563 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5564
5565 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5566 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5567 Ld->getPointerInfo(), Ld->getAlign(),
5568 Ld->getMemOperand()->getFlags());
5569
5570 llvm_unreachable("Unknown VFP cmp argument!");
5571}
5572
5573 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5574 SDValue &RetVal1, SDValue &RetVal2) {
5575 SDLoc dl(Op);
5576
5577 if (isFloatingPointZero(Op)) {
5578 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5579 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5580 return;
5581 }
5582
5583 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5584 SDValue Ptr = Ld->getBasePtr();
5585 RetVal1 =
5586 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5587 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5588
5589 EVT PtrType = Ptr.getValueType();
5590 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5591 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5592 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5593 Ld->getPointerInfo().getWithOffset(4),
5594 commonAlignment(Ld->getAlign(), 4),
5595 Ld->getMemOperand()->getFlags());
5596 return;
5597 }
5598
5599 llvm_unreachable("Unknown VFP cmp argument!");
5600}
5601
5602/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
5603/// f32 and even f64 comparisons to integer ones.
5604SDValue
5605ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5606 SDValue Chain = Op.getOperand(0);
5607 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5608 SDValue LHS = Op.getOperand(2);
5609 SDValue RHS = Op.getOperand(3);
5610 SDValue Dest = Op.getOperand(4);
5611 SDLoc dl(Op);
5612
5613 bool LHSSeenZero = false;
5614 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5615 bool RHSSeenZero = false;
5616 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5617 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5618 // With no NaNs and IEEE denormal handling (checked by the caller), if there
5619 // are no other uses of the CMP operands and the condition code is EQ or NE,
5620 // we can optimize the compare to an integer comparison.
5621 if (CC == ISD::SETOEQ)
5622 CC = ISD::SETEQ;
5623 else if (CC == ISD::SETUNE)
5624 CC = ISD::SETNE;
5625
5626 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5627 SDValue ARMcc;
5628 if (LHS.getValueType() == MVT::f32) {
5629 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5630 bitcastf32Toi32(LHS, DAG), Mask);
5631 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5632 bitcastf32Toi32(RHS, DAG), Mask);
5633 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5634 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5635 Cmp);
5636 }
5637
5638 SDValue LHS1, LHS2;
5639 SDValue RHS1, RHS2;
5640 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5641 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5642 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5643 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5644 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5645 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5646 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5647 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5648 }
5649
5650 return SDValue();
5651}
5652
5653// Generate CMP + CMOV for integer abs.
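// For example (illustrative register names), abs(x) becomes roughly:
//   rsb   rTmp, rX, #0      ; rTmp = -x
//   cmp   rX, #0
//   movmi rX, rTmp          ; if x was negative, take the negated value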
5654SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5655 SDLoc DL(Op);
5656
5657 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5658
5659 // Generate CMP & CMOV.
5660 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5661 DAG.getConstant(0, DL, MVT::i32));
5662 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5663 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5664}
5665
5666SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5667 SDValue Chain = Op.getOperand(0);
5668 SDValue Cond = Op.getOperand(1);
5669 SDValue Dest = Op.getOperand(2);
5670 SDLoc dl(Op);
5671
5672 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5673 // instruction.
5674 unsigned Opc = Cond.getOpcode();
5675 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5676 !Subtarget->isThumb1Only();
5677 if (Cond.getResNo() == 1 &&
5678 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5679 Opc == ISD::USUBO || OptimizeMul)) {
5680 // Only lower legal XALUO ops.
5681 if (!isTypeLegal(Cond->getValueType(0)))
5682 return SDValue();
5683
5684 // The actual operation with overflow check.
5685 SDValue Value, OverflowCmp;
5686 SDValue ARMcc;
5687 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5688
5689 // Reverse the condition code.
5690 ARMCC::CondCodes CondCode =
5691 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5692 CondCode = ARMCC::getOppositeCondition(CondCode);
5693 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5694
5695 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5696 OverflowCmp);
5697 }
5698
5699 return SDValue();
5700}
5701
5702SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5703 SDValue Chain = Op.getOperand(0);
5704 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5705 SDValue LHS = Op.getOperand(2);
5706 SDValue RHS = Op.getOperand(3);
5707 SDValue Dest = Op.getOperand(4);
5708 SDLoc dl(Op);
5709
5710 if (isUnsupportedFloatingType(LHS.getValueType())) {
5711 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5712
5713 // If softenSetCCOperands only returned one value, we should compare it to
5714 // zero.
5715 if (!RHS.getNode()) {
5716 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5717 CC = ISD::SETNE;
5718 }
5719 }
5720
5721 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5722 // instruction.
5723 unsigned Opc = LHS.getOpcode();
5724 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5725 !Subtarget->isThumb1Only();
5726 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5727 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5728 Opc == ISD::USUBO || OptimizeMul) &&
5729 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5730 // Only lower legal XALUO ops.
5731 if (!isTypeLegal(LHS->getValueType(0)))
5732 return SDValue();
5733
5734 // The actual operation with overflow check.
5735 SDValue Value, OverflowCmp;
5736 SDValue ARMcc;
5737 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5738
5739 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5740 // Reverse the condition code.
5741 ARMCC::CondCodes CondCode =
5742 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5743 CondCode = ARMCC::getOppositeCondition(CondCode);
5744 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5745 }
5746
5747 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5748 OverflowCmp);
5749 }
5750
5751 if (LHS.getValueType() == MVT::i32) {
5752 SDValue ARMcc;
5753 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5754 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5755 }
5756
5757 SDNodeFlags Flags = Op->getFlags();
5758 if (Flags.hasNoNaNs() &&
5759 DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5760 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
5761 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5762 CC == ISD::SETUNE)) {
5763 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5764 return Result;
5765 }
5766
5767 ARMCC::CondCodes CondCode, CondCode2;
5768 FPCCToARMCC(CC, CondCode, CondCode2);
5769
5770 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5771 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5772 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5773 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5774 if (CondCode2 != ARMCC::AL) {
5775 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5776 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5777 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5778 }
5779 return Res;
5780}
5781
5782SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5783 SDValue Chain = Op.getOperand(0);
5784 SDValue Table = Op.getOperand(1);
5785 SDValue Index = Op.getOperand(2);
5786 SDLoc dl(Op);
5787
5788 EVT PTy = getPointerTy(DAG.getDataLayout());
5789 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5790 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5791 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5792 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5793 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5794 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5795 // Thumb2 and ARMv8-M use a two-level jump: the first branch jumps into the
5796 // jump table, which does another jump to the destination. This also makes it easier
5797 // to translate it to TBB / TBH later (Thumb2 only).
5798 // FIXME: This might not work if the function is extremely large.
5799 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5800 Addr, Op.getOperand(2), JTI);
5801 }
5802 if (isPositionIndependent() || Subtarget->isROPI()) {
5803 Addr =
5804 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5805 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5806 Chain = Addr.getValue(1);
5807 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5808 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5809 } else {
5810 Addr =
5811 DAG.getLoad(PTy, dl, Chain, Addr,
5812 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5813 Chain = Addr.getValue(1);
5814 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5815 }
5816}
5817
5818 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5819 EVT VT = Op.getValueType();
5820 SDLoc dl(Op);
5821
5822 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5823 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5824 return Op;
5825 return DAG.UnrollVectorOp(Op.getNode());
5826 }
5827
5828 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5829
5830 EVT NewTy;
5831 const EVT OpTy = Op.getOperand(0).getValueType();
5832 if (OpTy == MVT::v4f32)
5833 NewTy = MVT::v4i32;
5834 else if (OpTy == MVT::v4f16 && HasFullFP16)
5835 NewTy = MVT::v4i16;
5836 else if (OpTy == MVT::v8f16 && HasFullFP16)
5837 NewTy = MVT::v8i16;
5838 else
5839 llvm_unreachable("Invalid type for custom lowering!");
5840
5841 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5842 return DAG.UnrollVectorOp(Op.getNode());
5843
5844 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5845 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5846}
5847
5848SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5849 EVT VT = Op.getValueType();
5850 if (VT.isVector())
5851 return LowerVectorFP_TO_INT(Op, DAG);
5852
5853 bool IsStrict = Op->isStrictFPOpcode();
5854 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5855
5856 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5857 RTLIB::Libcall LC;
5858 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5859 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5860 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5861 Op.getValueType());
5862 else
5863 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5864 Op.getValueType());
5865 SDLoc Loc(Op);
5866 MakeLibCallOptions CallOptions;
5867 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5868 SDValue Result;
5869 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5870 CallOptions, Loc, Chain);
5871 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5872 }
5873
5874 // FIXME: Remove this when we have strict fp instruction selection patterns
5875 if (IsStrict) {
5876 SDLoc Loc(Op);
5877 SDValue Result =
5878 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5879 : ISD::FP_TO_UINT,
5880 Loc, Op.getValueType(), SrcVal);
5881 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5882 }
5883
5884 return Op;
5885}
5886
5887 static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5888 const ARMSubtarget *Subtarget) {
5889 EVT VT = Op.getValueType();
5890 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5891 EVT FromVT = Op.getOperand(0).getValueType();
5892
5893 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5894 return Op;
5895 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5896 Subtarget->hasFP64())
5897 return Op;
5898 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5899 Subtarget->hasFullFP16())
5900 return Op;
5901 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5902 Subtarget->hasMVEFloatOps())
5903 return Op;
5904 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5905 Subtarget->hasMVEFloatOps())
5906 return Op;
5907
5908 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5909 return SDValue();
5910
5911 SDLoc DL(Op);
5912 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5913 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5914 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5915 DAG.getValueType(VT.getScalarType()));
5916 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5917 DAG.getConstant((1 << BW) - 1, DL, VT));
5918 if (IsSigned)
5919 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5920 DAG.getSignedConstant(-(1 << BW), DL, VT));
5921 return Max;
5922}
5923
5924 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5925 EVT VT = Op.getValueType();
5926 SDLoc dl(Op);
5927
5928 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5929 if (VT.getVectorElementType() == MVT::f32)
5930 return Op;
5931 return DAG.UnrollVectorOp(Op.getNode());
5932 }
5933
5934 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5935 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5936 "Invalid type for custom lowering!");
5937
5938 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5939
5940 EVT DestVecType;
5941 if (VT == MVT::v4f32)
5942 DestVecType = MVT::v4i32;
5943 else if (VT == MVT::v4f16 && HasFullFP16)
5944 DestVecType = MVT::v4i16;
5945 else if (VT == MVT::v8f16 && HasFullFP16)
5946 DestVecType = MVT::v8i16;
5947 else
5948 return DAG.UnrollVectorOp(Op.getNode());
5949
5950 unsigned CastOpc;
5951 unsigned Opc;
5952 switch (Op.getOpcode()) {
5953 default: llvm_unreachable("Invalid opcode!");
5954 case ISD::SINT_TO_FP:
5955 CastOpc = ISD::SIGN_EXTEND;
5956 Opc = ISD::SINT_TO_FP;
5957 break;
5958 case ISD::UINT_TO_FP:
5959 CastOpc = ISD::ZERO_EXTEND;
5960 Opc = ISD::UINT_TO_FP;
5961 break;
5962 }
5963
5964 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5965 return DAG.getNode(Opc, dl, VT, Op);
5966}
5967
5968SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5969 EVT VT = Op.getValueType();
5970 if (VT.isVector())
5971 return LowerVectorINT_TO_FP(Op, DAG);
5972 if (isUnsupportedFloatingType(VT)) {
5973 RTLIB::Libcall LC;
5974 if (Op.getOpcode() == ISD::SINT_TO_FP)
5975 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5976 Op.getValueType());
5977 else
5978 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5979 Op.getValueType());
5980 MakeLibCallOptions CallOptions;
5981 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5982 CallOptions, SDLoc(Op)).first;
5983 }
5984
5985 return Op;
5986}
5987
5988SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5989 // Implement fcopysign by copying the sign bit, using NEON masking or integer bit ops.
5990 SDValue Tmp0 = Op.getOperand(0);
5991 SDValue Tmp1 = Op.getOperand(1);
5992 SDLoc dl(Op);
5993 EVT VT = Op.getValueType();
5994 EVT SrcVT = Tmp1.getValueType();
5995 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5996 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5997 bool UseNEON = !InGPR && Subtarget->hasNEON();
5998
5999 if (UseNEON) {
6000 // Use VBSL to copy the sign bit.
6001 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
6002 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
6003 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
6004 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
6005 if (VT == MVT::f64)
6006 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
6007 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
6008 DAG.getConstant(32, dl, MVT::i32));
6009 else /*if (VT == MVT::f32)*/
6010 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
6011 if (SrcVT == MVT::f32) {
6012 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
6013 if (VT == MVT::f64)
6014 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
6015 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
6016 DAG.getConstant(32, dl, MVT::i32));
6017 } else if (VT == MVT::f32)
6018 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
6019 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
6020 DAG.getConstant(32, dl, MVT::i32));
6021 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
6022 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
6023
6024 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
6025 dl, MVT::i32);
6026 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
6027 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
6028 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
6029
6030 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
6031 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
6032 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
6033 if (VT == MVT::f32) {
6034 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
6035 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
6036 DAG.getConstant(0, dl, MVT::i32));
6037 } else {
6038 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
6039 }
6040
6041 return Res;
6042 }
6043
6044 // Bitcast operand 1 to i32.
6045 if (SrcVT == MVT::f64)
6046 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6047 Tmp1).getValue(1);
6048 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6049
6050 // Or in the signbit with integer operations.
6051 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6052 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6053 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6054 if (VT == MVT::f32) {
6055 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6056 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6057 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6058 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6059 }
6060
6061 // f64: Or the high part with signbit and then combine two parts.
6062 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6063 Tmp0);
6064 SDValue Lo = Tmp0.getValue(0);
6065 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6066 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6067 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6068}
6069
6070SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6071 MachineFunction &MF = DAG.getMachineFunction();
6072 MachineFrameInfo &MFI = MF.getFrameInfo();
6073 MFI.setReturnAddressIsTaken(true);
6074
6075 EVT VT = Op.getValueType();
6076 SDLoc dl(Op);
6077 unsigned Depth = Op.getConstantOperandVal(0);
6078 if (Depth) {
6079 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6080 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6081 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
6082 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
6083 MachinePointerInfo());
6084 }
6085
6086 // Return LR, which contains the return address. Mark it an implicit live-in.
6087 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6088 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
6089}
6090
6091SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6092 const ARMBaseRegisterInfo &ARI =
6093 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6094 MachineFunction &MF = DAG.getMachineFunction();
6095 MachineFrameInfo &MFI = MF.getFrameInfo();
6096 MFI.setFrameAddressIsTaken(true);
6097
6098 EVT VT = Op.getValueType();
6099 SDLoc dl(Op); // FIXME probably not meaningful
6100 unsigned Depth = Op.getConstantOperandVal(0);
6101 Register FrameReg = ARI.getFrameRegister(MF);
6102 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6103 while (Depth--)
6104 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
6105 MachinePointerInfo());
6106 return FrameAddr;
6107}
6108
6109// FIXME? Maybe this could be a TableGen attribute on some registers and
6110// this table could be generated automatically from RegInfo.
6111Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6112 const MachineFunction &MF) const {
6113 return StringSwitch<Register>(RegName)
6114 .Case("sp", ARM::SP)
6115 .Default(Register());
6116}
6117
6118// Result is 64 bit value so split into two 32 bit values and return as a
6119// pair of values.
6120 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6121 SelectionDAG &DAG) {
6122 SDLoc DL(N);
6123
6124 // This function is only supposed to be called for i64 type destination.
6125 assert(N->getValueType(0) == MVT::i64
6126 && "ExpandREAD_REGISTER called for non-i64 type result.");
6127
6128 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6129 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6130 N->getOperand(0),
6131 N->getOperand(1));
6132
6133 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6134 Read.getValue(1)));
6135 Results.push_back(Read.getValue(2)); // Chain
6136}
6137
6138/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6139/// When \p DstVT, the destination type of \p BC, is on the vector
6140/// register bank and the source of bitcast, \p Op, operates on the same bank,
6141/// it might be possible to combine them, such that everything stays on the
6142/// vector register bank.
6143 /// \return The node that would replace \p BC, if the combine
6144/// is possible.
6145 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6146 SelectionDAG &DAG) {
6147 SDValue Op = BC->getOperand(0);
6148 EVT DstVT = BC->getValueType(0);
6149
6150 // The only vector instruction that can produce a scalar (remember,
6151 // since the bitcast was about to be turned into VMOVDRR, the source
6152 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6153 // Moreover, we can do this combine only if there is one use.
6154 // Finally, if the destination type is not a vector, there is not
6155 // much point in forcing everything onto the vector bank.
6156 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6157 !Op.hasOneUse())
6158 return SDValue();
6159
6160 // If the index is not constant, we will introduce an additional
6161 // multiply that will stick.
6162 // Give up in that case.
6163 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6164 if (!Index)
6165 return SDValue();
6166 unsigned DstNumElt = DstVT.getVectorNumElements();
6167
6168 // Compute the new index.
6169 const APInt &APIntIndex = Index->getAPIntValue();
6170 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6171 NewIndex *= APIntIndex;
6172 // Check if the new constant index fits into i32.
6173 if (NewIndex.getBitWidth() > 32)
6174 return SDValue();
6175
6176 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6177 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6178 SDLoc dl(Op);
6179 SDValue ExtractSrc = Op.getOperand(0);
6180 EVT VecVT = EVT::getVectorVT(
6181 *DAG.getContext(), DstVT.getScalarType(),
6182 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6183 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6184 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6185 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6186}
6187
6188/// ExpandBITCAST - If the target supports VFP, this function is called to
6189/// expand a bit convert where either the source or destination type is i64 to
6190/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6191/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6192/// vectors), since the legalizer won't know what to do with that.
6193SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6194 const ARMSubtarget *Subtarget) const {
6195 SDLoc dl(N);
6196 SDValue Op = N->getOperand(0);
6197
6198 // This function is only supposed to be called for i16 and i64 types, either
6199 // as the source or destination of the bit convert.
6200 EVT SrcVT = Op.getValueType();
6201 EVT DstVT = N->getValueType(0);
6202
6203 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6204 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6205 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6206 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6207
6208 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6209 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6210 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6211 Op = DAG.getBitcast(MVT::f16, Op);
6212 return DAG.getNode(
6213 ISD::TRUNCATE, SDLoc(N), DstVT,
6214 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6215 }
6216
6217 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6218 return SDValue();
6219
6220 // Turn i64->f64 into VMOVDRR.
6221 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6222 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6223 // if we can combine the bitcast with its source.
6224 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6225 return Val;
6226 SDValue Lo, Hi;
6227 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6228 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6229 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6230 }
6231
6232 // Turn f64->i64 into VMOVRRD.
6233 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6234 SDValue Cvt;
6235 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6236 SrcVT.getVectorNumElements() > 1)
6237 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6238 DAG.getVTList(MVT::i32, MVT::i32),
6239 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6240 else
6241 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6242 DAG.getVTList(MVT::i32, MVT::i32), Op);
6243 // Merge the pieces into a single i64 value.
6244 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6245 }
6246
6247 return SDValue();
6248}
6249
6250/// getZeroVector - Returns a vector of specified type with all zero elements.
6251/// Zero vectors are used to represent vector negation and in those cases
6252/// will be implemented with the NEON VNEG instruction. However, VNEG does
6253/// not support i64 elements, so sometimes the zero vectors will need to be
6254/// explicitly constructed. Regardless, use a canonical VMOV to create the
6255/// zero vector.
6256static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6257 assert(VT.isVector() && "Expected a vector type");
6258 // The canonical modified immediate encoding of a zero vector is....0!
6259 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6260 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6261 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6262 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6263}
6264
6265 /// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
6266 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
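/// For example (illustrative), shifting a 64-bit value right by Amt gives:
///   Lo = (Lo >> Amt) | (Hi << (32 - Amt)), Hi = Hi >> Amt          if Amt < 32
///   Lo = Hi >> (Amt - 32), Hi = 0 (or sign bits for SRA)           if Amt >= 32
/// and the two cases are selected below with CMOVs on (Amt - 32) >= 0.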
6267SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6268 SelectionDAG &DAG) const {
6269 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6270 EVT VT = Op.getValueType();
6271 unsigned VTBits = VT.getSizeInBits();
6272 SDLoc dl(Op);
6273 SDValue ShOpLo = Op.getOperand(0);
6274 SDValue ShOpHi = Op.getOperand(1);
6275 SDValue ShAmt = Op.getOperand(2);
6276 SDValue ARMcc;
6277 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6278
6279 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6280
6281 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6282 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6283 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6284 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6285 DAG.getConstant(VTBits, dl, MVT::i32));
6286 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6287 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6288 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6289 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6290 ISD::SETGE, ARMcc, DAG, dl);
6291 SDValue Lo =
6292 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6293
6294 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6295 SDValue HiBigShift = Opc == ISD::SRA
6296 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6297 DAG.getConstant(VTBits - 1, dl, VT))
6298 : DAG.getConstant(0, dl, VT);
6299 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6300 ISD::SETGE, ARMcc, DAG, dl);
6301 SDValue Hi =
6302 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6303
6304 SDValue Ops[2] = { Lo, Hi };
6305 return DAG.getMergeValues(Ops, dl);
6306}
6307
6308 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6309 /// i32 values and takes a 2 x i32 value to shift plus a shift amount.
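/// For example (illustrative), shifting a 64-bit value left by Amt gives:
///   Hi = (Hi << Amt) | (Lo >> (32 - Amt)), Lo = Lo << Amt   if Amt < 32
///   Hi = Lo << (Amt - 32), Lo = 0                           if Amt >= 32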
6310SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6311 SelectionDAG &DAG) const {
6312 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6313 EVT VT = Op.getValueType();
6314 unsigned VTBits = VT.getSizeInBits();
6315 SDLoc dl(Op);
6316 SDValue ShOpLo = Op.getOperand(0);
6317 SDValue ShOpHi = Op.getOperand(1);
6318 SDValue ShAmt = Op.getOperand(2);
6319 SDValue ARMcc;
6320
6321 assert(Op.getOpcode() == ISD::SHL_PARTS);
6322 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6323 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6324 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6325 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6326 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6327
6328 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6329 DAG.getConstant(VTBits, dl, MVT::i32));
6330 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6331 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6332 ISD::SETGE, ARMcc, DAG, dl);
6333 SDValue Hi =
6334 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6335
6336 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6337 ISD::SETGE, ARMcc, DAG, dl);
6338 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6339 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6340 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6341
6342 SDValue Ops[2] = { Lo, Hi };
6343 return DAG.getMergeValues(Ops, dl);
6344}
6345
6346SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6347 SelectionDAG &DAG) const {
6348 // The rounding mode is in bits 23:22 of the FPSCR.
6349 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6350 // The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3),
6351 // so that the shift and the AND get folded into a bitfield extract.
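// For example: FPSCR[23:22] == 0b00 (round to nearest) gives (0 + 1) & 3 == 1,
// 0b01 (towards +infinity) gives 2, 0b10 (towards -infinity) gives 3, and
// 0b11 (towards zero) gives (3 + 1) & 3 == 0, matching the FLT_ROUNDS encoding.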
6352 SDLoc dl(Op);
6353 SDValue Chain = Op.getOperand(0);
6354 SDValue Ops[] = {Chain,
6355 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6356
6357 SDValue FPSCR =
6358 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6359 Chain = FPSCR.getValue(1);
6360 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6361 DAG.getConstant(1U << 22, dl, MVT::i32));
6362 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6363 DAG.getConstant(22, dl, MVT::i32));
6364 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6365 DAG.getConstant(3, dl, MVT::i32));
6366 return DAG.getMergeValues({And, Chain}, dl);
6367}
6368
6369SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6370 SelectionDAG &DAG) const {
6371 SDLoc DL(Op);
6372 SDValue Chain = Op->getOperand(0);
6373 SDValue RMValue = Op->getOperand(1);
6374
6375 // The rounding mode is in bits 23:22 of the FPSCR.
6376 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6377 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6378 // (((arg - 1) & 3) << 22).
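// For example: an argument of 1 (to nearest) gives ((1 - 1) & 3) == 0 (ARM RN),
// while an argument of 0 (towards zero) gives ((0 - 1) & 3) == 3 (ARM RZ).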
6379 //
6380 // It is expected that the argument of llvm.set.rounding is within the
6381 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
6382 // responsibility of the code that generates llvm.set.rounding to ensure this
6383 // condition.
6384
6385 // Calculate new value of FPSCR[23:22].
6386 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6387 DAG.getConstant(1, DL, MVT::i32));
6388 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6389 DAG.getConstant(0x3, DL, MVT::i32));
6390 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6391 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6392
6393 // Get current value of FPSCR.
6394 SDValue Ops[] = {Chain,
6395 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6396 SDValue FPSCR =
6397 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6398 Chain = FPSCR.getValue(1);
6399 FPSCR = FPSCR.getValue(0);
6400
6401 // Put new rounding mode into FPSCR[23:22].
6402 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6403 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6404 DAG.getConstant(RMMask, DL, MVT::i32));
6405 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6406 SDValue Ops2[] = {
6407 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6408 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6409}
6410
6411SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6412 SelectionDAG &DAG) const {
6413 SDLoc DL(Op);
6414 SDValue Chain = Op->getOperand(0);
6415 SDValue Mode = Op->getOperand(1);
6416
6417 // Generate nodes to build:
6418 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6419 SDValue Ops[] = {Chain,
6420 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6421 SDValue FPSCR =
6422 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6423 Chain = FPSCR.getValue(1);
6424 FPSCR = FPSCR.getValue(0);
6425
6426 SDValue FPSCRMasked =
6427 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6428 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6429 SDValue InputMasked =
6430 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6431 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6432 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6433
6434 SDValue Ops2[] = {
6435 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6436 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6437}
6438
6439SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6440 SelectionDAG &DAG) const {
6441 SDLoc DL(Op);
6442 SDValue Chain = Op->getOperand(0);
6443
6444 // To get the default FP mode all control bits are cleared:
6445 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6446 SDValue Ops[] = {Chain,
6447 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6448 SDValue FPSCR =
6449 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6450 Chain = FPSCR.getValue(1);
6451 FPSCR = FPSCR.getValue(0);
6452
6453 SDValue FPSCRMasked = DAG.getNode(
6454 ISD::AND, DL, MVT::i32, FPSCR,
6455 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6456 SDValue Ops2[] = {Chain,
6457 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6458 FPSCRMasked};
6459 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6460}
6461
6462 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6463 const ARMSubtarget *ST) {
6464 SDLoc dl(N);
6465 EVT VT = N->getValueType(0);
6466 if (VT.isVector() && ST->hasNEON()) {
6467
6468 // Compute the least significant set bit: LSB = X & -X
6469 SDValue X = N->getOperand(0);
6470 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6471 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6472
6473 EVT ElemTy = VT.getVectorElementType();
6474
6475 if (ElemTy == MVT::i8) {
6476 // Compute with: cttz(x) = ctpop(lsb - 1)
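// For example, x = 0b01001000: lsb = x & -x = 0b00001000,
// lsb - 1 = 0b00000111, and ctpop(0b00000111) = 3 = cttz(x).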
6477 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6478 DAG.getTargetConstant(1, dl, ElemTy));
6479 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6480 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6481 }
6482
6483 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6484 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6485 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
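// For example, for a 32-bit element x = 8: lsb = 8, ctlz(8) = 28, and
// (32 - 1) - 28 = 3 = cttz(8).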
6486 unsigned NumBits = ElemTy.getSizeInBits();
6487 SDValue WidthMinus1 =
6488 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6489 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6490 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6491 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6492 }
6493
6494 // Compute with: cttz(x) = ctpop(lsb - 1)
6495
6496 // Compute LSB - 1.
6497 SDValue Bits;
6498 if (ElemTy == MVT::i64) {
6499 // Load constant 0xffff'ffff'ffff'ffff to register.
6500 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6501 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6502 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6503 } else {
6504 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6505 DAG.getTargetConstant(1, dl, ElemTy));
6506 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6507 }
6508 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6509 }
6510
6511 if (!ST->hasV6T2Ops())
6512 return SDValue();
6513
6514 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6515 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6516}
6517
6518 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6519 const ARMSubtarget *ST) {
6520 EVT VT = N->getValueType(0);
6521 SDLoc DL(N);
6522
6523 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6524 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6525 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6526 "Unexpected type for custom ctpop lowering");
6527
6528 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6529 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6530 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6531 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6532
6533 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
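// For example, for a v4i32 result the v16i8 per-byte counts are pairwise
// added (vpaddl.u8) to v8i16 and then again (vpaddl.u16) to v4i32, so each
// final lane holds the population count of its original 32-bit element.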
6534 unsigned EltSize = 8;
6535 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6536 while (EltSize != VT.getScalarSizeInBits()) {
6537 SmallVector<SDValue, 8> Ops;
6538 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6539 TLI.getPointerTy(DAG.getDataLayout())));
6540 Ops.push_back(Res);
6541
6542 EltSize *= 2;
6543 NumElts /= 2;
6544 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6545 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6546 }
6547
6548 return Res;
6549}
6550
6551 /// getVShiftImm - Check if this is a valid build_vector for the immediate
6552/// operand of a vector shift operation, where all the elements of the
6553/// build_vector must have the same constant integer value.
6554static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6555 // Ignore bit_converts.
6556 while (Op.getOpcode() == ISD::BITCAST)
6557 Op = Op.getOperand(0);
6558 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6559 APInt SplatBits, SplatUndef;
6560 unsigned SplatBitSize;
6561 bool HasAnyUndefs;
6562 if (!BVN ||
6563 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6564 ElementBits) ||
6565 SplatBitSize > ElementBits)
6566 return false;
6567 Cnt = SplatBits.getSExtValue();
6568 return true;
6569}
6570
6571/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6572/// operand of a vector shift left operation. That value must be in the range:
6573/// 0 <= Value < ElementBits for a left shift; or
6574/// 0 <= Value <= ElementBits for a long left shift.
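/// For example, with v4i16 elements (ElementBits = 16) a plain VSHL immediate
/// must lie in [0, 15], while a long shift (VSHLL) also accepts 16.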
6575static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6576 assert(VT.isVector() && "vector shift count is not a vector type");
6577 int64_t ElementBits = VT.getScalarSizeInBits();
6578 if (!getVShiftImm(Op, ElementBits, Cnt))
6579 return false;
6580 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6581}
6582
6583/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6584/// operand of a vector shift right operation. For a shift opcode, the value
6585/// is positive, but for an intrinsic the value count must be negative. The
6586/// absolute value must be in the range:
6587/// 1 <= |Value| <= ElementBits for a right shift; or
6588/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6589static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6590 int64_t &Cnt) {
6591 assert(VT.isVector() && "vector shift count is not a vector type");
6592 int64_t ElementBits = VT.getScalarSizeInBits();
6593 if (!getVShiftImm(Op, ElementBits, Cnt))
6594 return false;
6595 if (!isIntrinsic)
6596 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6597 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6598 Cnt = -Cnt;
6599 return true;
6600 }
6601 return false;
6602}
6603
6604 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6605 const ARMSubtarget *ST) {
6606 EVT VT = N->getValueType(0);
6607 SDLoc dl(N);
6608 int64_t Cnt;
6609
6610 if (!VT.isVector())
6611 return SDValue();
6612
6613 // We essentially have two forms here. Shift by an immediate and shift by a
6614 // vector register (there are also shift by a gpr, but that is just handled
6615 // with a tablegen pattern). We cannot easily match shift by an immediate in
6616 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6617 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6618 // signed or unsigned, and a negative shift indicates a shift right).
6619 if (N->getOpcode() == ISD::SHL) {
6620 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6621 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6622 DAG.getConstant(Cnt, dl, MVT::i32));
6623 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6624 N->getOperand(1));
6625 }
6626
6627 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6628 "unexpected vector shift opcode");
6629
6630 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6631 unsigned VShiftOpc =
6632 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6633 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6634 DAG.getConstant(Cnt, dl, MVT::i32));
6635 }
6636
6637 // Other right shifts we don't have operations for (we use a shift left by a
6638 // negative number).
6639 EVT ShiftVT = N->getOperand(1).getValueType();
6640 SDValue NegatedCount = DAG.getNode(
6641 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6642 unsigned VShiftOpc =
6643 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6644 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6645}
6646
6647 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6648 const ARMSubtarget *ST) {
6649 EVT VT = N->getValueType(0);
6650 SDLoc dl(N);
6651
6652 // We can get here for a node like i32 = ISD::SHL i32, i64
6653 if (VT != MVT::i64)
6654 return SDValue();
6655
6656 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6657 N->getOpcode() == ISD::SHL) &&
6658 "Unknown shift to lower!");
6659
6660 unsigned ShOpc = N->getOpcode();
6661 if (ST->hasMVEIntegerOps()) {
6662 SDValue ShAmt = N->getOperand(1);
6663 unsigned ShPartsOpc = ARMISD::LSLL;
6664 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6665
6666 // If the shift amount is greater than 32 or has a greater bitwidth than 64
6667 // then do the default optimisation
6668 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6669 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6670 return SDValue();
6671
6672 // Extract the lower 32 bits of the shift amount if it's not an i32
6673 if (ShAmt->getValueType(0) != MVT::i32)
6674 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6675
6676 if (ShOpc == ISD::SRL) {
6677 if (!Con)
6678 // There is no t2LSRLr instruction so negate and perform an lsll if the
6679 // shift amount is in a register, emulating a right shift.
6680 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6681 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6682 else
6683 // Else generate an lsrl on the immediate shift amount
6684 ShPartsOpc = ARMISD::LSRL;
6685 } else if (ShOpc == ISD::SRA)
6686 ShPartsOpc = ARMISD::ASRL;
6687
6688 // Split Lower/Upper 32 bits of the destination/source
6689 SDValue Lo, Hi;
6690 std::tie(Lo, Hi) =
6691 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6692 // Generate the shift operation as computed above
6693 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6694 ShAmt);
6695 // The upper 32 bits come from the second return value of lsll
6696 Hi = SDValue(Lo.getNode(), 1);
6697 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6698 }
6699
6700 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6701 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6702 return SDValue();
6703
6704 // If we are in thumb mode, we don't have RRX.
6705 if (ST->isThumb1Only())
6706 return SDValue();
6707
6708 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6709 SDValue Lo, Hi;
6710 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6711
6712 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6713 // captures the shifted out bit into a carry flag.
6714 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6715 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6716
6717 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6718 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
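// For example, shifting 0x00000001'00000000 right by one: LSRS1 gives
// Hi' = 0 with carry = 1 (the shifted-out bit), and RRX gives
// Lo' = (carry << 31) | (Lo >> 1) = 0x80000000, i.e. 0x00000000'80000000.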
6719
6720 // Merge the pieces into a single i64 value.
6721 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6722}
6723
6724 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6725 const ARMSubtarget *ST) {
6726 bool Invert = false;
6727 bool Swap = false;
6728 unsigned Opc = ARMCC::AL;
6729
6730 SDValue Op0 = Op.getOperand(0);
6731 SDValue Op1 = Op.getOperand(1);
6732 SDValue CC = Op.getOperand(2);
6733 EVT VT = Op.getValueType();
6734 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6735 SDLoc dl(Op);
6736
6737 EVT CmpVT;
6738 if (ST->hasNEON())
6739 CmpVT = VT.changeVectorElementTypeToInteger();
6740 else {
6741 assert(ST->hasMVEIntegerOps() &&
6742 "No hardware support for integer vector comparison!");
6743
6744 if (Op.getValueType().getVectorElementType() != MVT::i1)
6745 return SDValue();
6746
6747 // Make sure we expand floating point setcc to scalar if we do not have
6748 // mve.fp, so that we can handle them from there.
6749 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6750 return SDValue();
6751
6752 CmpVT = VT;
6753 }
6754
6755 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6756 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6757 // Special-case integer 64-bit equality comparisons. They aren't legal,
6758 // but they can be lowered with a few vector instructions.
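// For example, v2i64 a == b is checked as a v4i32 compare, which is then
// ANDed with its VREV64-swapped self, so each 64-bit lane ends up all-ones
// only if both of its 32-bit halves compared equal.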
6759 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6760 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6761 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6762 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6763 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6764 DAG.getCondCode(ISD::SETEQ));
6765 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6766 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6767 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6768 if (SetCCOpcode == ISD::SETNE)
6769 Merged = DAG.getNOT(dl, Merged, CmpVT);
6770 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6771 return Merged;
6772 }
6773
6774 if (CmpVT.getVectorElementType() == MVT::i64)
6775 // 64-bit comparisons are not legal in general.
6776 return SDValue();
6777
6778 if (Op1.getValueType().isFloatingPoint()) {
6779 switch (SetCCOpcode) {
6780 default: llvm_unreachable("Illegal FP comparison");
6781 case ISD::SETUNE:
6782 case ISD::SETNE:
6783 if (ST->hasMVEFloatOps()) {
6784 Opc = ARMCC::NE; break;
6785 } else {
6786 Invert = true; [[fallthrough]];
6787 }
6788 case ISD::SETOEQ:
6789 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6790 case ISD::SETOLT:
6791 case ISD::SETLT: Swap = true; [[fallthrough]];
6792 case ISD::SETOGT:
6793 case ISD::SETGT: Opc = ARMCC::GT; break;
6794 case ISD::SETOLE:
6795 case ISD::SETLE: Swap = true; [[fallthrough]];
6796 case ISD::SETOGE:
6797 case ISD::SETGE: Opc = ARMCC::GE; break;
6798 case ISD::SETUGE: Swap = true; [[fallthrough]];
6799 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6800 case ISD::SETUGT: Swap = true; [[fallthrough]];
6801 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6802 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6803 case ISD::SETONE: {
6804 // Expand this to (OLT | OGT).
6805 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6806 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6807 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6808 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6809 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6810 if (Invert)
6811 Result = DAG.getNOT(dl, Result, VT);
6812 return Result;
6813 }
6814 case ISD::SETUO: Invert = true; [[fallthrough]];
6815 case ISD::SETO: {
6816 // Expand this to (OLT | OGE).
6817 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6818 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6819 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6820 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6821 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6822 if (Invert)
6823 Result = DAG.getNOT(dl, Result, VT);
6824 return Result;
6825 }
6826 }
6827 } else {
6828 // Integer comparisons.
6829 switch (SetCCOpcode) {
6830 default: llvm_unreachable("Illegal integer comparison");
6831 case ISD::SETNE:
6832 if (ST->hasMVEIntegerOps()) {
6833 Opc = ARMCC::NE; break;
6834 } else {
6835 Invert = true; [[fallthrough]];
6836 }
6837 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6838 case ISD::SETLT: Swap = true; [[fallthrough]];
6839 case ISD::SETGT: Opc = ARMCC::GT; break;
6840 case ISD::SETLE: Swap = true; [[fallthrough]];
6841 case ISD::SETGE: Opc = ARMCC::GE; break;
6842 case ISD::SETULT: Swap = true; [[fallthrough]];
6843 case ISD::SETUGT: Opc = ARMCC::HI; break;
6844 case ISD::SETULE: Swap = true; [[fallthrough]];
6845 case ISD::SETUGE: Opc = ARMCC::HS; break;
6846 }
6847
6848 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6849 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6850 SDValue AndOp;
6851 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6852 AndOp = Op0;
6853 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6854 AndOp = Op1;
6855
6856 // Ignore bitconvert.
6857 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6858 AndOp = AndOp.getOperand(0);
6859
6860 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6861 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6862 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6863 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6864 if (!Invert)
6865 Result = DAG.getNOT(dl, Result, VT);
6866 return Result;
6867 }
6868 }
6869 }
6870
6871 if (Swap)
6872 std::swap(Op0, Op1);
6873
6874 // If one of the operands is a constant vector zero, attempt to fold the
6875 // comparison to a specialized compare-against-zero form.
6876 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6877 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6878 Opc == ARMCC::NE)) {
6879 if (Opc == ARMCC::GE)
6880 Opc = ARMCC::LE;
6881 else if (Opc == ARMCC::GT)
6882 Opc = ARMCC::LT;
6883 std::swap(Op0, Op1);
6884 }
6885
6886 SDValue Result;
6887 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6888 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6889 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6890 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6891 DAG.getConstant(Opc, dl, MVT::i32));
6892 else
6893 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6894 DAG.getConstant(Opc, dl, MVT::i32));
6895
6896 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6897
6898 if (Invert)
6899 Result = DAG.getNOT(dl, Result, VT);
6900
6901 return Result;
6902}
6903
6904 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6905 SDValue LHS = Op.getOperand(0);
6906 SDValue RHS = Op.getOperand(1);
6907 SDValue Carry = Op.getOperand(2);
6908 SDValue Cond = Op.getOperand(3);
6909 SDLoc DL(Op);
6910
6911 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6912
6913 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6914 // have to invert the carry first.
6915 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6916 DAG.getConstant(1, DL, MVT::i32), Carry);
6917 // This converts the boolean value carry into the carry flag.
6918 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6919
6920 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6921 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6922
6923 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6924 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6925 SDValue ARMcc = DAG.getConstant(
6926 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6927 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6928 Cmp.getValue(1));
6929}
6930
6931/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6932/// valid vector constant for a NEON or MVE instruction with a "modified
6933/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6934static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6935 unsigned SplatBitSize, SelectionDAG &DAG,
6936 const SDLoc &dl, EVT &VT, EVT VectorVT,
6937 VMOVModImmType type) {
6938 unsigned OpCmode, Imm;
6939 bool is128Bits = VectorVT.is128BitVector();
6940
6941 // SplatBitSize is set to the smallest size that splats the vector, so a
6942 // zero vector will always have SplatBitSize == 8. However, NEON modified
6943 // immediate instructions other than VMOV do not support the 8-bit encoding
6944 // of a zero vector, and the default encoding of zero is supposed to be the
6945 // 32-bit version.
6946 if (SplatBits == 0)
6947 SplatBitSize = 32;
6948
6949 switch (SplatBitSize) {
6950 case 8:
6951 if (type != VMOVModImm)
6952 return SDValue();
6953 // Any 1-byte value is OK. Op=0, Cmode=1110.
6954 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6955 OpCmode = 0xe;
6956 Imm = SplatBits;
6957 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6958 break;
6959
6960 case 16:
6961 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6962 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6963 if ((SplatBits & ~0xff) == 0) {
6964 // Value = 0x00nn: Op=x, Cmode=100x.
6965 OpCmode = 0x8;
6966 Imm = SplatBits;
6967 break;
6968 }
6969 if ((SplatBits & ~0xff00) == 0) {
6970 // Value = 0xnn00: Op=x, Cmode=101x.
6971 OpCmode = 0xa;
6972 Imm = SplatBits >> 8;
6973 break;
6974 }
6975 return SDValue();
6976
6977 case 32:
6978 // NEON's 32-bit VMOV supports splat values where:
6979 // * only one byte is nonzero, or
6980 // * the least significant byte is 0xff and the second byte is nonzero, or
6981 // * the least significant 2 bytes are 0xff and the third is nonzero.
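// For example, a splat of 0x0000ab00 has a single nonzero byte and is
// encoded below as Op=x, Cmode=001x with Imm = 0xab.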
6982 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6983 if ((SplatBits & ~0xff) == 0) {
6984 // Value = 0x000000nn: Op=x, Cmode=000x.
6985 OpCmode = 0;
6986 Imm = SplatBits;
6987 break;
6988 }
6989 if ((SplatBits & ~0xff00) == 0) {
6990 // Value = 0x0000nn00: Op=x, Cmode=001x.
6991 OpCmode = 0x2;
6992 Imm = SplatBits >> 8;
6993 break;
6994 }
6995 if ((SplatBits & ~0xff0000) == 0) {
6996 // Value = 0x00nn0000: Op=x, Cmode=010x.
6997 OpCmode = 0x4;
6998 Imm = SplatBits >> 16;
6999 break;
7000 }
7001 if ((SplatBits & ~0xff000000) == 0) {
7002 // Value = 0xnn000000: Op=x, Cmode=011x.
7003 OpCmode = 0x6;
7004 Imm = SplatBits >> 24;
7005 break;
7006 }
7007
7008 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
7009 if (type == OtherModImm) return SDValue();
7010
7011 if ((SplatBits & ~0xffff) == 0 &&
7012 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
7013 // Value = 0x0000nnff: Op=x, Cmode=1100.
7014 OpCmode = 0xc;
7015 Imm = SplatBits >> 8;
7016 break;
7017 }
7018
7019 // cmode == 0b1101 is not supported for MVE VMVN
7020 if (type == MVEVMVNModImm)
7021 return SDValue();
7022
7023 if ((SplatBits & ~0xffffff) == 0 &&
7024 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
7025 // Value = 0x00nnffff: Op=x, Cmode=1101.
7026 OpCmode = 0xd;
7027 Imm = SplatBits >> 16;
7028 break;
7029 }
7030
7031 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
7032 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
7033 // VMOV.I32. A (very) minor optimization would be to replicate the value
7034 // and fall through here to test for a valid 64-bit splat. But, then the
7035 // caller would also need to check and handle the change in size.
7036 return SDValue();
7037
7038 case 64: {
7039 if (type != VMOVModImm)
7040 return SDValue();
7041 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
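// For example, SplatBits = 0x00ff00ff00ff00ff encodes as Imm = 0b01010101
// (one bit per byte, least significant byte first).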
7042 uint64_t BitMask = 0xff;
7043 unsigned ImmMask = 1;
7044 Imm = 0;
7045 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7046 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7047 Imm |= ImmMask;
7048 } else if ((SplatBits & BitMask) != 0) {
7049 return SDValue();
7050 }
7051 BitMask <<= 8;
7052 ImmMask <<= 1;
7053 }
7054
7055 // Op=1, Cmode=1110.
7056 OpCmode = 0x1e;
7057 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7058 break;
7059 }
7060
7061 default:
7062 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7063 }
7064
7065 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
7066 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7067}
7068
7069SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7070 const ARMSubtarget *ST) const {
7071 EVT VT = Op.getValueType();
7072 bool IsDouble = (VT == MVT::f64);
7073 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
7074 const APFloat &FPVal = CFP->getValueAPF();
7075
7076 // Prevent floating-point constants from using literal loads
7077 // when execute-only is enabled.
7078 if (ST->genExecuteOnly()) {
7079 // We shouldn't trigger this for v6m execute-only
7080 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7081 "Unexpected architecture");
7082
7083 // If we can represent the constant as an immediate, don't lower it
7084 if (isFPImmLegal(FPVal, VT))
7085 return Op;
7086 // Otherwise, construct as integer, and move to float register
7087 APInt INTVal = FPVal.bitcastToAPInt();
7088 SDLoc DL(CFP);
7089 switch (VT.getSimpleVT().SimpleTy) {
7090 default:
7091 llvm_unreachable("Unknown floating point type!");
7092 break;
7093 case MVT::f64: {
7094 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7095 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7096 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7097 }
7098 case MVT::f32:
7099 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7100 DAG.getConstant(INTVal, DL, MVT::i32));
7101 }
7102 }
7103
7104 if (!ST->hasVFP3Base())
7105 return SDValue();
7106
7107 // Use the default (constant pool) lowering for double constants when we have
7108 // an SP-only FPU
7109 if (IsDouble && !Subtarget->hasFP64())
7110 return SDValue();
7111
7112 // Try splatting with a VMOV.f32...
7113 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
7114
7115 if (ImmVal != -1) {
7116 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7117 // We have code in place to select a valid ConstantFP already, no need to
7118 // do any mangling.
7119 return Op;
7120 }
7121
7122 // It's a float and we are trying to use NEON operations where
7123 // possible. Lower it to a splat followed by an extract.
7124 SDLoc DL(Op);
7125 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7126 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7127 NewVal);
7128 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7129 DAG.getConstant(0, DL, MVT::i32));
7130 }
7131
7132 // The rest of our options are NEON only, make sure that's allowed before
7133 // proceeding..
7134 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7135 return SDValue();
7136
7137 EVT VMovVT;
7138 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7139
7140 // It wouldn't really be worth bothering for doubles except for one very
7141 // important value, which does happen to match: 0.0. So make sure we don't do
7142 // anything stupid.
7143 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7144 return SDValue();
7145
7146 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7147 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
7148 VMovVT, VT, VMOVModImm);
7149 if (NewVal != SDValue()) {
7150 SDLoc DL(Op);
7151 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
7152 NewVal);
7153 if (IsDouble)
7154 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7155
7156 // It's a float: cast and extract a vector element.
7157 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7158 VecConstant);
7159 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7160 DAG.getConstant(0, DL, MVT::i32));
7161 }
7162
7163 // Finally, try a VMVN.i32
7164 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
7165 VT, VMVNModImm);
7166 if (NewVal != SDValue()) {
7167 SDLoc DL(Op);
7168 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7169
7170 if (IsDouble)
7171 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7172
7173 // It's a float: cast and extract a vector element.
7174 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7175 VecConstant);
7176 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7177 DAG.getConstant(0, DL, MVT::i32));
7178 }
7179
7180 return SDValue();
7181}
7182
7183 // Check if a VEXT instruction can handle the shuffle mask when the
7184// vector sources of the shuffle are the same.
7185static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7186 unsigned NumElts = VT.getVectorNumElements();
7187
7188 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7189 if (M[0] < 0)
7190 return false;
7191
7192 Imm = M[0];
7193
7194 // If this is a VEXT shuffle, the immediate value is the index of the first
7195 // element. The other shuffle indices must be the successive elements after
7196 // the first one.
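// For example, for v8i8 with mask <3, 4, 5, 6, 7, 0, 1, 2>, Imm = 3 and the
// expected indices wrap back to 0 because both sources are the same vector.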
7197 unsigned ExpectedElt = Imm;
7198 for (unsigned i = 1; i < NumElts; ++i) {
7199 // Increment the expected index. If it wraps around, just follow it
7200 // back to index zero and keep going.
7201 ++ExpectedElt;
7202 if (ExpectedElt == NumElts)
7203 ExpectedElt = 0;
7204
7205 if (M[i] < 0) continue; // ignore UNDEF indices
7206 if (ExpectedElt != static_cast<unsigned>(M[i]))
7207 return false;
7208 }
7209
7210 return true;
7211}
7212
7213static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7214 bool &ReverseVEXT, unsigned &Imm) {
7215 unsigned NumElts = VT.getVectorNumElements();
7216 ReverseVEXT = false;
7217
7218 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7219 if (M[0] < 0)
7220 return false;
7221
7222 Imm = M[0];
7223
7224 // If this is a VEXT shuffle, the immediate value is the index of the first
7225 // element. The other shuffle indices must be the successive elements after
7226 // the first one.
7227 unsigned ExpectedElt = Imm;
7228 for (unsigned i = 1; i < NumElts; ++i) {
7229 // Increment the expected index. If it wraps around, it may still be
7230 // a VEXT but the source vectors must be swapped.
7231 ExpectedElt += 1;
7232 if (ExpectedElt == NumElts * 2) {
7233 ExpectedElt = 0;
7234 ReverseVEXT = true;
7235 }
7236
7237 if (M[i] < 0) continue; // ignore UNDEF indices
7238 if (ExpectedElt != static_cast<unsigned>(M[i]))
7239 return false;
7240 }
7241
7242 // Adjust the index value if the source operands will be swapped.
7243 if (ReverseVEXT)
7244 Imm -= NumElts;
7245
7246 return true;
7247}
7248
7249static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7250 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7251 // range, then 0 is placed into the resulting vector. So pretty much any mask
7252 // of 8 elements can work here.
7253 return VT == MVT::v8i8 && M.size() == 8;
7254}
7255
7256static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7257 unsigned Index) {
7258 if (Mask.size() == Elements * 2)
7259 return Index / Elements;
7260 return Mask[Index] == 0 ? 0 : 1;
7261}
7262
7263// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7264// checking that pairs of elements in the shuffle mask represent the same index
7265// in each vector, incrementing the expected index by 2 at each step.
7266// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7267// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7268// v2={e,f,g,h}
7269// WhichResult gives the offset for each element in the mask based on which
7270// of the two results it belongs to.
7271//
7272// The transpose can be represented either as:
7273// result1 = shufflevector v1, v2, result1_shuffle_mask
7274// result2 = shufflevector v1, v2, result2_shuffle_mask
7275// where v1/v2 and the shuffle masks have the same number of elements
7276// (here WhichResult (see below) indicates which result is being checked)
7277//
7278// or as:
7279// results = shufflevector v1, v2, shuffle_mask
7280// where both results are returned in one vector and the shuffle mask has twice
7281// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7282// want to check the low half and high half of the shuffle mask as if it were
7283// the other case
7284static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7285 unsigned EltSz = VT.getScalarSizeInBits();
7286 if (EltSz == 64)
7287 return false;
7288
7289 unsigned NumElts = VT.getVectorNumElements();
7290 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7291 return false;
7292
7293 // If the mask is twice as long as the input vector then we need to check the
7294 // upper and lower parts of the mask with a matching value for WhichResult
7295 // FIXME: A mask with only even values will be rejected in case the first
7296 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7297 // M[0] is used to determine WhichResult
7298 for (unsigned i = 0; i < M.size(); i += NumElts) {
7299 WhichResult = SelectPairHalf(NumElts, M, i);
7300 for (unsigned j = 0; j < NumElts; j += 2) {
7301 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7302 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7303 return false;
7304 }
7305 }
7306
7307 if (M.size() == NumElts*2)
7308 WhichResult = 0;
7309
7310 return true;
7311}
7312
7313/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7314/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7315/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7316static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7317 unsigned EltSz = VT.getScalarSizeInBits();
7318 if (EltSz == 64)
7319 return false;
7320
7321 unsigned NumElts = VT.getVectorNumElements();
7322 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7323 return false;
7324
7325 for (unsigned i = 0; i < M.size(); i += NumElts) {
7326 WhichResult = SelectPairHalf(NumElts, M, i);
7327 for (unsigned j = 0; j < NumElts; j += 2) {
7328 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7329 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7330 return false;
7331 }
7332 }
7333
7334 if (M.size() == NumElts*2)
7335 WhichResult = 0;
7336
7337 return true;
7338}
7339
7340// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7341// that the mask elements are either all even and in steps of size 2 or all odd
7342// and in steps of size 2.
7343// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7344// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7345// v2={e,f,g,h}
7346 // Requires similar checks to that of isVTRNMask with
7347 // respect to how the results are returned.
7348static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7349 unsigned EltSz = VT.getScalarSizeInBits();
7350 if (EltSz == 64)
7351 return false;
7352
7353 unsigned NumElts = VT.getVectorNumElements();
7354 if (M.size() != NumElts && M.size() != NumElts*2)
7355 return false;
7356
7357 for (unsigned i = 0; i < M.size(); i += NumElts) {
7358 WhichResult = SelectPairHalf(NumElts, M, i);
7359 for (unsigned j = 0; j < NumElts; ++j) {
7360 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7361 return false;
7362 }
7363 }
7364
7365 if (M.size() == NumElts*2)
7366 WhichResult = 0;
7367
7368 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7369 if (VT.is64BitVector() && EltSz == 32)
7370 return false;
7371
7372 return true;
7373}
7374
7375/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7376/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7377 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7378static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7379 unsigned EltSz = VT.getScalarSizeInBits();
7380 if (EltSz == 64)
7381 return false;
7382
7383 unsigned NumElts = VT.getVectorNumElements();
7384 if (M.size() != NumElts && M.size() != NumElts*2)
7385 return false;
7386
7387 unsigned Half = NumElts / 2;
7388 for (unsigned i = 0; i < M.size(); i += NumElts) {
7389 WhichResult = SelectPairHalf(NumElts, M, i);
7390 for (unsigned j = 0; j < NumElts; j += Half) {
7391 unsigned Idx = WhichResult;
7392 for (unsigned k = 0; k < Half; ++k) {
7393 int MIdx = M[i + j + k];
7394 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7395 return false;
7396 Idx += 2;
7397 }
7398 }
7399 }
7400
7401 if (M.size() == NumElts*2)
7402 WhichResult = 0;
7403
7404 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7405 if (VT.is64BitVector() && EltSz == 32)
7406 return false;
7407
7408 return true;
7409}
7410
7411// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7412// that pairs of elements of the shufflemask represent the same index in each
7413// vector incrementing sequentially through the vectors.
7414// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7415// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7416// v2={e,f,g,h}
7417 // Requires similar checks to that of isVTRNMask with respect to how the
7418 // results are returned.
7419static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7420 unsigned EltSz = VT.getScalarSizeInBits();
7421 if (EltSz == 64)
7422 return false;
7423
7424 unsigned NumElts = VT.getVectorNumElements();
7425 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7426 return false;
7427
7428 for (unsigned i = 0; i < M.size(); i += NumElts) {
7429 WhichResult = SelectPairHalf(NumElts, M, i);
7430 unsigned Idx = WhichResult * NumElts / 2;
7431 for (unsigned j = 0; j < NumElts; j += 2) {
7432 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7433 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7434 return false;
7435 Idx += 1;
7436 }
7437 }
7438
7439 if (M.size() == NumElts*2)
7440 WhichResult = 0;
7441
7442 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7443 if (VT.is64BitVector() && EltSz == 32)
7444 return false;
7445
7446 return true;
7447}
7448
7449/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7450/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7451/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7452static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7453 unsigned EltSz = VT.getScalarSizeInBits();
7454 if (EltSz == 64)
7455 return false;
7456
7457 unsigned NumElts = VT.getVectorNumElements();
7458 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7459 return false;
7460
7461 for (unsigned i = 0; i < M.size(); i += NumElts) {
7462 WhichResult = SelectPairHalf(NumElts, M, i);
7463 unsigned Idx = WhichResult * NumElts / 2;
7464 for (unsigned j = 0; j < NumElts; j += 2) {
7465 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7466 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7467 return false;
7468 Idx += 1;
7469 }
7470 }
7471
7472 if (M.size() == NumElts*2)
7473 WhichResult = 0;
7474
7475 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7476 if (VT.is64BitVector() && EltSz == 32)
7477 return false;
7478
7479 return true;
7480}
7481
7482/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7483/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7484static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7485 unsigned &WhichResult,
7486 bool &isV_UNDEF) {
7487 isV_UNDEF = false;
7488 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7489 return ARMISD::VTRN;
7490 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7491 return ARMISD::VUZP;
7492 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7493 return ARMISD::VZIP;
7494
7495 isV_UNDEF = true;
7496 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7497 return ARMISD::VTRN;
7498 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7499 return ARMISD::VUZP;
7500 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7501 return ARMISD::VZIP;
7502
7503 return 0;
7504}
7505
7506 /// \return true if this is a reverse operation on a vector.
7507static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7508 unsigned NumElts = VT.getVectorNumElements();
7509 // Make sure the mask has the right size.
7510 if (NumElts != M.size())
7511 return false;
7512
7513 // Look for <15, ..., 3, -1, 1, 0>.
7514 for (unsigned i = 0; i != NumElts; ++i)
7515 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7516 return false;
7517
7518 return true;
7519}
7520
7521static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7522 unsigned NumElts = VT.getVectorNumElements();
7523 // Make sure the mask has the right size.
7524 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7525 return false;
7526
7527 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7528 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7529 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7530 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7531 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7532 int Ofs = Top ? 1 : 0;
7533 int Upper = SingleSource ? 0 : NumElts;
7534 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7535 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7536 return false;
7537 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7538 return false;
7539 }
7540 return true;
7541}
7542
7543static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7544 unsigned NumElts = VT.getVectorNumElements();
7545 // Make sure the mask has the right size.
7546 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7547 return false;
7548
7549 // If Top
7550 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7551 // This inserts Input2 into Input1
7552 // else if not Top
7553 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7554 // This inserts Input1 into Input2
7555 unsigned Offset = Top ? 0 : 1;
7556 unsigned N = SingleSource ? 0 : NumElts;
7557 for (unsigned i = 0; i < NumElts; i += 2) {
7558 if (M[i] >= 0 && M[i] != (int)i)
7559 return false;
7560 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7561 return false;
7562 }
7563
7564 return true;
7565}
7566
7567static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7568 unsigned NumElts = ToVT.getVectorNumElements();
7569 if (NumElts != M.size())
7570 return false;
7571
7572 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7573 // looking for patterns of:
7574 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7575 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7576
7577 unsigned Off0 = rev ? NumElts / 2 : 0;
7578 unsigned Off1 = rev ? 0 : NumElts / 2;
7579 for (unsigned i = 0; i < NumElts; i += 2) {
7580 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7581 return false;
7582 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7583 return false;
7584 }
7585
7586 return true;
7587}
7588
7589// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7590// from a pair of inputs. For example:
7591// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7592// FP_ROUND(EXTRACT_ELT(Y, 0),
7593// FP_ROUND(EXTRACT_ELT(X, 1),
7594// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7595 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7596 const ARMSubtarget *ST) {
7597 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7598 if (!ST->hasMVEFloatOps())
7599 return SDValue();
7600
7601 SDLoc dl(BV);
7602 EVT VT = BV.getValueType();
7603 if (VT != MVT::v8f16)
7604 return SDValue();
7605
7606 // We are looking for a buildvector of fptrunc elements, where all the
7607 // elements are extracted, interleaved, from two sources. Check that the
7608 // first two items are valid enough and extract some info from them (they
7609 // are checked properly in the loop below).
7610 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7611 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7612 !isa<ConstantSDNode>(BV.getOperand(0).getOperand(0).getOperand(1)))
7613 return SDValue();
7614 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7615 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7616 !isa<ConstantSDNode>(BV.getOperand(1).getOperand(0).getOperand(1)))
7617 return SDValue();
7618 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7619 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7620 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7621 return SDValue();
7622
7623 // Check all the values in the BuildVector line up with our expectations.
7624 for (unsigned i = 1; i < 4; i++) {
7625 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7626 return Trunc.getOpcode() == ISD::FP_ROUND &&
7627 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7628 Trunc.getOperand(0).getOperand(0) == Op &&
7629 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7630 };
7631 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7632 return SDValue();
7633 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7634 return SDValue();
7635 }
7636
7637 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7638 DAG.getConstant(0, dl, MVT::i32));
7639 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7640 DAG.getConstant(1, dl, MVT::i32));
7641}
7642
7643// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7644// from a single input on alternating lanes. For example:
7645 // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
7646 // FP_EXTEND(EXTRACT_ELT(X, 2),
7647 // FP_EXTEND(EXTRACT_ELT(X, 4), ...)
7648 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7649 const ARMSubtarget *ST) {
7650 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7651 if (!ST->hasMVEFloatOps())
7652 return SDValue();
7653
7654 SDLoc dl(BV);
7655 EVT VT = BV.getValueType();
7656 if (VT != MVT::v4f32)
7657 return SDValue();
7658
7659 // We are looking for a buildvector of fpext elements, where all the
7660 // elements are alternating lanes from a single source. For example <0,2,4,6>
7661 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7662 // info from them (they are checked properly in the loop below).
7663 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7664 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7665 return SDValue();
7666 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7667 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7668 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7669 return SDValue();
7670
7671 // Check all the values in the BuildVector line up with our expectations.
7672 for (unsigned i = 1; i < 4; i++) {
7673 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7674 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7675 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7676 Trunc.getOperand(0).getOperand(0) == Op &&
7677 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7678 };
7679 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7680 return SDValue();
7681 }
7682
7683 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7684 DAG.getConstant(Offset, dl, MVT::i32));
7685}
7686
7687// If N is an integer constant that can be moved into a register in one
7688// instruction, return an SDValue of such a constant (will become a MOV
7689// instruction). Otherwise return null.
7690 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7691 const ARMSubtarget *ST, const SDLoc &dl) {
7692 uint64_t Val;
7693 if (!isa<ConstantSDNode>(N))
7694 return SDValue();
7695 Val = N->getAsZExtVal();
7696
7697 if (ST->isThumb1Only()) {
7698 if (Val <= 255 || ~Val <= 255)
7699 return DAG.getConstant(Val, dl, MVT::i32);
7700 } else {
7701 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7702 return DAG.getConstant(Val, dl, MVT::i32);
7703 }
7704 return SDValue();
7705}
7706
7707 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7708 const ARMSubtarget *ST) {
7709 SDLoc dl(Op);
7710 EVT VT = Op.getValueType();
7711
7712 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7713
7714 unsigned NumElts = VT.getVectorNumElements();
7715 unsigned BoolMask;
7716 unsigned BitsPerBool;
7717 if (NumElts == 2) {
7718 BitsPerBool = 8;
7719 BoolMask = 0xff;
7720 } else if (NumElts == 4) {
7721 BitsPerBool = 4;
7722 BoolMask = 0xf;
7723 } else if (NumElts == 8) {
7724 BitsPerBool = 2;
7725 BoolMask = 0x3;
7726 } else if (NumElts == 16) {
7727 BitsPerBool = 1;
7728 BoolMask = 0x1;
7729 } else
7730 return SDValue();
7731
7732 // If this is a single value copied into all lanes (a splat), we can just sign
7733 // extend that single value
7734 SDValue FirstOp = Op.getOperand(0);
7735 if (!isa<ConstantSDNode>(FirstOp) &&
7736 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7737 return U.get().isUndef() || U.get() == FirstOp;
7738 })) {
7739 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7740 DAG.getValueType(MVT::i1));
7741 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7742 }
7743
7744 // First create base with bits set where known
7745 unsigned Bits32 = 0;
7746 for (unsigned i = 0; i < NumElts; ++i) {
7747 SDValue V = Op.getOperand(i);
7748 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7749 continue;
7750 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7751 if (BitSet)
7752 Bits32 |= BoolMask << (i * BitsPerBool);
7753 }
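// For example, for the v4i1 constant <1, 0, 1, 1>, BitsPerBool = 4 and
// Bits32 = 0x000f | 0x0f00 | 0xf000 = 0xff0f.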
7754
7755 // Add in unknown nodes
7756 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7757 DAG.getConstant(Bits32, dl, MVT::i32));
7758 for (unsigned i = 0; i < NumElts; ++i) {
7759 SDValue V = Op.getOperand(i);
7760 if (isa<ConstantSDNode>(V) || V.isUndef())
7761 continue;
7762 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7763 DAG.getConstant(i, dl, MVT::i32));
7764 }
7765
7766 return Base;
7767}
7768
7769 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7770 const ARMSubtarget *ST) {
7771 if (!ST->hasMVEIntegerOps())
7772 return SDValue();
7773
7774 // We are looking for a buildvector where each element is Op[0] + i*N
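// For example, <x, x+2, x+4, x+6> is matched with N = 2 and lowered to a
// VIDUP starting at x with a step of 2.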
7775 EVT VT = Op.getValueType();
7776 SDValue Op0 = Op.getOperand(0);
7777 unsigned NumElts = VT.getVectorNumElements();
7778
7779 // Get the increment value from operand 1
7780 SDValue Op1 = Op.getOperand(1);
7781 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7782 !isa<ConstantSDNode>(Op1.getOperand(1)))
7783 return SDValue();
7784 unsigned N = Op1.getConstantOperandVal(1);
7785 if (N != 1 && N != 2 && N != 4 && N != 8)
7786 return SDValue();
7787
7788 // Check that each other operand matches
7789 for (unsigned I = 2; I < NumElts; I++) {
7790 SDValue OpI = Op.getOperand(I);
7791 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7792 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7793 OpI.getConstantOperandVal(1) != I * N)
7794 return SDValue();
7795 }
7796
7797 SDLoc DL(Op);
7798 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7799 DAG.getConstant(N, DL, MVT::i32));
7800}
7801
7802// Returns true if the operation N can be treated as qr instruction variant at
7803// operand Op.
7804static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7805 switch (N->getOpcode()) {
7806 case ISD::ADD:
7807 case ISD::MUL:
7808 case ISD::SADDSAT:
7809 case ISD::UADDSAT:
7810 case ISD::AVGFLOORS:
7811 case ISD::AVGFLOORU:
7812 return true;
7813 case ISD::SUB:
7814 case ISD::SSUBSAT:
7815 case ISD::USUBSAT:
7816 return N->getOperand(1).getNode() == Op;
7817 case ISD::INTRINSIC_WO_CHAIN:
7818 switch (N->getConstantOperandVal(0)) {
7819 case Intrinsic::arm_mve_add_predicated:
7820 case Intrinsic::arm_mve_mul_predicated:
7821 case Intrinsic::arm_mve_qadd_predicated:
7822 case Intrinsic::arm_mve_vhadd:
7823 case Intrinsic::arm_mve_hadd_predicated:
7824 case Intrinsic::arm_mve_vqdmulh:
7825 case Intrinsic::arm_mve_qdmulh_predicated:
7826 case Intrinsic::arm_mve_vqrdmulh:
7827 case Intrinsic::arm_mve_qrdmulh_predicated:
7828 case Intrinsic::arm_mve_vqdmull:
7829 case Intrinsic::arm_mve_vqdmull_predicated:
7830 return true;
7831 case Intrinsic::arm_mve_sub_predicated:
7832 case Intrinsic::arm_mve_qsub_predicated:
7833 case Intrinsic::arm_mve_vhsub:
7834 case Intrinsic::arm_mve_hsub_predicated:
7835 return N->getOperand(2).getNode() == Op;
7836 default:
7837 return false;
7838 }
7839 default:
7840 return false;
7841 }
7842}
7843
7844// If this is a case we can't handle, return null and let the default
7845// expansion code take care of it.
7846SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7847 const ARMSubtarget *ST) const {
7848 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7849 SDLoc dl(Op);
7850 EVT VT = Op.getValueType();
7851
7852 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7853 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7854
7855 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7856 return R;
7857
7858 APInt SplatBits, SplatUndef;
7859 unsigned SplatBitSize;
7860 bool HasAnyUndefs;
7861 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7862 if (SplatUndef.isAllOnes())
7863 return DAG.getUNDEF(VT);
7864
7865 // If all the users of this constant splat are qr instruction variants,
7866 // generate a vdup of the constant.
7867 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7868 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7869 all_of(BVN->users(),
7870 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7871 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7872 : SplatBitSize == 16 ? MVT::v8i16
7873 : MVT::v16i8;
7874 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7875 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7876 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7877 }
7878
7879 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7880 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7881 // Check if an immediate VMOV works.
7882 EVT VmovVT;
7883 SDValue Val =
7884 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7885 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7886
7887 if (Val.getNode()) {
7888 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7889 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7890 }
7891
7892 // Try an immediate VMVN.
7893 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7894 Val = isVMOVModifiedImm(
7895 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7896 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7897 if (Val.getNode()) {
7898 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7899 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7900 }
7901
7902 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7903 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7904 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7905 if (ImmVal != -1) {
7906 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7907 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7908 }
7909 }
7910
7911 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7912 // type.
7913 if (ST->hasMVEIntegerOps() &&
7914 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7915 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7916 : SplatBitSize == 16 ? MVT::v8i16
7917 : MVT::v16i8;
7918 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7919 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7920 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7921 }
7922 }
7923 }
7924
7925 // Scan through the operands to see if only one value is used.
7926 //
7927 // As an optimisation, even if more than one value is used it may be more
7928 // profitable to splat with one value then change some lanes.
7929 //
7930 // Heuristically we decide to do this if the vector has a "dominant" value,
7931 // defined as splatted to more than half of the lanes.
7932 unsigned NumElts = VT.getVectorNumElements();
7933 bool isOnlyLowElement = true;
7934 bool usesOnlyOneValue = true;
7935 bool hasDominantValue = false;
7936 bool isConstant = true;
7937
7938 // Map of the number of times a particular SDValue appears in the
7939 // element list.
7940 DenseMap<SDValue, unsigned> ValueCounts;
7941 SDValue Value;
7942 for (unsigned i = 0; i < NumElts; ++i) {
7943 SDValue V = Op.getOperand(i);
7944 if (V.isUndef())
7945 continue;
7946 if (i > 0)
7947 isOnlyLowElement = false;
7948 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7949 isConstant = false;
7950
7951 unsigned &Count = ValueCounts[V];
7952
7953 // Is this value dominant? (takes up more than half of the lanes)
7954 if (++Count > (NumElts / 2)) {
7955 hasDominantValue = true;
7956 Value = V;
7957 }
7958 }
7959 if (ValueCounts.size() != 1)
7960 usesOnlyOneValue = false;
7961 if (!Value.getNode() && !ValueCounts.empty())
7962 Value = ValueCounts.begin()->first;
7963
7964 if (ValueCounts.empty())
7965 return DAG.getUNDEF(VT);
7966
7967 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7968 // Keep going if we are hitting this case.
7969 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7970 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7971
7972 unsigned EltSize = VT.getScalarSizeInBits();
7973
7974 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7975 // i32 and try again.
7976 if (hasDominantValue && EltSize <= 32) {
7977 if (!isConstant) {
7978 SDValue N;
7979
7980 // If we are VDUPing a value that comes directly from a vector, that will
7981 // cause an unnecessary move to and from a GPR, where instead we could
7982 // just use VDUPLANE. We can only do this if the lane being extracted
7983 // is at a constant index, as the VDUP from lane instructions only have
7984 // constant-index forms.
7985 ConstantSDNode *constIndex;
7986 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7987 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7988 // We need to create a new undef vector to use for the VDUPLANE if the
7989 // size of the vector from which we get the value is different than the
7990 // size of the vector that we need to create. We will insert the element
7991 // such that the register coalescer will remove unnecessary copies.
7992 if (VT != Value->getOperand(0).getValueType()) {
7993 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7994 VT.getVectorNumElements();
7995 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7996 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7997 Value, DAG.getConstant(index, dl, MVT::i32)),
7998 DAG.getConstant(index, dl, MVT::i32));
7999 } else
8000 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8001 Value->getOperand(0), Value->getOperand(1));
8002 } else
8003 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
8004
8005 if (!usesOnlyOneValue) {
8006 // The dominant value was splatted as 'N', but we now have to insert
8007 // all differing elements.
8008 for (unsigned I = 0; I < NumElts; ++I) {
8009 if (Op.getOperand(I) == Value)
8010 continue;
8011 SmallVector<SDValue, 3> Ops;
8012 Ops.push_back(N);
8013 Ops.push_back(Op.getOperand(I));
8014 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
8015 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
8016 }
8017 }
8018 return N;
8019 }
8020 if (VT.getVectorElementType().isFloatingPoint()) {
8021 SmallVector<SDValue, 8> Ops;
8022 MVT FVT = VT.getVectorElementType().getSimpleVT();
8023 assert(FVT == MVT::f32 || FVT == MVT::f16);
8024 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
8025 for (unsigned i = 0; i < NumElts; ++i)
8026 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
8027 Op.getOperand(i)));
8028 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
8029 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
8030 Val = LowerBUILD_VECTOR(Val, DAG, ST);
8031 if (Val.getNode())
8032 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8033 }
8034 if (usesOnlyOneValue) {
8035 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
8036 if (isConstant && Val.getNode())
8037 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
8038 }
8039 }
8040
8041 // If all elements are constants and the case above didn't get hit, fall back
8042 // to the default expansion, which will generate a load from the constant
8043 // pool.
8044 if (isConstant)
8045 return SDValue();
8046
8047 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8048 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8049 // length <= 2.
8050 if (NumElts >= 4)
8051 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8052 return shuffle;
8053
8054 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8055 // VCVT's
8056 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
8057 return VCVT;
8058 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
8059 return VCVT;
8060
8061 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8062 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8063 // into two 64-bit vectors; we might discover a better way to lower it.
8064 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8065 EVT ExtVT = VT.getVectorElementType();
8066 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
8067 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
8068 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8069 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
8070 SDValue Upper =
8071 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8072 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8073 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
8074 if (Lower && Upper)
8075 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
8076 }
8077
8078 // Vectors with 32- or 64-bit elements can be built by directly assigning
8079 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8080 // will be legalized.
8081 if (EltSize >= 32) {
8082 // Do the expansion with floating-point types, since that is what the VFP
8083 // registers are defined to use, and since i64 is not legal.
8084 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8085 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8086 SmallVector<SDValue, 8> Ops;
8087 for (unsigned i = 0; i < NumElts; ++i)
8088 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
8089 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8090 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8091 }
8092
8093 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
8094 // know the default expansion would otherwise fall back on something even
8095 // worse. For a vector with one or two non-undef values, that's
8096 // scalar_to_vector for the elements followed by a shuffle (provided the
8097 // shuffle is valid for the target) and materialization element by element
8098 // on the stack followed by a load for everything else.
8099 if (!isConstant && !usesOnlyOneValue) {
8100 SDValue Vec = DAG.getUNDEF(VT);
8101 for (unsigned i = 0 ; i < NumElts; ++i) {
8102 SDValue V = Op.getOperand(i);
8103 if (V.isUndef())
8104 continue;
8105 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8106 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
8107 }
8108 return Vec;
8109 }
8110
8111 return SDValue();
8112}
8113
8114// Gather data to see if the operation can be modelled as a
8115// shuffle in combination with VEXTs.
8116SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8117 SelectionDAG &DAG) const {
8118 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8119 SDLoc dl(Op);
8120 EVT VT = Op.getValueType();
8121 unsigned NumElts = VT.getVectorNumElements();
8122
8123 struct ShuffleSourceInfo {
8124 SDValue Vec;
8125 unsigned MinElt = std::numeric_limits<unsigned>::max();
8126 unsigned MaxElt = 0;
8127
8128 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8129 // be compatible with the shuffle we intend to construct. As a result
8130 // ShuffleVec will be some sliding window into the original Vec.
8131 SDValue ShuffleVec;
8132
8133 // Code should guarantee that element i in Vec starts at element "WindowBase
8134 // + i * WindowScale in ShuffleVec".
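// Worked example (illustrative): if ShuffleVec is a VEXT of Vec starting at
// lane 3, WindowBase is -3 and lane i of Vec sits at lane i - 3 of
// ShuffleVec. If ShuffleVec is later recast from i32 to i16 lanes,
// WindowScale becomes 2, WindowBase is rescaled to -6, and lane i of Vec
// then starts at lane 2*i - 6 of ShuffleVec.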
8135 int WindowBase = 0;
8136 int WindowScale = 1;
8137
8138 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8139
8140 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8141 };
8142
8143 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8144 // node.
8145 SmallVector<ShuffleSourceInfo, 2> Sources;
8146 for (unsigned i = 0; i < NumElts; ++i) {
8147 SDValue V = Op.getOperand(i);
8148 if (V.isUndef())
8149 continue;
8150 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8151 // A shuffle can only come from building a vector from various
8152 // elements of other vectors.
8153 return SDValue();
8154 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
8155 // Furthermore, shuffles require a constant mask, whereas extractelts
8156 // accept variable indices.
8157 return SDValue();
8158 }
8159
8160 // Add this element source to the list if it's not already there.
8161 SDValue SourceVec = V.getOperand(0);
8162 auto Source = llvm::find(Sources, SourceVec);
8163 if (Source == Sources.end())
8164 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
8165
8166 // Update the minimum and maximum lane number seen.
8167 unsigned EltNo = V.getConstantOperandVal(1);
8168 Source->MinElt = std::min(Source->MinElt, EltNo);
8169 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8170 }
8171
8172 // Currently only do something sane when at most two source vectors
8173 // are involved.
8174 if (Sources.size() > 2)
8175 return SDValue();
8176
8177 // Find out the smallest element size among result and two sources, and use
8178 // it as element size to build the shuffle_vector.
8179 EVT SmallestEltTy = VT.getVectorElementType();
8180 for (auto &Source : Sources) {
8181 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8182 if (SrcEltTy.bitsLT(SmallestEltTy))
8183 SmallestEltTy = SrcEltTy;
8184 }
8185 unsigned ResMultiplier =
8186 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8187 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8188 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8189
8190 // If the source vector is too wide or too narrow, we may nevertheless be able
8191 // to construct a compatible shuffle either by concatenating it with UNDEF or
8192 // extracting a suitable range of elements.
8193 for (auto &Src : Sources) {
8194 EVT SrcVT = Src.ShuffleVec.getValueType();
8195
8196 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8197 uint64_t VTSize = VT.getFixedSizeInBits();
8198 if (SrcVTSize == VTSize)
8199 continue;
8200
8201 // This stage of the search produces a source with the same element type as
8202 // the original, but with a total width matching the BUILD_VECTOR output.
8203 EVT EltVT = SrcVT.getVectorElementType();
8204 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8205 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8206
8207 if (SrcVTSize < VTSize) {
8208 if (2 * SrcVTSize != VTSize)
8209 return SDValue();
8210 // We can pad out the smaller vector for free, so if it's part of a
8211 // shuffle...
8212 Src.ShuffleVec =
8213 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8214 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8215 continue;
8216 }
8217
8218 if (SrcVTSize != 2 * VTSize)
8219 return SDValue();
8220
8221 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8222 // Span too large for a VEXT to cope
8223 return SDValue();
8224 }
8225
8226 if (Src.MinElt >= NumSrcElts) {
8227 // The extraction can just take the second half
8228 Src.ShuffleVec =
8229 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8230 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8231 Src.WindowBase = -NumSrcElts;
8232 } else if (Src.MaxElt < NumSrcElts) {
8233 // The extraction can just take the first half
8234 Src.ShuffleVec =
8235 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8236 DAG.getConstant(0, dl, MVT::i32));
8237 } else {
8238 // An actual VEXT is needed
8239 SDValue VEXTSrc1 =
8240 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8241 DAG.getConstant(0, dl, MVT::i32));
8242 SDValue VEXTSrc2 =
8243 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8244 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8245
8246 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8247 VEXTSrc2,
8248 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8249 Src.WindowBase = -Src.MinElt;
8250 }
8251 }
8252
8253 // Another possible incompatibility occurs from the vector element types. We
8254 // can fix this by bitcasting the source vectors to the same type we intend
8255 // for the shuffle.
8256 for (auto &Src : Sources) {
8257 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8258 if (SrcEltTy == SmallestEltTy)
8259 continue;
8260 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8261 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8262 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8263 Src.WindowBase *= Src.WindowScale;
8264 }
8265
8266 // Final check before we try to actually produce a shuffle.
8267 LLVM_DEBUG({
8268 for (auto Src : Sources)
8269 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8270 });
8271
8272 // The stars all align, our next step is to produce the mask for the shuffle.
8273 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8274 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8275 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8276 SDValue Entry = Op.getOperand(i);
8277 if (Entry.isUndef())
8278 continue;
8279
8280 auto Src = llvm::find(Sources, Entry.getOperand(0));
8281 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8282
8283 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8284 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8285 // segment.
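// Worked example (illustrative): extracting an i8 lane into an i32
// BUILD_VECTOR element defines only min(8, 32) = 8 bits, so with 8-bit
// shuffle lanes just one of the ResMultiplier lanes reserved for this
// element receives a real index; the remaining lanes stay undef (-1).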
8286 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8287 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8288 VT.getScalarSizeInBits());
8289 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8290
8291 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8292 // starting at the appropriate offset.
8293 int *LaneMask = &Mask[i * ResMultiplier];
8294
8295 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8296 ExtractBase += NumElts * (Src - Sources.begin());
8297 for (int j = 0; j < LanesDefined; ++j)
8298 LaneMask[j] = ExtractBase + j;
8299 }
8300
8301
8302 // We can't handle more than two sources. This should have already
8303 // been checked before this point.
8304 assert(Sources.size() <= 2 && "Too many sources!");
8305
8306 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8307 for (unsigned i = 0; i < Sources.size(); ++i)
8308 ShuffleOps[i] = Sources[i].ShuffleVec;
8309
8310 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8311 ShuffleOps[1], Mask, DAG);
8312 if (!Shuffle)
8313 return SDValue();
8314 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8315}
8316
8317 enum ShuffleOpCodes {
8318 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8319 OP_VREV,
8320 OP_VDUP0,
8321 OP_VDUP1,
8322 OP_VDUP2,
8323 OP_VDUP3,
8324 OP_VEXT1,
8325 OP_VEXT2,
8326 OP_VEXT3,
8327 OP_VUZPL, // VUZP, left result
8328 OP_VUZPR, // VUZP, right result
8329 OP_VZIPL, // VZIP, left result
8330 OP_VZIPR, // VZIP, right result
8331 OP_VTRNL, // VTRN, left result
8332 OP_VTRNR // VTRN, right result
8333};
8334
8335static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8336 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8337 switch (OpNum) {
8338 case OP_COPY:
8339 case OP_VREV:
8340 case OP_VDUP0:
8341 case OP_VDUP1:
8342 case OP_VDUP2:
8343 case OP_VDUP3:
8344 return true;
8345 }
8346 return false;
8347}
8348
8349/// isShuffleMaskLegal - Targets can use this to indicate that they only
8350/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8351/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8352/// are assumed to be legal.
8353 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8354 if (VT.getVectorNumElements() == 4 &&
8355 (VT.is128BitVector() || VT.is64BitVector())) {
8356 unsigned PFIndexes[4];
8357 for (unsigned i = 0; i != 4; ++i) {
8358 if (M[i] < 0)
8359 PFIndexes[i] = 8;
8360 else
8361 PFIndexes[i] = M[i];
8362 }
8363
8364 // Compute the index in the perfect shuffle table.
8365 unsigned PFTableIndex =
8366 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8367 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8368 unsigned Cost = (PFEntry >> 30);
8369
8370 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8371 return true;
8372 }
8373
8374 bool ReverseVEXT, isV_UNDEF;
8375 unsigned Imm, WhichResult;
8376
8377 unsigned EltSize = VT.getScalarSizeInBits();
8378 if (EltSize >= 32 ||
8380 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8381 isVREVMask(M, VT, 64) ||
8382 isVREVMask(M, VT, 32) ||
8383 isVREVMask(M, VT, 16))
8384 return true;
8385 else if (Subtarget->hasNEON() &&
8386 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8387 isVTBLMask(M, VT) ||
8388 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8389 return true;
8390 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8391 isReverseMask(M, VT))
8392 return true;
8393 else if (Subtarget->hasMVEIntegerOps() &&
8394 (isVMOVNMask(M, VT, true, false) ||
8395 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8396 return true;
8397 else if (Subtarget->hasMVEIntegerOps() &&
8398 (isTruncMask(M, VT, false, false) ||
8399 isTruncMask(M, VT, false, true) ||
8400 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8401 return true;
8402 else
8403 return false;
8404}
8405
8406/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8407/// the specified operations to build the shuffle.
8408 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8409 SDValue RHS, SelectionDAG &DAG,
8410 const SDLoc &dl) {
8411 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8412 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8413 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
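// A rough sketch of the PerfectShuffleTable encoding as decoded here: bits
// [31:30] hold the cost, [29:26] the operation (OP_*), [25:13] LHSID and
// [12:0] RHSID. Each ID is a base-9 number whose digits are the four mask
// lanes (0-7, or 8 for undef); e.g. (1*9+2)*9+3 encodes the identity mask
// <0,1,2,3> handled by OP_COPY below.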
8414
8415 if (OpNum == OP_COPY) {
8416 if (LHSID == (1*9+2)*9+3) return LHS;
8417 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8418 return RHS;
8419 }
8420
8421 SDValue OpLHS, OpRHS;
8422 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8423 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8424 EVT VT = OpLHS.getValueType();
8425
8426 switch (OpNum) {
8427 default: llvm_unreachable("Unknown shuffle opcode!");
8428 case OP_VREV:
8429 // VREV divides the vector in half and swaps within the half.
8430 if (VT.getScalarSizeInBits() == 32)
8431 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8432 // vrev <4 x i16> -> VREV32
8433 if (VT.getScalarSizeInBits() == 16)
8434 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8435 // vrev <4 x i8> -> VREV16
8436 assert(VT.getScalarSizeInBits() == 8);
8437 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8438 case OP_VDUP0:
8439 case OP_VDUP1:
8440 case OP_VDUP2:
8441 case OP_VDUP3:
8442 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8443 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8444 case OP_VEXT1:
8445 case OP_VEXT2:
8446 case OP_VEXT3:
8447 return DAG.getNode(ARMISD::VEXT, dl, VT,
8448 OpLHS, OpRHS,
8449 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8450 case OP_VUZPL:
8451 case OP_VUZPR:
8452 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8453 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8454 case OP_VZIPL:
8455 case OP_VZIPR:
8456 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8457 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8458 case OP_VTRNL:
8459 case OP_VTRNR:
8460 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8461 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8462 }
8463}
8464
8465 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8466 ArrayRef<int> ShuffleMask,
8467 SelectionDAG &DAG) {
8468 // Check to see if we can use the VTBL instruction.
8469 SDValue V1 = Op.getOperand(0);
8470 SDValue V2 = Op.getOperand(1);
8471 SDLoc DL(Op);
8472
8473 SmallVector<SDValue, 8> VTBLMask;
8474 for (int I : ShuffleMask)
8475 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8476
8477 if (V2.getNode()->isUndef())
8478 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8479 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8480
8481 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8482 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8483}
8484
8485 static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
8486 SDLoc DL(Op);
8487 EVT VT = Op.getValueType();
8488
8489 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8490 "Expect an v8i16/v16i8 type");
8491 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8492 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8493 // extract the first 8 bytes into the top double word and the last 8 bytes
8494 // into the bottom double word, through a new vector shuffle that will be
8495 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8496 std::vector<int> NewMask;
8497 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8498 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8499 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8500 NewMask.push_back(i);
8501 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8502}
8503
8504 static EVT getVectorTyFromPredicateVector(EVT VT) {
8505 switch (VT.getSimpleVT().SimpleTy) {
8506 case MVT::v2i1:
8507 return MVT::v2f64;
8508 case MVT::v4i1:
8509 return MVT::v4i32;
8510 case MVT::v8i1:
8511 return MVT::v8i16;
8512 case MVT::v16i1:
8513 return MVT::v16i8;
8514 default:
8515 llvm_unreachable("Unexpected vector predicate type");
8516 }
8517}
8518
8519 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8520 SelectionDAG &DAG) {
8521 // Converting from boolean predicates to integers involves creating a vector
8522 // of all ones or all zeroes and selecting the lanes based upon the real
8523 // predicate.
8524 SDValue AllOnes =
8525 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8526 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8527
8528 SDValue AllZeroes =
8529 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8530 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8531
8532 // Get full vector type from predicate type
8533 EVT NewVT = getVectorTyFromPredicateVector(VT);
8534
8535 SDValue RecastV1;
8536 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8537 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8538 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8539 // since we know in hardware the sizes are really the same.
8540 if (VT != MVT::v16i1)
8541 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8542 else
8543 RecastV1 = Pred;
8544
8545 // Select either all ones or zeroes depending upon the real predicate bits.
8546 SDValue PredAsVector =
8547 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8548
8549 // Recast our new predicate-as-integer v16i8 vector into something
8550 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8551 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8552}
8553
8554 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8555 const ARMSubtarget *ST) {
8556 EVT VT = Op.getValueType();
8557 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8558 ArrayRef<int> ShuffleMask = SVN->getMask();
8559
8560 assert(ST->hasMVEIntegerOps() &&
8561 "No support for vector shuffle of boolean predicates");
8562
8563 SDValue V1 = Op.getOperand(0);
8564 SDValue V2 = Op.getOperand(1);
8565 SDLoc dl(Op);
8566 if (isReverseMask(ShuffleMask, VT)) {
8567 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8568 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8569 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8570 DAG.getConstant(16, dl, MVT::i32));
8571 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8572 }
8573
8574 // Until we can come up with optimised cases for every single vector
8575 // shuffle in existence we have chosen the least painful strategy. This is
8576 // to essentially promote the boolean predicate to an 8-bit integer, where
8577 // each predicate represents a byte. Then we fall back on a normal integer
8578 // vector shuffle and convert the result back into a predicate vector. In
8579 // many cases the generated code might be even better than scalar code
8580 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8581 // fields in a register into 8 other arbitrary 2-bit fields!
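// Illustrative walk-through: a v4i1 predicate is first promoted to a v4i32
// of all-ones/all-zero lanes (via PromoteMVEPredVector below), shuffled as an
// ordinary integer vector, and finally compared against zero with VCMPZ NE to
// recover a v4i1 result.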
8582 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8583 EVT NewVT = PredAsVector1.getValueType();
8584 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8585 : PromoteMVEPredVector(dl, V2, VT, DAG);
8586 assert(PredAsVector2.getValueType() == NewVT &&
8587 "Expected identical vector type in expanded i1 shuffle!");
8588
8589 // Do the shuffle!
8590 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8591 PredAsVector2, ShuffleMask);
8592
8593 // Now return the result of comparing the shuffled vector with zero,
8594 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8595 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8596 if (VT == MVT::v2i1) {
8597 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8598 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8599 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8600 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8601 }
8602 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8603 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8604}
8605
8606 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8607 ArrayRef<int> ShuffleMask,
8608 SelectionDAG &DAG) {
8609 // Attempt to lower the vector shuffle using as many whole register movs as
8610 // possible. This is useful for types smaller than 32 bits, which would
8611 // often otherwise become a series of GPR movs.
8612 SDLoc dl(Op);
8613 EVT VT = Op.getValueType();
8614 if (VT.getScalarSizeInBits() >= 32)
8615 return SDValue();
8616
8617 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8618 "Unexpected vector type");
8619 int NumElts = VT.getVectorNumElements();
8620 int QuarterSize = NumElts / 4;
8621 // The four final parts of the vector, as i32's
8622 SDValue Parts[4];
8623
8624 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8625 // <u,u,u,u>), returning the vmov lane index
8626 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8627 // Detect which mov lane this would be from the first non-undef element.
8628 int MovIdx = -1;
8629 for (int i = 0; i < Length; i++) {
8630 if (ShuffleMask[Start + i] >= 0) {
8631 if (ShuffleMask[Start + i] % Length != i)
8632 return -1;
8633 MovIdx = ShuffleMask[Start + i] / Length;
8634 break;
8635 }
8636 }
8637 // If all items are undef, leave this for other combines
8638 if (MovIdx == -1)
8639 return -1;
8640 // Check the remaining values are the correct part of the same mov
8641 for (int i = 1; i < Length; i++) {
8642 if (ShuffleMask[Start + i] >= 0 &&
8643 (ShuffleMask[Start + i] / Length != MovIdx ||
8644 ShuffleMask[Start + i] % Length != i))
8645 return -1;
8646 }
8647 return MovIdx;
8648 };
8649
8650 for (int Part = 0; Part < 4; ++Part) {
8651 // Does this part look like a mov
8652 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8653 if (Elt != -1) {
8654 SDValue Input = Op->getOperand(0);
8655 if (Elt >= 4) {
8656 Input = Op->getOperand(1);
8657 Elt -= 4;
8658 }
8659 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8660 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8661 DAG.getConstant(Elt, dl, MVT::i32));
8662 }
8663 }
8664
8665 // Nothing interesting found, just return
8666 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8667 return SDValue();
8668
8669 // The other parts need to be built with the old shuffle vector, cast to a
8670 // v4i32 and extract_vector_elts
8671 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8672 SmallVector<int, 16> NewShuffleMask;
8673 for (int Part = 0; Part < 4; ++Part)
8674 for (int i = 0; i < QuarterSize; i++)
8675 NewShuffleMask.push_back(
8676 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8677 SDValue NewShuffle = DAG.getVectorShuffle(
8678 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8679 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8680
8681 for (int Part = 0; Part < 4; ++Part)
8682 if (!Parts[Part])
8683 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8684 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8685 }
8686 // Build a vector out of the various parts and bitcast it back to the original
8687 // type.
8688 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8689 return DAG.getBitcast(VT, NewVec);
8690}
8691
8692 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8693 ArrayRef<int> ShuffleMask,
8694 SelectionDAG &DAG) {
8695 SDValue V1 = Op.getOperand(0);
8696 SDValue V2 = Op.getOperand(1);
8697 EVT VT = Op.getValueType();
8698 unsigned NumElts = VT.getVectorNumElements();
8699
8700 // A One-Off Identity mask is one that is mostly an identity mask from a
8701 // single source but contains a single element out-of-place, either from a
8702 // different vector or from another position in the same vector. As opposed to
8703 // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
8704 // pair directly.
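// Worked example (illustrative): for v4i32 the mask <0,1,6,3> is an identity
// of V1 except lane 2, which takes lane 6-4=2 of V2; it is lowered below as
// insert_vector_elt(V1, extract_vector_elt(V2, 2), 2).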
8705 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8706 int &OffElement) {
8707 OffElement = -1;
8708 int NonUndef = 0;
8709 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8710 if (Mask[i] == -1)
8711 continue;
8712 NonUndef++;
8713 if (Mask[i] != i + BaseOffset) {
8714 if (OffElement == -1)
8715 OffElement = i;
8716 else
8717 return false;
8718 }
8719 }
8720 return NonUndef > 2 && OffElement != -1;
8721 };
8722 int OffElement;
8723 SDValue VInput;
8724 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8725 VInput = V1;
8726 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8727 VInput = V2;
8728 else
8729 return SDValue();
8730
8731 SDLoc dl(Op);
8732 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8733 ? MVT::i32
8734 : VT.getScalarType();
8735 SDValue Elt = DAG.getNode(
8736 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8737 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8738 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8739 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8740 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8741}
8742
8743 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8744 const ARMSubtarget *ST) {
8745 SDValue V1 = Op.getOperand(0);
8746 SDValue V2 = Op.getOperand(1);
8747 SDLoc dl(Op);
8748 EVT VT = Op.getValueType();
8749 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8750 unsigned EltSize = VT.getScalarSizeInBits();
8751
8752 if (ST->hasMVEIntegerOps() && EltSize == 1)
8753 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8754
8755 // Convert shuffles that are directly supported on NEON to target-specific
8756 // DAG nodes, instead of keeping them as shuffles and matching them again
8757 // during code selection. This is more efficient and avoids the possibility
8758 // of inconsistencies between legalization and selection.
8759 // FIXME: floating-point vectors should be canonicalized to integer vectors
8760 // of the same size so that they get CSEd properly.
8761 ArrayRef<int> ShuffleMask = SVN->getMask();
8762
8763 if (EltSize <= 32) {
8764 if (SVN->isSplat()) {
8765 int Lane = SVN->getSplatIndex();
8766 // If this is undef splat, generate it via "just" vdup, if possible.
8767 if (Lane == -1) Lane = 0;
8768
8769 // Test if V1 is a SCALAR_TO_VECTOR.
8770 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8771 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8772 }
8773 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8774 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8775 // reaches it).
8776 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8777 !isa<ConstantSDNode>(V1.getOperand(0))) {
8778 bool IsScalarToVector = true;
8779 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8780 if (!V1.getOperand(i).isUndef()) {
8781 IsScalarToVector = false;
8782 break;
8783 }
8784 if (IsScalarToVector)
8785 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8786 }
8787 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8788 DAG.getConstant(Lane, dl, MVT::i32));
8789 }
8790
8791 bool ReverseVEXT = false;
8792 unsigned Imm = 0;
8793 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8794 if (ReverseVEXT)
8795 std::swap(V1, V2);
8796 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8797 DAG.getConstant(Imm, dl, MVT::i32));
8798 }
8799
8800 if (isVREVMask(ShuffleMask, VT, 64))
8801 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8802 if (isVREVMask(ShuffleMask, VT, 32))
8803 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8804 if (isVREVMask(ShuffleMask, VT, 16))
8805 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8806
8807 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8808 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8809 DAG.getConstant(Imm, dl, MVT::i32));
8810 }
8811
8812 // Check for Neon shuffles that modify both input vectors in place.
8813 // If both results are used, i.e., if there are two shuffles with the same
8814 // source operands and with masks corresponding to both results of one of
8815 // these operations, DAG memoization will ensure that a single node is
8816 // used for both shuffles.
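// Illustrative example: for v4i32 the masks <0,4,1,5> and <2,6,3,7> over the
// same operands are the two results of a single VZIP; lowering each shuffle
// creates the same VZIP node, so CSE leaves only one instruction.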
8817 unsigned WhichResult = 0;
8818 bool isV_UNDEF = false;
8819 if (ST->hasNEON()) {
8820 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8821 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8822 if (isV_UNDEF)
8823 V2 = V1;
8824 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8825 .getValue(WhichResult);
8826 }
8827 }
8828 if (ST->hasMVEIntegerOps()) {
8829 if (isVMOVNMask(ShuffleMask, VT, false, false))
8830 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8831 DAG.getConstant(0, dl, MVT::i32));
8832 if (isVMOVNMask(ShuffleMask, VT, true, false))
8833 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8834 DAG.getConstant(1, dl, MVT::i32));
8835 if (isVMOVNMask(ShuffleMask, VT, true, true))
8836 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8837 DAG.getConstant(1, dl, MVT::i32));
8838 }
8839
8840 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8841 // shuffles that produce a result larger than their operands with:
8842 // shuffle(concat(v1, undef), concat(v2, undef))
8843 // ->
8844 // shuffle(concat(v1, v2), undef)
8845 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8846 //
8847 // This is useful in the general case, but there are special cases where
8848 // native shuffles produce larger results: the two-result ops.
8849 //
8850 // Look through the concat when lowering them:
8851 // shuffle(concat(v1, v2), undef)
8852 // ->
8853 // concat(VZIP(v1, v2):0, :1)
8854 //
8855 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8856 SDValue SubV1 = V1->getOperand(0);
8857 SDValue SubV2 = V1->getOperand(1);
8858 EVT SubVT = SubV1.getValueType();
8859
8860 // We expect these to have been canonicalized to -1.
8861 assert(llvm::all_of(ShuffleMask, [&](int i) {
8862 return i < (int)VT.getVectorNumElements();
8863 }) && "Unexpected shuffle index into UNDEF operand!");
8864
8865 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8866 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8867 if (isV_UNDEF)
8868 SubV2 = SubV1;
8869 assert((WhichResult == 0) &&
8870 "In-place shuffle of concat can only have one result!");
8871 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8872 SubV1, SubV2);
8873 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8874 Res.getValue(1));
8875 }
8876 }
8877 }
8878
8879 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8880 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8881 return V;
8882
8883 for (bool Top : {false, true}) {
8884 for (bool SingleSource : {false, true}) {
8885 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8886 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8887 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8888 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8889 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8890 SingleSource ? V1 : V2);
8891 if (Top) {
8892 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8893 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8894 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8895 }
8896 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8897 }
8898 }
8899 }
8900 }
8901
8902 // If the shuffle is not directly supported and it has 4 elements, use
8903 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8904 unsigned NumElts = VT.getVectorNumElements();
8905 if (NumElts == 4) {
8906 unsigned PFIndexes[4];
8907 for (unsigned i = 0; i != 4; ++i) {
8908 if (ShuffleMask[i] < 0)
8909 PFIndexes[i] = 8;
8910 else
8911 PFIndexes[i] = ShuffleMask[i];
8912 }
8913
8914 // Compute the index in the perfect shuffle table.
8915 unsigned PFTableIndex =
8916 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8917 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8918 unsigned Cost = (PFEntry >> 30);
8919
8920 if (Cost <= 4) {
8921 if (ST->hasNEON())
8922 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8923 else if (isLegalMVEShuffleOp(PFEntry)) {
8924 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8925 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8926 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8927 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8928 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8929 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8930 }
8931 }
8932 }
8933
8934 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8935 if (EltSize >= 32) {
8936 // Do the expansion with floating-point types, since that is what the VFP
8937 // registers are defined to use, and since i64 is not legal.
8938 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8939 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8940 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8941 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8942 SmallVector<SDValue, 8> Ops;
8943 for (unsigned i = 0; i < NumElts; ++i) {
8944 if (ShuffleMask[i] < 0)
8945 Ops.push_back(DAG.getUNDEF(EltVT));
8946 else
8947 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8948 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8949 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8950 dl, MVT::i32)));
8951 }
8952 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8953 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8954 }
8955
8956 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8957 isReverseMask(ShuffleMask, VT))
8958 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8959
8960 if (ST->hasNEON() && VT == MVT::v8i8)
8961 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8962 return NewOp;
8963
8964 if (ST->hasMVEIntegerOps())
8965 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8966 return NewOp;
8967
8968 return SDValue();
8969}
8970
8971 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8972 const ARMSubtarget *ST) {
8973 EVT VecVT = Op.getOperand(0).getValueType();
8974 SDLoc dl(Op);
8975
8976 assert(ST->hasMVEIntegerOps() &&
8977 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8978
8979 SDValue Conv =
8980 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8981 unsigned Lane = Op.getConstantOperandVal(2);
8982 unsigned LaneWidth =
8983 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8984 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
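// Worked example (illustrative): for a v4i1 predicate each lane occupies
// LaneWidth = 4 bits of the 16-bit predicate register, so inserting into
// lane 2 uses Mask = 0xF << 8 = 0xF00; the BFI below keeps the bits outside
// that field and writes the new lane value into it.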
8985 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8986 Op.getOperand(1), DAG.getValueType(MVT::i1));
8987 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8988 DAG.getConstant(~Mask, dl, MVT::i32));
8989 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8990}
8991
8992SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8993 SelectionDAG &DAG) const {
8994 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8995 SDValue Lane = Op.getOperand(2);
8996 if (!isa<ConstantSDNode>(Lane))
8997 return SDValue();
8998
8999 SDValue Elt = Op.getOperand(1);
9000 EVT EltVT = Elt.getValueType();
9001
9002 if (Subtarget->hasMVEIntegerOps() &&
9003 Op.getValueType().getScalarSizeInBits() == 1)
9004 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
9005
9006 if (getTypeAction(*DAG.getContext(), EltVT) ==
9007 TargetLowering::TypePromoteFloat) {
9008 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
9009 // but the type system will try to do that if we don't intervene.
9010 // Reinterpret any such vector-element insertion as one with the
9011 // corresponding integer types.
9012
9013 SDLoc dl(Op);
9014
9015 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
9016 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
9017 TargetLowering::TypePromoteFloat);
9018
9019 SDValue VecIn = Op.getOperand(0);
9020 EVT VecVT = VecIn.getValueType();
9021 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
9022 VecVT.getVectorNumElements());
9023
9024 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
9025 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
9026 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
9027 IVecIn, IElt, Lane);
9028 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
9029 }
9030
9031 return Op;
9032}
9033
9034 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
9035 const ARMSubtarget *ST) {
9036 EVT VecVT = Op.getOperand(0).getValueType();
9037 SDLoc dl(Op);
9038
9039 assert(ST->hasMVEIntegerOps() &&
9040 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9041
9042 SDValue Conv =
9043 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
9044 unsigned Lane = Op.getConstantOperandVal(1);
9045 unsigned LaneWidth =
9046 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
9047 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9048 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
9049 return Shift;
9050}
9051
9052 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
9053 const ARMSubtarget *ST) {
9054 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9055 SDValue Lane = Op.getOperand(1);
9056 if (!isa<ConstantSDNode>(Lane))
9057 return SDValue();
9058
9059 SDValue Vec = Op.getOperand(0);
9060 EVT VT = Vec.getValueType();
9061
9062 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9063 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9064
9065 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9066 SDLoc dl(Op);
9067 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9068 }
9069
9070 return Op;
9071}
9072
9073 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
9074 const ARMSubtarget *ST) {
9075 SDLoc dl(Op);
9076 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9077 "Unexpected custom CONCAT_VECTORS lowering");
9078 assert(isPowerOf2_32(Op.getNumOperands()) &&
9079 "Unexpected custom CONCAT_VECTORS lowering");
9080 assert(ST->hasMVEIntegerOps() &&
9081 "CONCAT_VECTORS lowering only supported for MVE");
9082
9083 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9084 EVT Op1VT = V1.getValueType();
9085 EVT Op2VT = V2.getValueType();
9086 assert(Op1VT == Op2VT && "Operand types don't match!");
9087 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9088 "Unexpected i1 concat operations!");
9089 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
9090
9091 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9092 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
9093
9094 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9095 // promoted to v8i16, etc.
9096 MVT ElType =
9097 getVectorTyFromPredicateVector(VT).getVectorElementType().getSimpleVT();
9098 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9099
9100 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9101 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9102 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9103 // ConcatVT.
9104 SDValue ConVec =
9105 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9106 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9107 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9108 }
9109
9110 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9111 // to be the right size for the destination. For example, if Op1 is v4i1
9112 // then the promoted vector is v4i32. The result of concatenation gives a
9113 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9114 // needs truncating to i16 and inserting in the result.
9115 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9116 EVT NewVT = NewV.getValueType();
9117 EVT ConcatVT = ConVec.getValueType();
9118 unsigned ExtScale = 1;
9119 if (NewVT == MVT::v2f64) {
9120 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9121 ExtScale = 2;
9122 }
9123 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9124 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9125 DAG.getIntPtrConstant(i * ExtScale, dl));
9126 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9127 DAG.getConstant(j, dl, MVT::i32));
9128 }
9129 return ConVec;
9130 };
9131 unsigned j = 0;
9132 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
9133 ConVec = ExtractInto(NewV1, ConVec, j);
9134 ConVec = ExtractInto(NewV2, ConVec, j);
9135
9136 // Now return the result of comparing the subvector with zero, which will
9137 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9138 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9139 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9140 };
9141
9142 // Concat each pair of subvectors and pack into the lower half of the array.
9143 SmallVector<SDValue> ConcatOps(Op->ops());
9144 while (ConcatOps.size() > 1) {
9145 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9146 SDValue V1 = ConcatOps[I];
9147 SDValue V2 = ConcatOps[I + 1];
9148 ConcatOps[I / 2] = ConcatPair(V1, V2);
9149 }
9150 ConcatOps.resize(ConcatOps.size() / 2);
9151 }
9152 return ConcatOps[0];
9153}
9154
9155 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9156 const ARMSubtarget *ST) {
9157 EVT VT = Op->getValueType(0);
9158 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9159 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9160
9161 // The only time a CONCAT_VECTORS operation can have legal types is when
9162 // two 64-bit vectors are concatenated to a 128-bit vector.
9163 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9164 "unexpected CONCAT_VECTORS");
9165 SDLoc dl(Op);
9166 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9167 SDValue Op0 = Op.getOperand(0);
9168 SDValue Op1 = Op.getOperand(1);
9169 if (!Op0.isUndef())
9170 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9171 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9172 DAG.getIntPtrConstant(0, dl));
9173 if (!Op1.isUndef())
9174 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9175 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9176 DAG.getIntPtrConstant(1, dl));
9177 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9178}
9179
9180 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9181 const ARMSubtarget *ST) {
9182 SDValue V1 = Op.getOperand(0);
9183 SDValue V2 = Op.getOperand(1);
9184 SDLoc dl(Op);
9185 EVT VT = Op.getValueType();
9186 EVT Op1VT = V1.getValueType();
9187 unsigned NumElts = VT.getVectorNumElements();
9188 unsigned Index = V2->getAsZExtVal();
9189
9190 assert(VT.getScalarSizeInBits() == 1 &&
9191 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9192 assert(ST->hasMVEIntegerOps() &&
9193 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9194
9195 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9196
9197 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9198 // promoted to v8i16, etc.
9199
9201
9202 if (NumElts == 2) {
9203 EVT SubVT = MVT::v4i32;
9204 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9205 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9206 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9207 DAG.getIntPtrConstant(i, dl));
9208 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9209 DAG.getConstant(j, dl, MVT::i32));
9210 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9211 DAG.getConstant(j + 1, dl, MVT::i32));
9212 }
9213 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9214 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9215 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9216 }
9217
9218 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9219 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9220 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9221 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9222 DAG.getIntPtrConstant(i, dl));
9223 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9224 DAG.getConstant(j, dl, MVT::i32));
9225 }
9226
9227 // Now return the result of comparing the subvector with zero,
9228 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9229 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9230 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9231}
9232
9233// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
9234 static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9235 const ARMSubtarget *ST) {
9236 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9237 EVT VT = N->getValueType(0);
9238 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9239 "Expected a vector i1 type!");
9240 SDValue Op = N->getOperand(0);
9241 EVT FromVT = Op.getValueType();
9242 SDLoc DL(N);
9243
9244 SDValue And =
9245 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9246 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9247 DAG.getCondCode(ISD::SETNE));
9248}
9249
9250 static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9251 const ARMSubtarget *Subtarget) {
9252 if (!Subtarget->hasMVEIntegerOps())
9253 return SDValue();
9254
9255 EVT ToVT = N->getValueType(0);
9256 if (ToVT.getScalarType() == MVT::i1)
9257 return LowerTruncatei1(N, DAG, Subtarget);
9258
9259 // MVE does not have a single instruction to perform the truncation of a v4i32
9260 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9261 // Most of the instructions in MVE follow the 'Beats' system, where moving
9262 // values from different lanes is usually something that the instructions
9263 // avoid.
9264 //
9265 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9266 // which take the top/bottom half of a larger lane and extend it (or do the
9267 // opposite, truncating into the top/bottom lane from a larger lane). Note
9268 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9269 // bottom 16bits from each vector lane. This works really well with T/B
9270 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9271 // to move order.
9272 //
9273 // But truncates and sext/zext are always going to be fairly common from llvm.
9274 // We have several options for how to deal with them:
9275 // - Wherever possible combine them into an instruction that makes them
9276 // "free". This includes loads/stores, which can perform the trunc as part
9277 // of the memory operation. Or certain shuffles that can be turned into
9278 // VMOVN/VMOVL.
9279 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9280 // trunc(mul(sext(a), sext(b))) may become
9281 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9282 // this case can use VMULL). This is performed in the
9283 // MVELaneInterleavingPass.
9284 // - Otherwise we have an option. By default we would expand the
9285 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9286 // registers. One for each vector lane in the vector. This can obviously be
9287 // very expensive.
9288 // - The other option is to use the fact that loads/stores can extend/truncate
9289 // to turn a trunc into two truncating stack stores and a stack reload. This
9290 // becomes 3 back-to-back memory operations, but at least that is less than
9291 // all the insert/extracts.
9292 //
9293 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9294 // are either optimized where they can be, or eventually lowered into stack
9295 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9296 // too early, where other instructions would be better, and stops us from
9297 // having to reconstruct multiple buildvector shuffles into loads/stores.
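// Illustrative example: a trunc of v8i32 to v8i16 is rebuilt below as
// MVETRUNC(lo v4i32 half, hi v4i32 half); later stages either fold it into
// narrowing VMOVN-style instructions where possible or fall back to the
// stack store/reload sequence described above.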
9298 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9299 return SDValue();
9300 EVT FromVT = N->getOperand(0).getValueType();
9301 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9302 return SDValue();
9303
9304 SDValue Lo, Hi;
9305 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9306 SDLoc DL(N);
9307 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9308}
9309
9310 static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9311 const ARMSubtarget *Subtarget) {
9312 if (!Subtarget->hasMVEIntegerOps())
9313 return SDValue();
9314
9315 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9316
9317 EVT ToVT = N->getValueType(0);
9318 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9319 return SDValue();
9320 SDValue Op = N->getOperand(0);
9321 EVT FromVT = Op.getValueType();
9322 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9323 return SDValue();
9324
9325 SDLoc DL(N);
9326 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9327 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9328 ExtVT = MVT::v8i16;
9329
9330 unsigned Opcode =
9331 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9332 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9333 SDValue Ext1 = Ext.getValue(1);
9334
9335 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9336 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9337 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9338 }
9339
9340 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9341}
9342
9343/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9344/// element has been zero/sign-extended, depending on the isSigned parameter,
9345/// from an integer type half its size.
9346 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9347 bool isSigned) {
9348 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9349 EVT VT = N->getValueType(0);
9350 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9351 SDNode *BVN = N->getOperand(0).getNode();
9352 if (BVN->getValueType(0) != MVT::v4i32 ||
9353 BVN->getOpcode() != ISD::BUILD_VECTOR)
9354 return false;
9355 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9356 unsigned HiElt = 1 - LoElt;
9357 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9358 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9359 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9360 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9361 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9362 return false;
9363 if (isSigned) {
9364 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9365 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9366 return true;
9367 } else {
9368 if (Hi0->isZero() && Hi1->isZero())
9369 return true;
9370 }
9371 return false;
9372 }
9373
9374 if (N->getOpcode() != ISD::BUILD_VECTOR)
9375 return false;
9376
9377 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9378 SDNode *Elt = N->getOperand(i).getNode();
9379 if (auto *C = dyn_cast<ConstantSDNode>(Elt)) {
9380 unsigned EltSize = VT.getScalarSizeInBits();
9381 unsigned HalfSize = EltSize / 2;
9382 if (isSigned) {
9383 if (!isIntN(HalfSize, C->getSExtValue()))
9384 return false;
9385 } else {
9386 if (!isUIntN(HalfSize, C->getZExtValue()))
9387 return false;
9388 }
9389 continue;
9390 }
9391 return false;
9392 }
9393
9394 return true;
9395}
9396
9397/// isSignExtended - Check if a node is a vector value that is sign-extended
9398/// or a constant BUILD_VECTOR with sign-extended elements.
9399 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9400 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9401 return true;
9402 if (isExtendedBUILD_VECTOR(N, DAG, true))
9403 return true;
9404 return false;
9405}
9406
9407/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9408/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9409 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9410 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9411 ISD::isZEXTLoad(N))
9412 return true;
9413 if (isExtendedBUILD_VECTOR(N, DAG, false))
9414 return true;
9415 return false;
9416}
9417
9418static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9419 if (OrigVT.getSizeInBits() >= 64)
9420 return OrigVT;
9421
9422 assert(OrigVT.isSimple() && "Expecting a simple value type");
9423
9424 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9425 switch (OrigSimpleTy) {
9426 default: llvm_unreachable("Unexpected Vector Type");
9427 case MVT::v2i8:
9428 case MVT::v2i16:
9429 return MVT::v2i32;
9430 case MVT::v4i8:
9431 return MVT::v4i16;
9432 }
9433}
9434
9435/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9436/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9437/// We insert the required extension here to get the vector to fill a D register.
9438 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9439 const EVT &OrigTy,
9440 const EVT &ExtTy,
9441 unsigned ExtOpcode) {
9442 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9443 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9444 // 64-bits we need to insert a new extension so that it will be 64-bits.
9445 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9446 if (OrigTy.getSizeInBits() >= 64)
9447 return N;
9448
9449 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9450 EVT NewVT = getExtensionTo64Bits(OrigTy);
9451
9452 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9453}
9454
9455/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9456/// does not do any sign/zero extension. If the original vector is less
9457/// than 64 bits, an appropriate extension will be added after the load to
9458/// reach a total size of 64 bits. We have to add the extension separately
9459/// because ARM does not have a sign/zero extending load for vectors.
9460 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9461 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9462
9463 // The load already has the right type.
9464 if (ExtendedTy == LD->getMemoryVT())
9465 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9466 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9467 LD->getMemOperand()->getFlags());
9468
9469 // We need to create a zextload/sextload. We cannot just create a load
9470 // followed by a sext/zext node because LowerMUL is also run during normal
9471 // operation legalization where we can't create illegal types.
9472 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9473 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9474 LD->getMemoryVT(), LD->getAlign(),
9475 LD->getMemOperand()->getFlags());
9476}
9477
9478/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9479/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9480/// the unextended value. The unextended vector should be 64 bits so that it can
9481/// be used as an operand to a VMULL instruction. If the original vector size
9482 /// before extension is less than 64 bits we add an extension to resize
9483/// the vector to 64 bits.
9484 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9485 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9486 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9487 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9488 N->getOperand(0)->getValueType(0),
9489 N->getValueType(0),
9490 N->getOpcode());
9491
9492 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9493 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9494 "Expected extending load");
9495
9496 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9497 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9498 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9499 SDValue extLoad =
9500 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9501 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9502
9503 return newLoad;
9504 }
9505
9506 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9507 // have been legalized as a BITCAST from v4i32.
9508 if (N->getOpcode() == ISD::BITCAST) {
9509 SDNode *BVN = N->getOperand(0).getNode();
9510 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9511 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9512 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9513 return DAG.getBuildVector(
9514 MVT::v2i32, SDLoc(N),
9515 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9516 }
9517 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9518 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9519 EVT VT = N->getValueType(0);
9520 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9521 unsigned NumElts = VT.getVectorNumElements();
9522 MVT TruncVT = MVT::getIntegerVT(EltSize);
9523 SmallVector<SDValue, 8> Ops;
9524 SDLoc dl(N);
9525 for (unsigned i = 0; i != NumElts; ++i) {
9526 const APInt &CInt = N->getConstantOperandAPInt(i);
9527 // Element types smaller than 32 bits are not legal, so use i32 elements.
9528 // The values are implicitly truncated so sext vs. zext doesn't matter.
9529 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9530 }
9531 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9532}
9533
9534static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9535 unsigned Opcode = N->getOpcode();
9536 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9537 SDNode *N0 = N->getOperand(0).getNode();
9538 SDNode *N1 = N->getOperand(1).getNode();
9539 return N0->hasOneUse() && N1->hasOneUse() &&
9540 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9541 }
9542 return false;
9543}
9544
9545static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9546 unsigned Opcode = N->getOpcode();
9547 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9548 SDNode *N0 = N->getOperand(0).getNode();
9549 SDNode *N1 = N->getOperand(1).getNode();
9550 return N0->hasOneUse() && N1->hasOneUse() &&
9551 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9552 }
9553 return false;
9554}
9555
9556 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9557 // Multiplications are only custom-lowered for 128-bit vectors so that
9558 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9559 EVT VT = Op.getValueType();
9560 assert(VT.is128BitVector() && VT.isInteger() &&
9561 "unexpected type for custom-lowering ISD::MUL");
9562 SDNode *N0 = Op.getOperand(0).getNode();
9563 SDNode *N1 = Op.getOperand(1).getNode();
9564 unsigned NewOpc = 0;
9565 bool isMLA = false;
9566 bool isN0SExt = isSignExtended(N0, DAG);
9567 bool isN1SExt = isSignExtended(N1, DAG);
9568 if (isN0SExt && isN1SExt)
9569 NewOpc = ARMISD::VMULLs;
9570 else {
9571 bool isN0ZExt = isZeroExtended(N0, DAG);
9572 bool isN1ZExt = isZeroExtended(N1, DAG);
9573 if (isN0ZExt && isN1ZExt)
9574 NewOpc = ARMISD::VMULLu;
9575 else if (isN1SExt || isN1ZExt) {
9576 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9577 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9578 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9579 NewOpc = ARMISD::VMULLs;
9580 isMLA = true;
9581 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9582 NewOpc = ARMISD::VMULLu;
9583 isMLA = true;
9584 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9585 std::swap(N0, N1);
9586 NewOpc = ARMISD::VMULLu;
9587 isMLA = true;
9588 }
9589 }
9590
9591 if (!NewOpc) {
9592 if (VT == MVT::v2i64)
9593 // Fall through to expand this. It is not legal.
9594 return SDValue();
9595 else
9596 // Other vector multiplications are legal.
9597 return Op;
9598 }
9599 }
9600
9601 // Legalize to a VMULL instruction.
9602 SDLoc DL(Op);
9603 SDValue Op0;
9604 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9605 if (!isMLA) {
9606 Op0 = SkipExtensionForVMULL(N0, DAG);
9607 assert(Op0.getValueType().is64BitVector() &&
9608 Op1.getValueType().is64BitVector() &&
9609 "unexpected types for extended operands to VMULL");
9610 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9611 }
9612
9613 // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during isel
9614 // lowering to take advantage of no-stall back-to-back vmull + vmlal.
9615 // vmull q0, d4, d6
9616 // vmlal q0, d5, d6
9617 // is faster than
9618 // vaddl q0, d4, d5
9619 // vmovl q1, d6
9620 // vmul q0, q0, q1
9621 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9622 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9623 EVT Op1VT = Op1.getValueType();
9624 return DAG.getNode(N0->getOpcode(), DL, VT,
9625 DAG.getNode(NewOpc, DL, VT,
9626 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9627 DAG.getNode(NewOpc, DL, VT,
9628 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9629}
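// Illustrative example (a sketch, the IR value names are not from this file):
//   %a = sext <8 x i8> %x to <8 x i16>
//   %b = sext <8 x i8> %y to <8 x i16>
//   %m = mul <8 x i16> %a, %b
// reaches this point as a custom-lowered v8i16 MUL and is rewritten to
// ARMISD::VMULLs on the unextended v8i8 operands, i.e. a single vmull.s8,
// rather than two vmovl extensions followed by a vmul.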
9630
9631 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9632 SelectionDAG &DAG) {
9633 // TODO: Should this propagate fast-math-flags?
9634
9635 // Convert to float
9636 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9637 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9638 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9639 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9640 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9641 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9642 // Get reciprocal estimate.
9643 // float4 recip = vrecpeq_f32(yf);
9644 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9645 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9646 Y);
9647 // Because char has a smaller range than uchar, we can actually get away
9648 // without any Newton steps. This requires that we use a weird bias
9649 // of 0xb000, however (again, this has been exhaustively tested).
9650 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9651 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9652 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9653 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9654 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9655 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9656 // Convert back to short.
9657 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9658 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9659 return X;
9660}
9661
9662 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9663 SelectionDAG &DAG) {
9664 // TODO: Should this propagate fast-math-flags?
9665
9666 SDValue N2;
9667 // Convert to float.
9668 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9669 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9670 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9671 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9672 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9673 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9674
9675 // Use reciprocal estimate and one refinement step.
9676 // float4 recip = vrecpeq_f32(yf);
9677 // recip *= vrecpsq_f32(yf, recip);
9678 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9679 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9680 N1);
9681 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9682 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9683 N1, N2);
9684 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9685 // Because short has a smaller range than ushort, we can actually get away
9686 // with only a single Newton step. This requires that we use a weird bias
9687 // of 0x89, however (again, this has been exhaustively tested).
9688 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9689 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9690 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9691 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9692 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9693 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9694 // Convert back to integer and return.
9695 // return vmovn_s32(vcvt_s32_f32(result));
9696 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9697 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9698 return N0;
9699}
9700
9701 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9702 const ARMSubtarget *ST) {
9703 EVT VT = Op.getValueType();
9704 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9705 "unexpected type for custom-lowering ISD::SDIV");
9706
9707 SDLoc dl(Op);
9708 SDValue N0 = Op.getOperand(0);
9709 SDValue N1 = Op.getOperand(1);
9710 SDValue N2, N3;
9711
9712 if (VT == MVT::v8i8) {
9713 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9714 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9715
9716 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9717 DAG.getIntPtrConstant(4, dl));
9718 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9719 DAG.getIntPtrConstant(4, dl));
9720 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9721 DAG.getIntPtrConstant(0, dl));
9722 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9723 DAG.getIntPtrConstant(0, dl));
9724
9725 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9726 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9727
9728 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9729 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9730
9731 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9732 return N0;
9733 }
9734 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9735}
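// In other words (an illustrative summary): an sdiv of <8 x i8> is
// sign-extended to <8 x i16>, split into two <4 x i16> halves, each half is
// divided via the float reciprocal-estimate sequence in LowerSDIV_v4i8, and
// the results are concatenated and truncated back to <8 x i8>.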
9736
9737 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9738 const ARMSubtarget *ST) {
9739 // TODO: Should this propagate fast-math-flags?
9740 EVT VT = Op.getValueType();
9741 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9742 "unexpected type for custom-lowering ISD::UDIV");
9743
9744 SDLoc dl(Op);
9745 SDValue N0 = Op.getOperand(0);
9746 SDValue N1 = Op.getOperand(1);
9747 SDValue N2, N3;
9748
9749 if (VT == MVT::v8i8) {
9750 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9751 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9752
9753 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9754 DAG.getIntPtrConstant(4, dl));
9755 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9756 DAG.getIntPtrConstant(4, dl));
9757 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9758 DAG.getIntPtrConstant(0, dl));
9759 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9760 DAG.getIntPtrConstant(0, dl));
9761
9762 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9763 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9764
9765 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9766 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9767
9768 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9769 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9770 MVT::i32),
9771 N0);
9772 return N0;
9773 }
9774
9775 // v4i16 udiv ... Convert to float.
9776 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9777 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9778 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9779 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9780 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9781 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9782
9783 // Use reciprocal estimate and two refinement steps.
9784 // float4 recip = vrecpeq_f32(yf);
9785 // recip *= vrecpsq_f32(yf, recip);
9786 // recip *= vrecpsq_f32(yf, recip);
9787 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9788 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9789 BN1);
9790 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9791 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9792 BN1, N2);
9793 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9794 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9795 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9796 BN1, N2);
9797 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9798 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9799 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9800 // and that it will never cause us to return an answer too large).
9801 // float4 result = as_float4(as_int4(xf*recip) + 2);
9802 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9803 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9804 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9805 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9806 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9807 // Convert back to integer and return.
9808 // return vmovn_u32(vcvt_s32_f32(result));
9809 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9810 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9811 return N0;
9812}
9813
9814 static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9815 SDNode *N = Op.getNode();
9816 EVT VT = N->getValueType(0);
9817 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9818
9819 SDValue Carry = Op.getOperand(2);
9820
9821 SDLoc DL(Op);
9822
9823 SDValue Result;
9824 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9825 // This converts the boolean value carry into the carry flag.
9826 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9827
9828 // Do the addition proper using the carry flag we wanted.
9829 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9830 Op.getOperand(1), Carry);
9831
9832 // Now convert the carry flag into a boolean value.
9833 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9834 } else {
9835 // ARMISD::SUBE expects a carry, not a borrow like ISD::USUBO_CARRY, so we
9836 // have to invert the incoming carry first.
9837 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9838 DAG.getConstant(1, DL, MVT::i32), Carry);
9839 // This converts the boolean value carry into the carry flag.
9840 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9841
9842 // Do the subtraction proper using the carry flag we wanted.
9843 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9844 Op.getOperand(1), Carry);
9845
9846 // Now convert the carry flag into a boolean value.
9847 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9848 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9849 // by ISD::USUBO_CARRY, so compute 1 - C.
9850 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9851 DAG.getConstant(1, DL, MVT::i32), Carry);
9852 }
9853
9854 // Return both values.
9855 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9856}
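// For example (illustrative): lowering a USUBO_CARRY with an incoming borrow
// of 1 computes 1 - 1 = 0, feeds that 0 to SBC as the carry (ARM's C flag is
// the inverse of a borrow for subtraction), and the outgoing C flag is then
// mapped back to a borrow with another 1 - C.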
9857
9858SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9859 bool Signed,
9860 SDValue &Chain) const {
9861 EVT VT = Op.getValueType();
9862 assert((VT == MVT::i32 || VT == MVT::i64) &&
9863 "unexpected type for custom lowering DIV");
9864 SDLoc dl(Op);
9865
9866 const auto &DL = DAG.getDataLayout();
9867 RTLIB::Libcall LC;
9868 if (Signed)
9869 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9870 else
9871 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9872
9873 const char *Name = getLibcallName(LC);
9874 SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
9875
9876 ARMTargetLowering::ArgListTy Args;
9877
9878 for (auto AI : {1, 0}) {
9879 SDValue Operand = Op.getOperand(AI);
9880 Args.emplace_back(Operand,
9881 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9882 }
9883
9884 CallLoweringInfo CLI(DAG);
9885 CLI.setDebugLoc(dl)
9886 .setChain(Chain)
9888 ES, std::move(Args));
9889
9890 return LowerCallTo(CLI).first;
9891}
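// Note (an assumption inferred from the {1, 0} operand order above): the
// Windows __rt_sdiv/__rt_udiv helpers expect the divisor before the dividend,
// so the operands are pushed in reverse order when building the argument list.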
9892
9893// This is a code size optimisation: return the original SDIV node to
9894// DAGCombiner when we don't want to expand SDIV into a sequence of
9895 // instructions, and an empty SDValue otherwise, which will cause the
9896// SDIV to be expanded in DAGCombine.
9897SDValue
9898ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9899 SelectionDAG &DAG,
9900 SmallVectorImpl<SDNode *> &Created) const {
9901 // TODO: Support SREM
9902 if (N->getOpcode() != ISD::SDIV)
9903 return SDValue();
9904
9905 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9906 const bool MinSize = ST.hasMinSize();
9907 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9908 : ST.hasDivideInARMMode();
9909
9910 // Don't touch vector types; rewriting this may lead to scalarizing
9911 // the int divs.
9912 if (N->getOperand(0).getValueType().isVector())
9913 return SDValue();
9914
9915 // Bail unless MinSize is set; for both ARM and Thumb mode we also need
9916 // hwdiv support for this to be really profitable.
9917 if (!(MinSize && HasDivide))
9918 return SDValue();
9919
9920 // ARM mode is a bit simpler than Thumb: we can handle large power
9921 // of 2 immediates with 1 mov instruction; no further checks required,
9922 // just return the sdiv node.
9923 if (!ST.isThumb())
9924 return SDValue(N, 0);
9925
9926 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9927 // and thus lose the code size benefit of a MOVS, which requires only 2 bytes.
9928 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9929 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9930 if (Divisor.sgt(128))
9931 return SDValue();
9932
9933 return SDValue(N, 0);
9934}
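// For example (illustrative): under minsize with Thumb hwdiv, 'x / 16' keeps
// the single sdiv (the #16 immediate fits a 2-byte MOVS), while 'x / 256'
// returns an empty SDValue here and is expanded by DAGCombine into the usual
// shift-based power-of-two sequence.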
9935
9936SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9937 bool Signed) const {
9938 assert(Op.getValueType() == MVT::i32 &&
9939 "unexpected type for custom lowering DIV");
9940 SDLoc dl(Op);
9941
9942 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9943 DAG.getEntryNode(), Op.getOperand(1));
9944
9945 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9946}
9947
9948 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9949 SDLoc DL(N);
9950 SDValue Op = N->getOperand(1);
9951 if (N->getValueType(0) == MVT::i32)
9952 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9953 SDValue Lo, Hi;
9954 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9955 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9956 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9957}
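// That is, for a 64-bit denominator the two 32-bit halves are ORed together
// first, so the WIN__DBZCHK node (and the divide-by-zero trap it expands to)
// fires exactly when the full 64-bit value is zero.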
9958
9959void ARMTargetLowering::ExpandDIV_Windows(
9960 SDValue Op, SelectionDAG &DAG, bool Signed,
9961 SmallVectorImpl<SDValue> &Results) const {
9962 const auto &DL = DAG.getDataLayout();
9963
9964 assert(Op.getValueType() == MVT::i64 &&
9965 "unexpected type for custom lowering DIV");
9966 SDLoc dl(Op);
9967
9968 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9969
9970 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9971
9972 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9973 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9974 DAG.getConstant(32, dl, getPointerTy(DL)));
9975 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9976
9977 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9978}
9979
9980 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
9981 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9982 EVT MemVT = LD->getMemoryVT();
9983 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9984 MemVT == MVT::v16i1) &&
9985 "Expected a predicate type!");
9986 assert(MemVT == Op.getValueType());
9987 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9988 "Expected a non-extending load");
9989 assert(LD->isUnindexed() && "Expected a unindexed load");
9990
9991 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16-bit
9992 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9993 // need to make sure that the 8/4/2 bits are actually loaded into the correct
9994 // place, which means loading the value and then shuffling the values into
9995 // the bottom bits of the predicate.
9996 // Equally, a VLDR for a v16i1 will actually load 32 bits (so it will be
9997 // incorrect for BE).
9998 // Speaking of BE, the rest of llvm apparently assumes the reverse order to a
9999 // natural VMSR(load), so the loaded value needs to be reversed.
10000
10001 SDLoc dl(Op);
10002 SDValue Load = DAG.getExtLoad(
10003 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10004 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10005 LD->getMemOperand());
10006 SDValue Val = Load;
10007 if (DAG.getDataLayout().isBigEndian())
10008 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10009 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10010 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10011 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10012 if (MemVT != MVT::v16i1)
10013 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10014 DAG.getConstant(0, dl, MVT::i32));
10015 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
10016}
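// For example (illustrative): a v4i1 predicate load is emitted as an i32
// any-extending load of the 4 predicate bits, cast to v16i1 via
// ARMISD::PREDICATE_CAST, and the original 4 lanes are kept with
// EXTRACT_SUBVECTOR; on big endian the loaded word is first bit-reversed and
// shifted right by 32 - 4 = 28 so the predicate bits land in the low bits.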
10017
10018void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10019 SelectionDAG &DAG) const {
10020 LoadSDNode *LD = cast<LoadSDNode>(N);
10021 EVT MemVT = LD->getMemoryVT();
10022 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10023
10024 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10025 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10026 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10027 SDLoc dl(N);
10028 SDValue Result = DAG.getMemIntrinsicNode(
10029 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10030 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
10031 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10032 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10033 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10034 Results.append({Pair, Result.getValue(2)});
10035 }
10036}
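// e.g. (illustrative) a sufficiently aligned 'load volatile i64' on an
// ARMv5TE or later non-Thumb1 target becomes a single LDRD producing two i32
// halves, which are then re-packed into the original i64 with BUILD_PAIR
// (with the halves swapped on big-endian targets).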
10037
10038 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
10039 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10040 EVT MemVT = ST->getMemoryVT();
10041 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10042 MemVT == MVT::v16i1) &&
10043 "Expected a predicate type!");
10044 assert(MemVT == ST->getValue().getValueType());
10045 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
10046 assert(ST->isUnindexed() && "Expected a unindexed store");
10047
10048 // Only store the v2i1/v4i1/v8i1 worth of bits, via a build_vector with the
10049 // top bits unset, followed by a scalar store.
10050 SDLoc dl(Op);
10051 SDValue Build = ST->getValue();
10052 if (MemVT != MVT::v16i1) {
10053 SmallVector<SDValue, 16> Ops;
10054 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
10055 unsigned Elt = DAG.getDataLayout().isBigEndian()
10056 ? MemVT.getVectorNumElements() - I - 1
10057 : I;
10058 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10059 DAG.getConstant(Elt, dl, MVT::i32)));
10060 }
10061 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10062 Ops.push_back(DAG.getUNDEF(MVT::i32));
10063 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10064 }
10065 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
10066 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10067 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10068 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10069 DAG.getConstant(16, dl, MVT::i32));
10070 return DAG.getTruncStore(
10071 ST->getChain(), dl, GRP, ST->getBasePtr(),
10072 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10073 ST->getMemOperand());
10074}
10075
10076 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
10077 const ARMSubtarget *Subtarget) {
10078 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
10079 EVT MemVT = ST->getMemoryVT();
10080 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10081
10082 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10083 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10084 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10085 SDNode *N = Op.getNode();
10086 SDLoc dl(N);
10087
10088 SDValue Lo = DAG.getNode(
10089 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10090 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10091 MVT::i32));
10092 SDValue Hi = DAG.getNode(
10093 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10094 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10095 MVT::i32));
10096
10097 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10098 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10099 MemVT, ST->getMemOperand());
10100 } else if (Subtarget->hasMVEIntegerOps() &&
10101 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10102 MemVT == MVT::v16i1))) {
10103 return LowerPredicateStore(Op, DAG);
10104 }
10105
10106 return SDValue();
10107}
10108
10109static bool isZeroVector(SDValue N) {
10110 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10111 (N->getOpcode() == ARMISD::VMOVIMM &&
10112 isNullConstant(N->getOperand(0))));
10113}
10114
10115 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
10116 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
10117 MVT VT = Op.getSimpleValueType();
10118 SDValue Mask = N->getMask();
10119 SDValue PassThru = N->getPassThru();
10120 SDLoc dl(Op);
10121
10122 if (isZeroVector(PassThru))
10123 return Op;
10124
10125 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10126 // zero too, and other values are lowered to a select.
10127 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10128 DAG.getTargetConstant(0, dl, MVT::i32));
10129 SDValue NewLoad = DAG.getMaskedLoad(
10130 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10131 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10132 N->getExtensionType(), N->isExpandingLoad());
10133 SDValue Combo = NewLoad;
10134 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10135 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10136 isZeroVector(PassThru->getOperand(0));
10137 if (!PassThru.isUndef() && !PassThruIsCastZero)
10138 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
10139 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10140}
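// e.g. (illustrative) a masked load whose passthru is zeroinitializer (or
// undef) maps directly onto the zeroing semantics of the MVE predicated
// vector load, so no extra code is needed; any other passthru requires the
// VSELECT emitted above to merge back the inactive lanes.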
10141
10142 static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
10143 const ARMSubtarget *ST) {
10144 if (!ST->hasMVEIntegerOps())
10145 return SDValue();
10146
10147 SDLoc dl(Op);
10148 unsigned BaseOpcode = 0;
10149 switch (Op->getOpcode()) {
10150 default: llvm_unreachable("Expected VECREDUCE opcode");
10151 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10152 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10153 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10154 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10155 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10156 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10157 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10158 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10159 }
10160
10161 SDValue Op0 = Op->getOperand(0);
10162 EVT VT = Op0.getValueType();
10163 EVT EltVT = VT.getVectorElementType();
10164 unsigned NumElts = VT.getVectorNumElements();
10165 unsigned NumActiveLanes = NumElts;
10166
10167 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10168 NumActiveLanes == 2) &&
10169 "Only expected a power 2 vector size");
10170
10171 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
10172 // allows us to easily extract vector elements from the lanes.
10173 while (NumActiveLanes > 4) {
10174 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10175 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10176 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10177 NumActiveLanes /= 2;
10178 }
10179
10180 SDValue Res;
10181 if (NumActiveLanes == 4) {
10182 // The remaining 4 elements are reduced sequentially with BaseOpcode.
10183 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10184 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10185 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10186 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10187 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10188 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10189 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10190 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10191 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10192 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10193 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10194 } else {
10195 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10196 DAG.getConstant(0, dl, MVT::i32));
10197 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10198 DAG.getConstant(1, dl, MVT::i32));
10199 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10200 }
10201
10202 // Result type may be wider than element type.
10203 if (EltVT != Op->getValueType(0))
10204 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10205 return Res;
10206}
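// For example (illustrative): a v8i16 VECREDUCE_MUL performs one
// multiply-with-VREV32 step (8 -> 4 active lanes), then extracts lanes
// 0, 2, 4 and 6 and combines them with three scalar multiplies.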
10207
10208 static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10209 const ARMSubtarget *ST) {
10210 if (!ST->hasMVEFloatOps())
10211 return SDValue();
10212 return LowerVecReduce(Op, DAG, ST);
10213}
10214
10215 static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10216 const ARMSubtarget *ST) {
10217 if (!ST->hasNEON())
10218 return SDValue();
10219
10220 SDLoc dl(Op);
10221 SDValue Op0 = Op->getOperand(0);
10222 EVT VT = Op0.getValueType();
10223 EVT EltVT = VT.getVectorElementType();
10224
10225 unsigned PairwiseIntrinsic = 0;
10226 switch (Op->getOpcode()) {
10227 default:
10228 llvm_unreachable("Expected VECREDUCE opcode");
10229 case ISD::VECREDUCE_UMIN:
10230 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10231 break;
10232 case ISD::VECREDUCE_UMAX:
10233 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10234 break;
10235 case ISD::VECREDUCE_SMIN:
10236 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10237 break;
10238 case ISD::VECREDUCE_SMAX:
10239 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10240 break;
10241 }
10242 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10243
10244 unsigned NumElts = VT.getVectorNumElements();
10245 unsigned NumActiveLanes = NumElts;
10246
10247 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10248 NumActiveLanes == 2) &&
10249 "Only expected a power 2 vector size");
10250
10251 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10252 if (VT.is128BitVector()) {
10253 SDValue Lo, Hi;
10254 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10255 VT = Lo.getValueType();
10256 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10257 NumActiveLanes /= 2;
10258 }
10259
10260 // Use pairwise reductions until one lane remains
10261 while (NumActiveLanes > 1) {
10262 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10263 NumActiveLanes /= 2;
10264 }
10265
10266 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10267 DAG.getConstant(0, dl, MVT::i32));
10268
10269 // Result type may be wider than element type.
10270 if (EltVT != Op.getValueType()) {
10271 unsigned Extend = 0;
10272 switch (Op->getOpcode()) {
10273 default:
10274 llvm_unreachable("Expected VECREDUCE opcode");
10275 case ISD::VECREDUCE_UMIN:
10276 case ISD::VECREDUCE_UMAX:
10277 Extend = ISD::ZERO_EXTEND;
10278 break;
10279 case ISD::VECREDUCE_SMIN:
10280 case ISD::VECREDUCE_SMAX:
10281 Extend = ISD::SIGN_EXTEND;
10282 break;
10283 }
10284 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10285 }
10286 return Res;
10287}
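// For example (illustrative): a v4i32 VECREDUCE_UMAX splits into two v2i32
// halves combined with one vpmax.u32, a second vpmax.u32 of that result with
// itself leaves the maximum in lane 0, and the scalar is read back with
// EXTRACT_VECTOR_ELT.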
10288
10289 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10290 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10291 // Acquire/Release load/store is not legal for targets without a dmb or
10292 // equivalent available.
10293 return SDValue();
10294
10295 // Monotonic load/store is legal for all targets.
10296 return Op;
10297}
10298
10299 static void ReplaceREADCYCLECOUNTER(SDNode *N,
10300 SmallVectorImpl<SDValue> &Results,
10301 SelectionDAG &DAG,
10302 const ARMSubtarget *Subtarget) {
10303 SDLoc DL(N);
10304 // Under Power Management extensions, the cycle-count is:
10305 // mrc p15, #0, <Rt>, c9, c13, #0
10306 SDValue Ops[] = { N->getOperand(0), // Chain
10307 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10308 DAG.getTargetConstant(15, DL, MVT::i32),
10309 DAG.getTargetConstant(0, DL, MVT::i32),
10310 DAG.getTargetConstant(9, DL, MVT::i32),
10311 DAG.getTargetConstant(13, DL, MVT::i32),
10312 DAG.getTargetConstant(0, DL, MVT::i32)
10313 };
10314
10315 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10316 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10317 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10318 DAG.getConstant(0, DL, MVT::i32)));
10319 Results.push_back(Cycles32.getValue(1));
10320}
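// The 64-bit READCYCLECOUNTER result is therefore the 32-bit cycle count read
// via mrc, paired with a zero high word, plus the chain from the intrinsic.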
10321
10322 static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10323 SDValue V1) {
10324 SDLoc dl(V0.getNode());
10325 SDValue RegClass =
10326 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10327 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10328 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10329 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10330 return SDValue(
10331 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10332}
10333
10334 static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10335 SDLoc dl(V.getNode());
10336 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10337 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10338 if (isBigEndian)
10339 std::swap(VLo, VHi);
10340 return createGPRPairNode2xi32(DAG, VLo, VHi);
10341}
10342
10343 static void ReplaceCMP_SWAP_64Results(SDNode *N,
10344 SmallVectorImpl<SDValue> &Results,
10345 SelectionDAG &DAG) {
10346 assert(N->getValueType(0) == MVT::i64 &&
10347 "AtomicCmpSwap on types less than 64 should be legal");
10348 SDValue Ops[] = {
10349 createGPRPairNode2xi32(DAG, N->getOperand(1),
10350 DAG.getUNDEF(MVT::i32)), // pointer, temp
10351 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10352 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10353 N->getOperand(0), // chain in
10354 };
10355 SDNode *CmpSwap = DAG.getMachineNode(
10356 ARM::CMP_SWAP_64, SDLoc(N),
10357 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10358
10359 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10360 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10361
10362 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10363
10364 SDValue Lo =
10365 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10366 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10367 SDValue Hi =
10368 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10369 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10370 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10371 Results.push_back(SDValue(CmpSwap, 2));
10372}
10373
10374SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10375 SDLoc dl(Op);
10376 EVT VT = Op.getValueType();
10377 SDValue Chain = Op.getOperand(0);
10378 SDValue LHS = Op.getOperand(1);
10379 SDValue RHS = Op.getOperand(2);
10380 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10381 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10382
10383 // If we don't have instructions of this float type then soften to a libcall
10384 // and use SETCC instead.
10385 if (isUnsupportedFloatingType(LHS.getValueType())) {
10386 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10387 Chain, IsSignaling);
10388 if (!RHS.getNode()) {
10389 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10390 CC = ISD::SETNE;
10391 }
10392 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10393 DAG.getCondCode(CC));
10394 return DAG.getMergeValues({Result, Chain}, dl);
10395 }
10396
10397 ARMCC::CondCodes CondCode, CondCode2;
10398 FPCCToARMCC(CC, CondCode, CondCode2);
10399
10400 SDValue True = DAG.getConstant(1, dl, VT);
10401 SDValue False = DAG.getConstant(0, dl, VT);
10402 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10403 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10404 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10405 if (CondCode2 != ARMCC::AL) {
10406 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10407 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10408 }
10409 return DAG.getMergeValues({Result, Chain}, dl);
10410}
10411
10412SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10413 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10414
10415 EVT VT = getPointerTy(DAG.getDataLayout());
10416 int FI = MFI.CreateFixedObject(4, 0, false);
10417 return DAG.getFrameIndex(FI, VT);
10418}
10419
10420SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10421 SelectionDAG &DAG) const {
10422 SDLoc DL(Op);
10423 MakeLibCallOptions CallOptions;
10424 MVT SVT = Op.getOperand(0).getSimpleValueType();
10425 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10426 SDValue Res =
10427 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10428 return DAG.getBitcast(MVT::i32, Res);
10429}
10430
10431SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10432 SDLoc dl(Op);
10433 SDValue LHS = Op.getOperand(0);
10434 SDValue RHS = Op.getOperand(1);
10435
10436 // Determine if this is a signed or unsigned comparison
10437 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10438
10439 // Special case for Thumb1 UCMP only
10440 if (!IsSigned && Subtarget->isThumb1Only()) {
10441 // For Thumb unsigned comparison, use this sequence:
10442 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10443 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10444 // cmp r1, r0 ; compare RHS with LHS
10445 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10446 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10447
10448 // First subtraction: LHS - RHS
10449 SDValue Sub1WithFlags = DAG.getNode(
10450 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10451 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10452 SDValue Flags1 = Sub1WithFlags.getValue(1);
10453
10454 // SUBE: Sub1Result - Sub1Result - !carry
10455 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10456 SDValue Sbc1 =
10457 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10458 Sub1Result, Sub1Result, Flags1);
10459 SDValue Sbc1Result = Sbc1.getValue(0);
10460
10461 // Second comparison: RHS vs LHS (reverse comparison)
10462 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10463
10464 // SUBE: RHS - RHS - !carry
10465 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10466 SDValue Sbc2 = DAG.getNode(
10467 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10468 SDValue Sbc2Result = Sbc2.getValue(0);
10469
10470 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10471 SDValue Result =
10472 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10473 if (Op.getValueType() != MVT::i32)
10474 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10475
10476 return Result;
10477 }
10478
10479 // For the ARM assembly pattern:
10480 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10481 // movgt r0, #1 ; if LHS > RHS, set result to 1
10482 //              ; (GT for signed, HI for unsigned)
10483 // mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10484 //              ; if LHS == RHS, result remains 0 from the subs
10485
10486 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10487 unsigned Opcode = ARMISD::SUBC;
10488
10489 // Check if RHS is a subtraction against 0: (0 - X)
10490 if (RHS.getOpcode() == ISD::SUB) {
10491 SDValue SubLHS = RHS.getOperand(0);
10492 SDValue SubRHS = RHS.getOperand(1);
10493
10494 // Check if it's 0 - X
10495 if (isNullConstant(SubLHS)) {
10496 bool CanUseAdd = false;
10497 if (IsSigned) {
10498 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10499 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10501 .isMinSignedValue()) {
10502 CanUseAdd = true;
10503 }
10504 } else {
10505 // For UCMP: only if X is known to never be zero
10506 if (DAG.isKnownNeverZero(SubRHS)) {
10507 CanUseAdd = true;
10508 }
10509 }
10510
10511 if (CanUseAdd) {
10512 Opcode = ARMISD::ADDC;
10513 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10514 // LHS - (0 - X)
10515 }
10516 }
10517 }
10518
10519 // Generate the operation with flags
10520 SDValue OpWithFlags =
10521 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10522
10523 SDValue OpResult = OpWithFlags.getValue(0);
10524 SDValue Flags = OpWithFlags.getValue(1);
10525
10526 // Constants for conditional moves
10527 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10528 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10529
10530 // Select condition codes based on signed vs unsigned
10531 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10532 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10533
10534 // First conditional move: if greater than, set to 1
10535 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10536 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10537 GTCondValue, Flags);
10538
10539 // Second conditional move: if less than, set to -1
10540 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10541 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10542 LTCondValue, Flags);
10543
10544 if (Op.getValueType() != MVT::i32)
10545 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10546
10547 return Result2;
10548}
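// For example (illustrative): 'scmp(a, 0 - b)' where b is known not to be
// INT_MIN is lowered as adds a, b followed by the same two conditional moves,
// avoiding an explicit negation of b.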
10549
10550 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10551 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10552 switch (Op.getOpcode()) {
10553 default: llvm_unreachable("Don't know how to custom lower this!");
10554 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10555 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10556 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10557 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10558 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10559 case ISD::SELECT: return LowerSELECT(Op, DAG);
10560 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10561 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10562 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10563 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10564 case ISD::VASTART: return LowerVASTART(Op, DAG);
10565 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10566 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10567 case ISD::SINT_TO_FP:
10568 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10571 case ISD::FP_TO_SINT:
10572 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10573 case ISD::FP_TO_SINT_SAT:
10574 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10575 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10576 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10577 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10578 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10579 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10580 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10581 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10582 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10583 Subtarget);
10584 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10585 case ISD::SHL:
10586 case ISD::SRL:
10587 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10588 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10589 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10590 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10591 case ISD::SRL_PARTS:
10592 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10593 case ISD::CTTZ:
10594 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10595 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10596 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10597 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10598 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10599 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10600 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10601 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10602 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10603 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10604 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10605 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10606 case ISD::SIGN_EXTEND:
10607 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10608 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10609 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10610 case ISD::SET_FPMODE:
10611 return LowerSET_FPMODE(Op, DAG);
10612 case ISD::RESET_FPMODE:
10613 return LowerRESET_FPMODE(Op, DAG);
10614 case ISD::MUL: return LowerMUL(Op, DAG);
10615 case ISD::SDIV:
10616 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10617 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10618 return LowerSDIV(Op, DAG, Subtarget);
10619 case ISD::UDIV:
10620 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10621 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10622 return LowerUDIV(Op, DAG, Subtarget);
10623 case ISD::UADDO_CARRY:
10624 case ISD::USUBO_CARRY:
10625 return LowerUADDSUBO_CARRY(Op, DAG);
10626 case ISD::SADDO:
10627 case ISD::SSUBO:
10628 return LowerSignedALUO(Op, DAG);
10629 case ISD::UADDO:
10630 case ISD::USUBO:
10631 return LowerUnsignedALUO(Op, DAG);
10632 case ISD::SADDSAT:
10633 case ISD::SSUBSAT:
10634 case ISD::UADDSAT:
10635 case ISD::USUBSAT:
10636 return LowerADDSUBSAT(Op, DAG, Subtarget);
10637 case ISD::LOAD:
10638 return LowerPredicateLoad(Op, DAG);
10639 case ISD::STORE:
10640 return LowerSTORE(Op, DAG, Subtarget);
10641 case ISD::MLOAD:
10642 return LowerMLOAD(Op, DAG);
10643 case ISD::VECREDUCE_MUL:
10644 case ISD::VECREDUCE_AND:
10645 case ISD::VECREDUCE_OR:
10646 case ISD::VECREDUCE_XOR:
10647 return LowerVecReduce(Op, DAG, Subtarget);
10648 case ISD::VECREDUCE_FADD:
10649 case ISD::VECREDUCE_FMUL:
10650 case ISD::VECREDUCE_FMIN:
10651 case ISD::VECREDUCE_FMAX:
10652 return LowerVecReduceF(Op, DAG, Subtarget);
10653 case ISD::VECREDUCE_UMIN:
10654 case ISD::VECREDUCE_UMAX:
10655 case ISD::VECREDUCE_SMIN:
10656 case ISD::VECREDUCE_SMAX:
10657 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10658 case ISD::ATOMIC_LOAD:
10659 case ISD::ATOMIC_STORE:
10660 return LowerAtomicLoadStore(Op, DAG);
10661 case ISD::SDIVREM:
10662 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10663 case ISD::DYNAMIC_STACKALLOC:
10664 if (Subtarget->isTargetWindows())
10665 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10666 llvm_unreachable("Don't know how to custom lower this!");
10667 case ISD::STRICT_FP_ROUND:
10668 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10669 case ISD::STRICT_FP_EXTEND:
10670 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10671 case ISD::STRICT_FSETCC:
10672 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10673 case ISD::SPONENTRY:
10674 return LowerSPONENTRY(Op, DAG);
10675 case ISD::FP_TO_BF16:
10676 return LowerFP_TO_BF16(Op, DAG);
10677 case ARMISD::WIN__DBZCHK: return SDValue();
10678 case ISD::UCMP:
10679 case ISD::SCMP:
10680 return LowerCMP(Op, DAG);
10681 case ISD::ABS:
10682 return LowerABS(Op, DAG);
10683 case ISD::STRICT_LROUND:
10684 case ISD::STRICT_LLROUND:
10685 case ISD::STRICT_LRINT:
10686 case ISD::STRICT_LLRINT: {
10687 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10688 Op.getOperand(1).getValueType() == MVT::bf16) &&
10689 "Expected custom lowering of rounding operations only for f16");
10690 SDLoc DL(Op);
10691 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10692 {Op.getOperand(0), Op.getOperand(1)});
10693 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10694 {Ext.getValue(1), Ext.getValue(0)});
10695 }
10696 }
10697}
10698
10699 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10700 SelectionDAG &DAG) {
10701 unsigned IntNo = N->getConstantOperandVal(0);
10702 unsigned Opc = 0;
10703 if (IntNo == Intrinsic::arm_smlald)
10704 Opc = ARMISD::SMLALD;
10705 else if (IntNo == Intrinsic::arm_smlaldx)
10706 Opc = ARMISD::SMLALDX;
10707 else if (IntNo == Intrinsic::arm_smlsld)
10708 Opc = ARMISD::SMLSLD;
10709 else if (IntNo == Intrinsic::arm_smlsldx)
10710 Opc = ARMISD::SMLSLDX;
10711 else
10712 return;
10713
10714 SDLoc dl(N);
10715 SDValue Lo, Hi;
10716 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10717
10718 SDValue LongMul = DAG.getNode(Opc, dl,
10719 DAG.getVTList(MVT::i32, MVT::i32),
10720 N->getOperand(1), N->getOperand(2),
10721 Lo, Hi);
10722 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10723 LongMul.getValue(0), LongMul.getValue(1)));
10724}
10725
10726 /// ReplaceNodeResults - Replace the results of a node with an illegal result
10727 /// type with new values built out of custom code.
10728 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10729 SmallVectorImpl<SDValue> &Results,
10730 SelectionDAG &DAG) const {
10731 SDValue Res;
10732 switch (N->getOpcode()) {
10733 default:
10734 llvm_unreachable("Don't know how to custom expand this!");
10735 case ISD::READ_REGISTER:
10736 ExpandREAD_REGISTER(N, Results, DAG);
10737 break;
10738 case ISD::BITCAST:
10739 Res = ExpandBITCAST(N, DAG, Subtarget);
10740 break;
10741 case ISD::SRL:
10742 case ISD::SRA:
10743 case ISD::SHL:
10744 Res = Expand64BitShift(N, DAG, Subtarget);
10745 break;
10746 case ISD::SREM:
10747 case ISD::UREM:
10748 Res = LowerREM(N, DAG);
10749 break;
10750 case ISD::SDIVREM:
10751 case ISD::UDIVREM:
10752 Res = LowerDivRem(SDValue(N, 0), DAG);
10753 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10754 Results.push_back(Res.getValue(0));
10755 Results.push_back(Res.getValue(1));
10756 return;
10757 case ISD::SADDSAT:
10758 case ISD::SSUBSAT:
10759 case ISD::UADDSAT:
10760 case ISD::USUBSAT:
10761 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10762 break;
10763 case ISD::READCYCLECOUNTER:
10764 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10765 return;
10766 case ISD::UDIV:
10767 case ISD::SDIV:
10768 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10769 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10770 Results);
10771 case ISD::ATOMIC_CMP_SWAP:
10772 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10773 return;
10774 case ISD::INTRINSIC_WO_CHAIN:
10775 return ReplaceLongIntrinsic(N, Results, DAG);
10776 case ISD::LOAD:
10777 LowerLOAD(N, Results, DAG);
10778 break;
10779 case ISD::TRUNCATE:
10780 Res = LowerTruncate(N, DAG, Subtarget);
10781 break;
10782 case ISD::SIGN_EXTEND:
10783 case ISD::ZERO_EXTEND:
10784 Res = LowerVectorExtend(N, DAG, Subtarget);
10785 break;
10786 case ISD::FP_TO_SINT_SAT:
10787 case ISD::FP_TO_UINT_SAT:
10788 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10789 break;
10790 }
10791 if (Res.getNode())
10792 Results.push_back(Res);
10793}
10794
10795//===----------------------------------------------------------------------===//
10796// ARM Scheduler Hooks
10797//===----------------------------------------------------------------------===//
10798
10799/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10800/// registers the function context.
10801void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10802 MachineBasicBlock *MBB,
10803 MachineBasicBlock *DispatchBB,
10804 int FI) const {
10805 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10806 "ROPI/RWPI not currently supported with SjLj");
10807 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10808 DebugLoc dl = MI.getDebugLoc();
10809 MachineFunction *MF = MBB->getParent();
10810 MachineRegisterInfo *MRI = &MF->getRegInfo();
10811 MachineConstantPool *MCP = MF->getConstantPool();
10812 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10813 const Function &F = MF->getFunction();
10814
10815 bool isThumb = Subtarget->isThumb();
10816 bool isThumb2 = Subtarget->isThumb2();
10817
10818 unsigned PCLabelId = AFI->createPICLabelUId();
10819 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10820 ARMConstantPoolValue *CPV =
10821 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10822 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10823
10824 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10825 : &ARM::GPRRegClass;
10826
10827 // Grab constant pool and fixed stack memory operands.
10828 MachineMemOperand *CPMMO =
10831
10832 MachineMemOperand *FIMMOSt =
10835
10836 // Load the address of the dispatch MBB into the jump buffer.
10837 if (isThumb2) {
10838 // Incoming value: jbuf
10839 // ldr.n r5, LCPI1_1
10840 // orr r5, r5, #1
10841 // add r5, pc
10842 // str r5, [$jbuf, #+4] ; &jbuf[1]
10843 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10844 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10846 .addMemOperand(CPMMO)
10848 // Set the low bit because of thumb mode.
10849 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10850 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10851 .addReg(NewVReg1, RegState::Kill)
10852 .addImm(0x01)
10854 .add(condCodeOp());
10855 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10856 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10857 .addReg(NewVReg2, RegState::Kill)
10858 .addImm(PCLabelId);
10859 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10860 .addReg(NewVReg3, RegState::Kill)
10861 .addFrameIndex(FI)
10862 .addImm(36) // &jbuf[1] :: pc
10863 .addMemOperand(FIMMOSt)
10865 } else if (isThumb) {
10866 // Incoming value: jbuf
10867 // ldr.n r1, LCPI1_4
10868 // add r1, pc
10869 // mov r2, #1
10870 // orrs r1, r2
10871 // add r2, $jbuf, #+4 ; &jbuf[1]
10872 // str r1, [r2]
10873 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10874 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10876 .addMemOperand(CPMMO)
10878 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10879 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10880 .addReg(NewVReg1, RegState::Kill)
10881 .addImm(PCLabelId);
10882 // Set the low bit because of thumb mode.
10883 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10884 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10885 .addReg(ARM::CPSR, RegState::Define)
10886 .addImm(1)
10888 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10889 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10890 .addReg(ARM::CPSR, RegState::Define)
10891 .addReg(NewVReg2, RegState::Kill)
10892 .addReg(NewVReg3, RegState::Kill)
10894 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10895 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10896 .addFrameIndex(FI)
10897 .addImm(36); // &jbuf[1] :: pc
10898 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10899 .addReg(NewVReg4, RegState::Kill)
10900 .addReg(NewVReg5, RegState::Kill)
10901 .addImm(0)
10902 .addMemOperand(FIMMOSt)
10904 } else {
10905 // Incoming value: jbuf
10906 // ldr r1, LCPI1_1
10907 // add r1, pc, r1
10908 // str r1, [$jbuf, #+4] ; &jbuf[1]
10909 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10910 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10912 .addImm(0)
10913 .addMemOperand(CPMMO)
10915 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10916 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10917 .addReg(NewVReg1, RegState::Kill)
10918 .addImm(PCLabelId)
10920 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10921 .addReg(NewVReg2, RegState::Kill)
10922 .addFrameIndex(FI)
10923 .addImm(36) // &jbuf[1] :: pc
10924 .addMemOperand(FIMMOSt)
10926 }
10927}
10928
10929void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10930 MachineBasicBlock *MBB) const {
10931 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10932 DebugLoc dl = MI.getDebugLoc();
10933 MachineFunction *MF = MBB->getParent();
10934 MachineRegisterInfo *MRI = &MF->getRegInfo();
10935 MachineFrameInfo &MFI = MF->getFrameInfo();
10936 int FI = MFI.getFunctionContextIndex();
10937
10938 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10939 : &ARM::GPRnopcRegClass;
10940
10941 // Get a mapping of the call site numbers to all of the landing pads they're
10942 // associated with.
10943 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10944 unsigned MaxCSNum = 0;
10945 for (MachineBasicBlock &BB : *MF) {
10946 if (!BB.isEHPad())
10947 continue;
10948
10949 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10950 // pad.
10951 for (MachineInstr &II : BB) {
10952 if (!II.isEHLabel())
10953 continue;
10954
10955 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10956 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10957
10958 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10959 for (unsigned Idx : CallSiteIdxs) {
10960 CallSiteNumToLPad[Idx].push_back(&BB);
10961 MaxCSNum = std::max(MaxCSNum, Idx);
10962 }
10963 break;
10964 }
10965 }
10966
10967 // Get an ordered list of the machine basic blocks for the jump table.
10968 std::vector<MachineBasicBlock*> LPadList;
10969 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10970 LPadList.reserve(CallSiteNumToLPad.size());
10971 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10972 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10973 for (MachineBasicBlock *MBB : MBBList) {
10974 LPadList.push_back(MBB);
10975 InvokeBBs.insert_range(MBB->predecessors());
10976 }
10977 }
10978
10979 assert(!LPadList.empty() &&
10980 "No landing pad destinations for the dispatch jump table!");
10981
10982 // Create the jump table and associated information.
10983 MachineJumpTableInfo *JTI =
10984 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10985 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10986
10987 // Create the MBBs for the dispatch code.
10988
10989 // Shove the dispatch's address into the return slot in the function context.
10990 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10991 DispatchBB->setIsEHPad();
10992
10993 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10994
10995 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10996 DispatchBB->addSuccessor(TrapBB);
10997
10998 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10999 DispatchBB->addSuccessor(DispContBB);
11000
11001 // Insert the MBBs.
11002 MF->insert(MF->end(), DispatchBB);
11003 MF->insert(MF->end(), DispContBB);
11004 MF->insert(MF->end(), TrapBB);
11005
11006 // Insert code into the entry block that creates and registers the function
11007 // context.
11008 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
11009
11010 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
11013
11014 MachineInstrBuilder MIB;
11015 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
11016
11017 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
11018 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
11019
11020 // Add a register mask with no preserved registers. This results in all
11021 // registers being marked as clobbered. This can't work if the dispatch block
11022 // is in a Thumb1 function and is linked with ARM code which uses the FP
11023 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11025
11026 bool IsPositionIndependent = isPositionIndependent();
11027 unsigned NumLPads = LPadList.size();
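// Each of the three paths below (Thumb2, Thumb1, ARM) loads the call-site
// index from the function context, compares it against NumLPads and branches
// to TrapBB when it is out of range, then indexes the jump table. For
// example, with 300 landing pads the Thumb-2 path materializes the bound
// with t2MOVi16 and compares with t2CMPrr instead of the immediate-form
// t2CMPri.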
11028 if (Subtarget->isThumb2()) {
11029 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11030 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11031 .addFrameIndex(FI)
11032 .addImm(4)
11033 .addMemOperand(FIMMOLd)
11035
11036 if (NumLPads < 256) {
11037 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11038 .addReg(NewVReg1)
11039 .addImm(LPadList.size())
11041 } else {
11042 Register VReg1 = MRI->createVirtualRegister(TRC);
11043 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11044 .addImm(NumLPads & 0xFFFF)
11046
11047 unsigned VReg2 = VReg1;
11048 if ((NumLPads & 0xFFFF0000) != 0) {
11049 VReg2 = MRI->createVirtualRegister(TRC);
11050 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11051 .addReg(VReg1)
11052 .addImm(NumLPads >> 16)
11054 }
11055
11056 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11057 .addReg(NewVReg1)
11058 .addReg(VReg2)
11060 }
11061
11062 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11063 .addMBB(TrapBB)
11065 .addReg(ARM::CPSR);
11066
11067 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11068 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11069 .addJumpTableIndex(MJTI)
11071
11072 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11073 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11074 .addReg(NewVReg3, RegState::Kill)
11075 .addReg(NewVReg1)
11078 .add(condCodeOp());
11079
11080 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11081 .addReg(NewVReg4, RegState::Kill)
11082 .addReg(NewVReg1)
11083 .addJumpTableIndex(MJTI);
11084 } else if (Subtarget->isThumb()) {
11085 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11086 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11087 .addFrameIndex(FI)
11088 .addImm(1)
11089 .addMemOperand(FIMMOLd)
11091
11092 if (NumLPads < 256) {
11093 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11094 .addReg(NewVReg1)
11095 .addImm(NumLPads)
11097 } else {
11098 MachineConstantPool *ConstantPool = MF->getConstantPool();
11099 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11100 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11101
11102 // MachineConstantPool wants an explicit alignment.
11103 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11104 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11105
11106 Register VReg1 = MRI->createVirtualRegister(TRC);
11107 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11108 .addReg(VReg1, RegState::Define)
11111 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11112 .addReg(NewVReg1)
11113 .addReg(VReg1)
11115 }
11116
11117 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11118 .addMBB(TrapBB)
11120 .addReg(ARM::CPSR);
11121
11122 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11123 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11124 .addReg(ARM::CPSR, RegState::Define)
11125 .addReg(NewVReg1)
11126 .addImm(2)
11128
11129 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11130 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11131 .addJumpTableIndex(MJTI)
11133
11134 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11135 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11136 .addReg(ARM::CPSR, RegState::Define)
11137 .addReg(NewVReg2, RegState::Kill)
11138 .addReg(NewVReg3)
11140
11141 MachineMemOperand *JTMMOLd =
11142 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11144
11145 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11146 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11147 .addReg(NewVReg4, RegState::Kill)
11148 .addImm(0)
11149 .addMemOperand(JTMMOLd)
11151
11152 unsigned NewVReg6 = NewVReg5;
11153 if (IsPositionIndependent) {
11154 NewVReg6 = MRI->createVirtualRegister(TRC);
11155 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11156 .addReg(ARM::CPSR, RegState::Define)
11157 .addReg(NewVReg5, RegState::Kill)
11158 .addReg(NewVReg3)
11160 }
11161
11162 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11163 .addReg(NewVReg6, RegState::Kill)
11164 .addJumpTableIndex(MJTI);
11165 } else {
11166 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11167 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11168 .addFrameIndex(FI)
11169 .addImm(4)
11170 .addMemOperand(FIMMOLd)
11172
11173 if (NumLPads < 256) {
11174 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11175 .addReg(NewVReg1)
11176 .addImm(NumLPads)
11178 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11179 Register VReg1 = MRI->createVirtualRegister(TRC);
11180 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11181 .addImm(NumLPads & 0xFFFF)
11183
11184 unsigned VReg2 = VReg1;
11185 if ((NumLPads & 0xFFFF0000) != 0) {
11186 VReg2 = MRI->createVirtualRegister(TRC);
11187 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11188 .addReg(VReg1)
11189 .addImm(NumLPads >> 16)
11191 }
11192
11193 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11194 .addReg(NewVReg1)
11195 .addReg(VReg2)
11197 } else {
11198 MachineConstantPool *ConstantPool = MF->getConstantPool();
11199 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11200 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11201
11202 // MachineConstantPool wants an explicit alignment.
11203 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11204 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11205
11206 Register VReg1 = MRI->createVirtualRegister(TRC);
11207 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11208 .addReg(VReg1, RegState::Define)
11210 .addImm(0)
11212 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11213 .addReg(NewVReg1)
11214 .addReg(VReg1, RegState::Kill)
11216 }
11217
11218 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11219 .addMBB(TrapBB)
11221 .addReg(ARM::CPSR);
11222
11223 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11224 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11225 .addReg(NewVReg1)
11228 .add(condCodeOp());
11229 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11230 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11231 .addJumpTableIndex(MJTI)
11233
11234 MachineMemOperand *JTMMOLd =
11235 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11237 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11238 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11239 .addReg(NewVReg3, RegState::Kill)
11240 .addReg(NewVReg4)
11241 .addImm(0)
11242 .addMemOperand(JTMMOLd)
11244
11245 if (IsPositionIndependent) {
11246 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11247 .addReg(NewVReg5, RegState::Kill)
11248 .addReg(NewVReg4)
11249 .addJumpTableIndex(MJTI);
11250 } else {
11251 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11252 .addReg(NewVReg5, RegState::Kill)
11253 .addJumpTableIndex(MJTI);
11254 }
11255 }
11256
11257 // Add the jump table entries as successors to the MBB.
11258 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11259 for (MachineBasicBlock *CurMBB : LPadList) {
11260 if (SeenMBBs.insert(CurMBB).second)
11261 DispContBB->addSuccessor(CurMBB);
11262 }
11263
11264 // N.B. the order the invoke BBs are processed in doesn't matter here.
11265 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11267 for (MachineBasicBlock *BB : InvokeBBs) {
11268
11269 // Remove the landing pad successor from the invoke block and replace it
11270 // with the new dispatch block.
11271 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11272 while (!Successors.empty()) {
11273 MachineBasicBlock *SMBB = Successors.pop_back_val();
11274 if (SMBB->isEHPad()) {
11275 BB->removeSuccessor(SMBB);
11276 MBBLPads.push_back(SMBB);
11277 }
11278 }
11279
11280 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11281 BB->normalizeSuccProbs();
11282
11283 // Find the invoke call and mark all of the callee-saved registers as
11284 // 'implicitly defined' so that they're spilled. This prevents code from
11285 // moving instructions to before the EH block, where they will never be
11286 // executed.
11288 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11289 if (!II->isCall()) continue;
11290
11291 DenseSet<unsigned> DefRegs;
11293 OI = II->operands_begin(), OE = II->operands_end();
11294 OI != OE; ++OI) {
11295 if (!OI->isReg()) continue;
11296 DefRegs.insert(OI->getReg());
11297 }
11298
11299 MachineInstrBuilder MIB(*MF, &*II);
11300
11301 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11302 unsigned Reg = SavedRegs[i];
11303 if (Subtarget->isThumb2() &&
11304 !ARM::tGPRRegClass.contains(Reg) &&
11305 !ARM::hGPRRegClass.contains(Reg))
11306 continue;
11307 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11308 continue;
11309 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11310 continue;
11311 if (!DefRegs.contains(Reg))
11313 }
11314
11315 break;
11316 }
11317 }
11318
11319 // Mark all former landing pads as non-landing pads. The dispatch is the only
11320 // landing pad now.
11321 for (MachineBasicBlock *MBBLPad : MBBLPads)
11322 MBBLPad->setIsEHPad(false);
11323
11324 // The instruction is gone now.
11325 MI.eraseFromParent();
11326}
11327
11328static
11330 for (MachineBasicBlock *S : MBB->successors())
11331 if (S != Succ)
11332 return S;
11333 llvm_unreachable("Expecting a BB with two successors!");
11334}
11335
11336 /// Return the load opcode for a given load size. If the load size is >= 8,
11337 /// a NEON opcode will be returned.
11338static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11339 if (LdSize >= 8)
11340 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11341 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11342 if (IsThumb1)
11343 return LdSize == 4 ? ARM::tLDRi
11344 : LdSize == 2 ? ARM::tLDRHi
11345 : LdSize == 1 ? ARM::tLDRBi : 0;
11346 if (IsThumb2)
11347 return LdSize == 4 ? ARM::t2LDR_POST
11348 : LdSize == 2 ? ARM::t2LDRH_POST
11349 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11350 return LdSize == 4 ? ARM::LDR_POST_IMM
11351 : LdSize == 2 ? ARM::LDRH_POST
11352 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11353}
11354
11355 /// Return the store opcode for a given store size. If the store size is >= 8,
11356 /// a NEON opcode will be returned.
11357static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11358 if (StSize >= 8)
11359 return StSize == 16 ? ARM::VST1q32wb_fixed
11360 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11361 if (IsThumb1)
11362 return StSize == 4 ? ARM::tSTRi
11363 : StSize == 2 ? ARM::tSTRHi
11364 : StSize == 1 ? ARM::tSTRBi : 0;
11365 if (IsThumb2)
11366 return StSize == 4 ? ARM::t2STR_POST
11367 : StSize == 2 ? ARM::t2STRH_POST
11368 : StSize == 1 ? ARM::t2STRB_POST : 0;
11369 return StSize == 4 ? ARM::STR_POST_IMM
11370 : StSize == 2 ? ARM::STRH_POST
11371 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11372}
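// For example, getLdOpcode(4, /*IsThumb1=*/false, /*IsThumb2=*/true) yields
// ARM::t2LDR_POST, getLdOpcode(2, /*IsThumb1=*/true, false) yields ARM::tLDRHi,
// and getStOpcode(16, false, false) yields ARM::VST1q32wb_fixed; unsupported
// sizes yield 0, which the emitters below assert against.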
11373
11374 /// Emit a post-increment load operation with the given size. The instructions
11375 /// will be added to BB at Pos.
11377 const TargetInstrInfo *TII, const DebugLoc &dl,
11378 unsigned LdSize, unsigned Data, unsigned AddrIn,
11379 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11380 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11381 assert(LdOpc != 0 && "Should have a load opcode");
11382 if (LdSize >= 8) {
11383 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11384 .addReg(AddrOut, RegState::Define)
11385 .addReg(AddrIn)
11386 .addImm(0)
11388 } else if (IsThumb1) {
11389 // load + update AddrIn
11390 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11391 .addReg(AddrIn)
11392 .addImm(0)
11394 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11395 .add(t1CondCodeOp())
11396 .addReg(AddrIn)
11397 .addImm(LdSize)
11399 } else if (IsThumb2) {
11400 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11401 .addReg(AddrOut, RegState::Define)
11402 .addReg(AddrIn)
11403 .addImm(LdSize)
11405 } else { // arm
11406 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11407 .addReg(AddrOut, RegState::Define)
11408 .addReg(AddrIn)
11409 .addReg(0)
11410 .addImm(LdSize)
11412 }
11413}
11414
11415 /// Emit a post-increment store operation with the given size. The instructions
11416 /// will be added to BB at Pos.
11418 const TargetInstrInfo *TII, const DebugLoc &dl,
11419 unsigned StSize, unsigned Data, unsigned AddrIn,
11420 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11421 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11422 assert(StOpc != 0 && "Should have a store opcode");
11423 if (StSize >= 8) {
11424 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11425 .addReg(AddrIn)
11426 .addImm(0)
11427 .addReg(Data)
11429 } else if (IsThumb1) {
11430 // store + update AddrIn
11431 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11432 .addReg(Data)
11433 .addReg(AddrIn)
11434 .addImm(0)
11436 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11437 .add(t1CondCodeOp())
11438 .addReg(AddrIn)
11439 .addImm(StSize)
11441 } else if (IsThumb2) {
11442 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11443 .addReg(Data)
11444 .addReg(AddrIn)
11445 .addImm(StSize)
11447 } else { // arm
11448 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11449 .addReg(Data)
11450 .addReg(AddrIn)
11451 .addReg(0)
11452 .addImm(StSize)
11454 }
11455}
11456
11458ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11459 MachineBasicBlock *BB) const {
11460 // This pseudo instruction has 4 operands: dst, src, size, alignment.
11461 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11462 // Otherwise, we will generate unrolled scalar copies.
11463 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11464 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11466
11467 Register dest = MI.getOperand(0).getReg();
11468 Register src = MI.getOperand(1).getReg();
11469 unsigned SizeVal = MI.getOperand(2).getImm();
11470 unsigned Alignment = MI.getOperand(3).getImm();
11471 DebugLoc dl = MI.getDebugLoc();
11472
11473 MachineFunction *MF = BB->getParent();
11474 MachineRegisterInfo &MRI = MF->getRegInfo();
11475 unsigned UnitSize = 0;
11476 const TargetRegisterClass *TRC = nullptr;
11477 const TargetRegisterClass *VecTRC = nullptr;
11478
11479 bool IsThumb1 = Subtarget->isThumb1Only();
11480 bool IsThumb2 = Subtarget->isThumb2();
11481 bool IsThumb = Subtarget->isThumb();
11482
11483 if (Alignment & 1) {
11484 UnitSize = 1;
11485 } else if (Alignment & 2) {
11486 UnitSize = 2;
11487 } else {
11488 // Check whether we can use NEON instructions.
11489 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11490 Subtarget->hasNEON()) {
11491 if ((Alignment % 16 == 0) && SizeVal >= 16)
11492 UnitSize = 16;
11493 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11494 UnitSize = 8;
11495 }
11496 // Can't use NEON instructions.
11497 if (UnitSize == 0)
11498 UnitSize = 4;
11499 }
11500
11501 // Select the correct opcode and register class for unit size load/store
11502 bool IsNeon = UnitSize >= 8;
11503 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11504 if (IsNeon)
11505 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11506 : UnitSize == 8 ? &ARM::DPRRegClass
11507 : nullptr;
11508
11509 unsigned BytesLeft = SizeVal % UnitSize;
11510 unsigned LoopSize = SizeVal - BytesLeft;
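// For example, with UnitSize == 16 and SizeVal == 70 this gives LoopSize == 64
// and BytesLeft == 6: four 16-byte NEON copies followed by six byte copies.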
11511
11512 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11513 // Use LDR and STR to copy.
11514 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11515 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11516 unsigned srcIn = src;
11517 unsigned destIn = dest;
11518 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11519 Register srcOut = MRI.createVirtualRegister(TRC);
11520 Register destOut = MRI.createVirtualRegister(TRC);
11521 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11522 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11523 IsThumb1, IsThumb2);
11524 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11525 IsThumb1, IsThumb2);
11526 srcIn = srcOut;
11527 destIn = destOut;
11528 }
11529
11530 // Handle the leftover bytes with LDRB and STRB.
11531 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11532 // [destOut] = STRB_POST(scratch, destIn, 1)
11533 for (unsigned i = 0; i < BytesLeft; i++) {
11534 Register srcOut = MRI.createVirtualRegister(TRC);
11535 Register destOut = MRI.createVirtualRegister(TRC);
11536 Register scratch = MRI.createVirtualRegister(TRC);
11537 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11538 IsThumb1, IsThumb2);
11539 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11540 IsThumb1, IsThumb2);
11541 srcIn = srcOut;
11542 destIn = destOut;
11543 }
11544 MI.eraseFromParent(); // The instruction is gone now.
11545 return BB;
11546 }
11547
11548 // Expand the pseudo op to a loop.
11549 // thisMBB:
11550 // ...
11551 // movw varEnd, # --> with thumb2
11552 // movt varEnd, #
11553 // ldrcp varEnd, idx --> without thumb2
11554 // fallthrough --> loopMBB
11555 // loopMBB:
11556 // PHI varPhi, varEnd, varLoop
11557 // PHI srcPhi, src, srcLoop
11558 // PHI destPhi, dst, destLoop
11559 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11560 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11561 // subs varLoop, varPhi, #UnitSize
11562 // bne loopMBB
11563 // fallthrough --> exitMBB
11564 // exitMBB:
11565 // epilogue to handle left-over bytes
11566 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11567 // [destOut] = STRB_POST(scratch, destLoop, 1)
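// With UnitSize == 4 on Thumb2, for instance, the loop body comes out as a
// t2LDR_POST / t2STR_POST pair, a t2SUBri that sets CPSR, and a t2Bcc back to
// loopMBB, matching the pseudo-code above.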
11568 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11569 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11570 MF->insert(It, loopMBB);
11571 MF->insert(It, exitMBB);
11572
11573 // Set the call frame size on entry to the new basic blocks.
11574 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11575 loopMBB->setCallFrameSize(CallFrameSize);
11576 exitMBB->setCallFrameSize(CallFrameSize);
11577
11578 // Transfer the remainder of BB and its successor edges to exitMBB.
11579 exitMBB->splice(exitMBB->begin(), BB,
11580 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11582
11583 // Load an immediate to varEnd.
11584 Register varEnd = MRI.createVirtualRegister(TRC);
11585 if (Subtarget->useMovt()) {
11586 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11587 varEnd)
11588 .addImm(LoopSize);
11589 } else if (Subtarget->genExecuteOnly()) {
11590 assert(IsThumb && "Non-thumb expected to have used movt");
11591 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11592 } else {
11593 MachineConstantPool *ConstantPool = MF->getConstantPool();
11595 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11596
11597 // MachineConstantPool wants an explicit alignment.
11598 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11599 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11600 MachineMemOperand *CPMMO =
11603
11604 if (IsThumb)
11605 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11606 .addReg(varEnd, RegState::Define)
11609 .addMemOperand(CPMMO);
11610 else
11611 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11612 .addReg(varEnd, RegState::Define)
11614 .addImm(0)
11616 .addMemOperand(CPMMO);
11617 }
11618 BB->addSuccessor(loopMBB);
11619
11620 // Generate the loop body:
11621 // varPhi = PHI(varLoop, varEnd)
11622 // srcPhi = PHI(srcLoop, src)
11623 // destPhi = PHI(destLoop, dst)
11624 MachineBasicBlock *entryBB = BB;
11625 BB = loopMBB;
11626 Register varLoop = MRI.createVirtualRegister(TRC);
11627 Register varPhi = MRI.createVirtualRegister(TRC);
11628 Register srcLoop = MRI.createVirtualRegister(TRC);
11629 Register srcPhi = MRI.createVirtualRegister(TRC);
11630 Register destLoop = MRI.createVirtualRegister(TRC);
11631 Register destPhi = MRI.createVirtualRegister(TRC);
11632
11633 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11634 .addReg(varLoop).addMBB(loopMBB)
11635 .addReg(varEnd).addMBB(entryBB);
11636 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11637 .addReg(srcLoop).addMBB(loopMBB)
11638 .addReg(src).addMBB(entryBB);
11639 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11640 .addReg(destLoop).addMBB(loopMBB)
11641 .addReg(dest).addMBB(entryBB);
11642
11643 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11644 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11645 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11646 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11647 IsThumb1, IsThumb2);
11648 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11649 IsThumb1, IsThumb2);
11650
11651 // Decrement loop variable by UnitSize.
11652 if (IsThumb1) {
11653 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11654 .add(t1CondCodeOp())
11655 .addReg(varPhi)
11656 .addImm(UnitSize)
11658 } else {
11659 MachineInstrBuilder MIB =
11660 BuildMI(*BB, BB->end(), dl,
11661 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11662 MIB.addReg(varPhi)
11663 .addImm(UnitSize)
11665 .add(condCodeOp());
11666 MIB->getOperand(5).setReg(ARM::CPSR);
11667 MIB->getOperand(5).setIsDef(true);
11668 }
11669 BuildMI(*BB, BB->end(), dl,
11670 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11671 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11672
11673 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11674 BB->addSuccessor(loopMBB);
11675 BB->addSuccessor(exitMBB);
11676
11677 // Add epilogue to handle BytesLeft.
11678 BB = exitMBB;
11679 auto StartOfExit = exitMBB->begin();
11680
11681 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11682 // [destOut] = STRB_POST(scratch, destLoop, 1)
11683 unsigned srcIn = srcLoop;
11684 unsigned destIn = destLoop;
11685 for (unsigned i = 0; i < BytesLeft; i++) {
11686 Register srcOut = MRI.createVirtualRegister(TRC);
11687 Register destOut = MRI.createVirtualRegister(TRC);
11688 Register scratch = MRI.createVirtualRegister(TRC);
11689 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11690 IsThumb1, IsThumb2);
11691 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11692 IsThumb1, IsThumb2);
11693 srcIn = srcOut;
11694 destIn = destOut;
11695 }
11696
11697 MI.eraseFromParent(); // The instruction is gone now.
11698 return BB;
11699}
11700
11702ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11703 MachineBasicBlock *MBB) const {
11704 const TargetMachine &TM = getTargetMachine();
11705 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11706 DebugLoc DL = MI.getDebugLoc();
11707
11708 assert(Subtarget->isTargetWindows() &&
11709 "__chkstk is only supported on Windows");
11710 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11711
11712 // __chkstk takes the number of words to allocate on the stack in R4, and
11713 // returns the stack adjustment in number of bytes in R4. This will not
11714 // clobber any other registers (other than the obvious LR).
11715 //
11716 // Although, technically, IP should be considered a register which may be
11717 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11718 // thumb-2 environment, so there is no interworking required. As a result, we
11719 // do not expect a veneer to be emitted by the linker, clobbering IP.
11720 //
11721 // Each module receives its own copy of __chkstk, so no import thunk is
11722 // required, again, ensuring that IP is not clobbered.
11723 //
11724 // Finally, although some linkers may theoretically provide a trampoline for
11725 // out of range calls (which is quite common due to a 32M range limitation of
11726 // branches for Thumb), we can generate the long-call version via
11727 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11728 // IP.
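// For example, a 4096-byte dynamic allocation passes 1024 (words) in R4; on
// return R4 holds 4096 (bytes), which the t2SUBrr emitted below subtracts from
// SP.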
11729
11730 switch (TM.getCodeModel()) {
11731 case CodeModel::Tiny:
11732 llvm_unreachable("Tiny code model not available on ARM.");
11733 case CodeModel::Small:
11734 case CodeModel::Medium:
11735 case CodeModel::Kernel:
11736 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11738 .addExternalSymbol("__chkstk")
11741 .addReg(ARM::R12,
11743 .addReg(ARM::CPSR,
11745 break;
11746 case CodeModel::Large: {
11747 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11748 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11749
11750 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11751 .addExternalSymbol("__chkstk");
11757 .addReg(ARM::R12,
11759 .addReg(ARM::CPSR,
11761 break;
11762 }
11763 }
11764
11765 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11766 .addReg(ARM::SP, RegState::Kill)
11767 .addReg(ARM::R4, RegState::Kill)
11770 .add(condCodeOp());
11771
11772 MI.eraseFromParent();
11773 return MBB;
11774}
11775
11777ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11778 MachineBasicBlock *MBB) const {
11779 DebugLoc DL = MI.getDebugLoc();
11780 MachineFunction *MF = MBB->getParent();
11781 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11782
11783 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11784 MF->insert(++MBB->getIterator(), ContBB);
11785 ContBB->splice(ContBB->begin(), MBB,
11786 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11788 MBB->addSuccessor(ContBB);
11789
11790 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11791 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11792 MF->push_back(TrapBB);
11793 MBB->addSuccessor(TrapBB);
11794
11795 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11796 .addReg(MI.getOperand(0).getReg())
11797 .addImm(0)
11799 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11800 .addMBB(TrapBB)
11802 .addReg(ARM::CPSR);
11803
11804 MI.eraseFromParent();
11805 return ContBB;
11806}
11807
11808// The CPSR operand of SelectItr might be missing a kill marker
11809// because there were multiple uses of CPSR, and ISel didn't know
11810// which to mark. Figure out whether SelectItr should have had a
11811// kill marker, and set it if it should. Returns the correct kill
11812// marker value.
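// For example, when one CMP defines CPSR and two selects read it, the first
// select must not be given a kill flag (the forward scan sees the later read
// and returns false); only the last reader before a new CPSR def, or before
// the end of a block with no live-out CPSR, gets the kill marker added here.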
11815 const TargetRegisterInfo* TRI) {
11816 // Scan forward through BB for a use/def of CPSR.
11817 MachineBasicBlock::iterator miI(std::next(SelectItr));
11818 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11819 const MachineInstr& mi = *miI;
11820 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11821 return false;
11822 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11823 break; // Should have kill-flag - update below.
11824 }
11825
11826 // If we hit the end of the block, check whether CPSR is live into a
11827 // successor.
11828 if (miI == BB->end()) {
11829 for (MachineBasicBlock *Succ : BB->successors())
11830 if (Succ->isLiveIn(ARM::CPSR))
11831 return false;
11832 }
11833
11834 // We found a def, or hit the end of the basic block and CPSR wasn't live
11835 // out. SelectMI should have a kill flag on CPSR.
11836 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11837 return true;
11838}
11839
11840 /// Adds logic in the loop entry MBB to calculate the loop iteration count and
11841 /// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11843 MachineBasicBlock *TpLoopBody,
11844 MachineBasicBlock *TpExit, Register OpSizeReg,
11845 const TargetInstrInfo *TII, DebugLoc Dl,
11847 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
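// For example, n == 100 gives (100 + 15) >> 4 == 7: six full 16-byte
// iterations plus one final iteration that the tail predication limits to the
// remaining 4 bytes.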
11848 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11849 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11850 .addUse(OpSizeReg)
11851 .addImm(15)
11853 .addReg(0);
11854
11855 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11856 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11857 .addUse(AddDestReg, RegState::Kill)
11858 .addImm(4)
11860 .addReg(0);
11861
11862 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11863 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11864 .addUse(LsrDestReg, RegState::Kill);
11865
11866 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11867 .addUse(TotalIterationsReg)
11868 .addMBB(TpExit);
11869
11870 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11871 .addMBB(TpLoopBody)
11873
11874 return TotalIterationsReg;
11875}
11876
11877/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11878/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11879/// loops.
11880static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11881 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11882 const TargetInstrInfo *TII, DebugLoc Dl,
11883 MachineRegisterInfo &MRI, Register OpSrcReg,
11884 Register OpDestReg, Register ElementCountReg,
11885 Register TotalIterationsReg, bool IsMemcpy) {
11886 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11887 // array, loop iteration counter, predication counter.
11888
11889 Register SrcPhiReg, CurrSrcReg;
11890 if (IsMemcpy) {
11891 // Current position in the src array
11892 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11893 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11894 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11895 .addUse(OpSrcReg)
11896 .addMBB(TpEntry)
11897 .addUse(CurrSrcReg)
11898 .addMBB(TpLoopBody);
11899 }
11900
11901 // Current position in the dest array
11902 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11903 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11904 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11905 .addUse(OpDestReg)
11906 .addMBB(TpEntry)
11907 .addUse(CurrDestReg)
11908 .addMBB(TpLoopBody);
11909
11910 // Current loop counter
11911 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11912 Register RemainingLoopIterationsReg =
11913 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11914 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11915 .addUse(TotalIterationsReg)
11916 .addMBB(TpEntry)
11917 .addUse(RemainingLoopIterationsReg)
11918 .addMBB(TpLoopBody);
11919
11920 // Predication counter
11921 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11922 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11923 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11924 .addUse(ElementCountReg)
11925 .addMBB(TpEntry)
11926 .addUse(RemainingElementsReg)
11927 .addMBB(TpLoopBody);
11928
11929 // Pass predication counter to VCTP
11930 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11931 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11932 .addUse(PredCounterPhiReg)
11934 .addReg(0)
11935 .addReg(0);
11936
11937 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11938 .addUse(PredCounterPhiReg)
11939 .addImm(16)
11941 .addReg(0);
11942
11943 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11944 Register SrcValueReg;
11945 if (IsMemcpy) {
11946 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11947 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11948 .addDef(CurrSrcReg)
11949 .addDef(SrcValueReg)
11950 .addReg(SrcPhiReg)
11951 .addImm(16)
11953 .addUse(VccrReg)
11954 .addReg(0);
11955 } else
11956 SrcValueReg = OpSrcReg;
11957
11958 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11959 .addDef(CurrDestReg)
11960 .addUse(SrcValueReg)
11961 .addReg(DestPhiReg)
11962 .addImm(16)
11964 .addUse(VccrReg)
11965 .addReg(0);
11966
11967 // Add the pseudo instructions for decrementing the loop counter and marking
11968 // the end: t2DoLoopDec and t2DoLoopEnd.
11969 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11970 .addUse(LoopCounterPhiReg)
11971 .addImm(1);
11972
11973 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11974 .addUse(RemainingLoopIterationsReg)
11975 .addMBB(TpLoopBody);
11976
11977 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11978 .addMBB(TpExit)
11980}
11981
11983 // KCFI is supported in all ARM/Thumb modes
11984 return true;
11985}
11986
11990 const TargetInstrInfo *TII) const {
11991 assert(MBBI->isCall() && MBBI->getCFIType() &&
11992 "Invalid call instruction for a KCFI check");
11993
11994 MachineOperand *TargetOp = nullptr;
11995 switch (MBBI->getOpcode()) {
11996 // ARM mode opcodes
11997 case ARM::BLX:
11998 case ARM::BLX_pred:
11999 case ARM::BLX_noip:
12000 case ARM::BLX_pred_noip:
12001 case ARM::BX_CALL:
12002 TargetOp = &MBBI->getOperand(0);
12003 break;
12004 case ARM::TCRETURNri:
12005 case ARM::TCRETURNrinotr12:
12006 case ARM::TAILJMPr:
12007 case ARM::TAILJMPr4:
12008 TargetOp = &MBBI->getOperand(0);
12009 break;
12010 // Thumb mode opcodes (Thumb1 and Thumb2)
12011 // Note: Most Thumb call instructions have predicate operands before the
12012 // target register. Format: tBLXr pred, predreg, target_register, ...
12013 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
12014 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
12015 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
12016 TargetOp = &MBBI->getOperand(2);
12017 break;
12018 // Tail call instructions don't have predicates; the target is operand 0.
12019 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
12020 TargetOp = &MBBI->getOperand(0);
12021 break;
12022 default:
12023 llvm_unreachable("Unexpected CFI call opcode");
12024 }
12025
12026 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
12027 TargetOp->setIsRenamable(false);
12028
12029 // Select the appropriate KCFI_CHECK variant based on the instruction set
12030 unsigned KCFICheckOpcode;
12031 if (Subtarget->isThumb()) {
12032 if (Subtarget->isThumb2()) {
12033 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
12034 } else {
12035 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
12036 }
12037 } else {
12038 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
12039 }
12040
12041 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
12042 .addReg(TargetOp->getReg())
12043 .addImm(MBBI->getCFIType())
12044 .getInstr();
12045}
12046
12049 MachineBasicBlock *BB) const {
12050 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
12051 DebugLoc dl = MI.getDebugLoc();
12052 bool isThumb2 = Subtarget->isThumb2();
12053 switch (MI.getOpcode()) {
12054 default: {
12055 MI.print(errs());
12056 llvm_unreachable("Unexpected instr type to insert");
12057 }
12058
12059 // Thumb1 post-indexed loads are really just single-register LDMs.
12060 case ARM::tLDR_postidx: {
12061 MachineOperand Def(MI.getOperand(1));
12062 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12063 .add(Def) // Rn_wb
12064 .add(MI.getOperand(2)) // Rn
12065 .add(MI.getOperand(3)) // PredImm
12066 .add(MI.getOperand(4)) // PredReg
12067 .add(MI.getOperand(0)) // Rt
12068 .cloneMemRefs(MI);
12069 MI.eraseFromParent();
12070 return BB;
12071 }
12072
12073 case ARM::MVE_MEMCPYLOOPINST:
12074 case ARM::MVE_MEMSETLOOPINST: {
12075
12076 // The transformation below expands the MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST
12077 // pseudo into a Tail Predicated (TP) loop. It adds the instructions to
12078 // calculate the iteration count = ceil(size_in_bytes / 16) in the TP entry
12079 // block and adds the relevant instructions in the TP loop body for the
12080 // generation of a WLSTP loop.
12081
12082 // Below is the relevant portion of the CFG after the transformation.
12083 // The Machine Basic Blocks are shown along with branch conditions (in
12084 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12085 // portion of the CFG and may not necessarily be the entry/exit of the
12086 // function.
12087
12088 // (Relevant) CFG after transformation:
12089 // TP entry MBB
12090 // |
12091 // |-----------------|
12092 // (n <= 0) (n > 0)
12093 // | |
12094 // | TP loop Body MBB<--|
12095 // | | |
12096 // \ |___________|
12097 // \ /
12098 // TP exit MBB
12099
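// For a 35-byte memcpy, for example, the entry block computes 3 iterations;
// the first two VCTP8s in the body enable all 16 lanes and the third enables
// only the remaining 3, so no scalar epilogue is required.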
12100 MachineFunction *MF = BB->getParent();
12101 MachineFunctionProperties &Properties = MF->getProperties();
12103
12104 Register OpDestReg = MI.getOperand(0).getReg();
12105 Register OpSrcReg = MI.getOperand(1).getReg();
12106 Register OpSizeReg = MI.getOperand(2).getReg();
12107
12108 // Allocate the required MBBs and add to parent function.
12109 MachineBasicBlock *TpEntry = BB;
12110 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12111 MachineBasicBlock *TpExit;
12112
12113 MF->push_back(TpLoopBody);
12114
12115 // If any instructions are present in the current block after
12116 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12117 // move the instructions into the newly created exit block. If there are no
12118 // instructions, add an explicit branch to the fallthrough block and then
12119 // split.
12120 //
12121 // The split is required for two reasons:
12122 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
12123 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12124 // need to be updated. splitAt() already handles this.
12125 TpExit = BB->splitAt(MI, false);
12126 if (TpExit == BB) {
12127 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12128 "block containing memcpy/memset Pseudo");
12129 TpExit = BB->getFallThrough();
12130 BuildMI(BB, dl, TII->get(ARM::t2B))
12131 .addMBB(TpExit)
12133 TpExit = BB->splitAt(MI, false);
12134 }
12135
12136 // Add logic for iteration count
12137 Register TotalIterationsReg =
12138 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12139
12140 // Add the vectorized (and predicated) loads/store instructions
12141 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12142 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12143 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12144
12145 // Required to avoid conflict with the MachineVerifier during testing.
12146 Properties.resetNoPHIs();
12147
12148 // Connect the blocks
12149 TpEntry->addSuccessor(TpLoopBody);
12150 TpLoopBody->addSuccessor(TpLoopBody);
12151 TpLoopBody->addSuccessor(TpExit);
12152
12153 // Reorder for a more natural layout
12154 TpLoopBody->moveAfter(TpEntry);
12155 TpExit->moveAfter(TpLoopBody);
12156
12157 // Finally, remove the memcpy/memset pseudo instruction.
12158 MI.eraseFromParent();
12159
12160 // Return the exit block as it may contain other instructions requiring a
12161 // custom inserter
12162 return TpExit;
12163 }
12164
12165 // The Thumb2 pre-indexed stores have the same MI operands; they are just
12166 // defined differently in the .td files than in the isel patterns, so
12167 // they need pseudos.
12168 case ARM::t2STR_preidx:
12169 MI.setDesc(TII->get(ARM::t2STR_PRE));
12170 return BB;
12171 case ARM::t2STRB_preidx:
12172 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12173 return BB;
12174 case ARM::t2STRH_preidx:
12175 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12176 return BB;
12177
12178 case ARM::STRi_preidx:
12179 case ARM::STRBi_preidx: {
12180 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12181 : ARM::STRB_PRE_IMM;
12182 // Decode the offset.
12183 unsigned Offset = MI.getOperand(4).getImm();
12184 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12186 if (isSub)
12187 Offset = -Offset;
12188
12189 MachineMemOperand *MMO = *MI.memoperands_begin();
12190 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12191 .add(MI.getOperand(0)) // Rn_wb
12192 .add(MI.getOperand(1)) // Rt
12193 .add(MI.getOperand(2)) // Rn
12194 .addImm(Offset) // offset (skip GPR==zero_reg)
12195 .add(MI.getOperand(5)) // pred
12196 .add(MI.getOperand(6))
12197 .addMemOperand(MMO);
12198 MI.eraseFromParent();
12199 return BB;
12200 }
12201 case ARM::STRr_preidx:
12202 case ARM::STRBr_preidx:
12203 case ARM::STRH_preidx: {
12204 unsigned NewOpc;
12205 switch (MI.getOpcode()) {
12206 default: llvm_unreachable("unexpected opcode!");
12207 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12208 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12209 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12210 }
12211 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12212 for (const MachineOperand &MO : MI.operands())
12213 MIB.add(MO);
12214 MI.eraseFromParent();
12215 return BB;
12216 }
12217
12218 case ARM::tMOVCCr_pseudo: {
12219 // To "insert" a SELECT_CC instruction, we actually have to insert the
12220 // diamond control-flow pattern. The incoming instruction knows the
12221 // destination vreg to set, the condition code register to branch on, the
12222 // true/false values to select between, and a branch opcode to use.
12223 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12225
12226 // thisMBB:
12227 // ...
12228 // TrueVal = ...
12229 // cmpTY ccX, r1, r2
12230 // bCC copy1MBB
12231 // fallthrough --> copy0MBB
12232 MachineBasicBlock *thisMBB = BB;
12233 MachineFunction *F = BB->getParent();
12234 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12235 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12236 F->insert(It, copy0MBB);
12237 F->insert(It, sinkMBB);
12238
12239 // Set the call frame size on entry to the new basic blocks.
12240 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12241 copy0MBB->setCallFrameSize(CallFrameSize);
12242 sinkMBB->setCallFrameSize(CallFrameSize);
12243
12244 // Check whether CPSR is live past the tMOVCCr_pseudo.
12245 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12246 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12247 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12248 copy0MBB->addLiveIn(ARM::CPSR);
12249 sinkMBB->addLiveIn(ARM::CPSR);
12250 }
12251
12252 // Transfer the remainder of BB and its successor edges to sinkMBB.
12253 sinkMBB->splice(sinkMBB->begin(), BB,
12254 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12256
12257 BB->addSuccessor(copy0MBB);
12258 BB->addSuccessor(sinkMBB);
12259
12260 BuildMI(BB, dl, TII->get(ARM::tBcc))
12261 .addMBB(sinkMBB)
12262 .addImm(MI.getOperand(3).getImm())
12263 .addReg(MI.getOperand(4).getReg());
12264
12265 // copy0MBB:
12266 // %FalseValue = ...
12267 // # fallthrough to sinkMBB
12268 BB = copy0MBB;
12269
12270 // Update machine-CFG edges
12271 BB->addSuccessor(sinkMBB);
12272
12273 // sinkMBB:
12274 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12275 // ...
12276 BB = sinkMBB;
12277 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12278 .addReg(MI.getOperand(1).getReg())
12279 .addMBB(copy0MBB)
12280 .addReg(MI.getOperand(2).getReg())
12281 .addMBB(thisMBB);
12282
12283 MI.eraseFromParent(); // The pseudo instruction is gone now.
12284 return BB;
12285 }
12286
12287 case ARM::BCCi64:
12288 case ARM::BCCZi64: {
12289 // If there is an unconditional branch to the other successor, remove it.
12290 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12291
12292 // Compare both parts that make up the double comparison separately for
12293 // equality.
12294 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12295
12296 Register LHS1 = MI.getOperand(1).getReg();
12297 Register LHS2 = MI.getOperand(2).getReg();
12298 if (RHSisZero) {
12299 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12300 .addReg(LHS1)
12301 .addImm(0)
12303 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12304 .addReg(LHS2).addImm(0)
12305 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12306 } else {
12307 Register RHS1 = MI.getOperand(3).getReg();
12308 Register RHS2 = MI.getOperand(4).getReg();
12309 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12310 .addReg(LHS1)
12311 .addReg(RHS1)
12313 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12314 .addReg(LHS2).addReg(RHS2)
12315 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12316 }
12317
12318 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12319 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12320 if (MI.getOperand(0).getImm() == ARMCC::NE)
12321 std::swap(destMBB, exitMBB);
12322
12323 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12324 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12325 if (isThumb2)
12326 BuildMI(BB, dl, TII->get(ARM::t2B))
12327 .addMBB(exitMBB)
12329 else
12330 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12331
12332 MI.eraseFromParent(); // The pseudo instruction is gone now.
12333 return BB;
12334 }
12335
12336 case ARM::Int_eh_sjlj_setjmp:
12337 case ARM::Int_eh_sjlj_setjmp_nofp:
12338 case ARM::tInt_eh_sjlj_setjmp:
12339 case ARM::t2Int_eh_sjlj_setjmp:
12340 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12341 return BB;
12342
12343 case ARM::Int_eh_sjlj_setup_dispatch:
12344 EmitSjLjDispatchBlock(MI, BB);
12345 return BB;
12346 case ARM::COPY_STRUCT_BYVAL_I32:
12347 ++NumLoopByVals;
12348 return EmitStructByval(MI, BB);
12349 case ARM::WIN__CHKSTK:
12350 return EmitLowered__chkstk(MI, BB);
12351 case ARM::WIN__DBZCHK:
12352 return EmitLowered__dbzchk(MI, BB);
12353 }
12354}
12355
12356/// Attaches vregs to MEMCPY that it will use as scratch registers
12357/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12358/// instead of as a custom inserter because we need the use list from the SDNode.
12359static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12360 MachineInstr &MI, const SDNode *Node) {
12361 bool isThumb1 = Subtarget->isThumb1Only();
12362
12363 MachineFunction *MF = MI.getParent()->getParent();
12365 MachineInstrBuilder MIB(*MF, MI);
12366
12367 // If the new dst/src is unused, mark it as dead.
12368 if (!Node->hasAnyUseOfValue(0)) {
12369 MI.getOperand(0).setIsDead(true);
12370 }
12371 if (!Node->hasAnyUseOfValue(1)) {
12372 MI.getOperand(1).setIsDead(true);
12373 }
12374
12375 // The MEMCPY both defines and kills the scratch registers.
12376 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12377 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12378 : &ARM::GPRRegClass);
12380 }
12381}
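// For instance, a MEMCPY whose operand 4 (the scratch count) is 4 gets four
// fresh tGPR (Thumb1) or GPR virtual registers appended here, each defined and
// killed by the MEMCPY itself once it is expanded to LDM/STM.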
12382
12384 SDNode *Node) const {
12385 if (MI.getOpcode() == ARM::MEMCPY) {
12386 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12387 return;
12388 }
12389
12390 const MCInstrDesc *MCID = &MI.getDesc();
12391 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12392 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12393 // operand is still set to noreg. If needed, set the optional operand's
12394 // register to CPSR, and remove the redundant implicit def.
12395 //
12396 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12397
12398 // Rename pseudo opcodes.
12399 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12400 unsigned ccOutIdx;
12401 if (NewOpc) {
12402 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12403 MCID = &TII->get(NewOpc);
12404
12405 assert(MCID->getNumOperands() ==
12406 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12407 && "converted opcode should be the same except for cc_out"
12408 " (and, on Thumb1, pred)");
12409
12410 MI.setDesc(*MCID);
12411
12412 // Add the optional cc_out operand
12413 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12414
12415 // On Thumb1, move all input operands to the end, then add the predicate
12416 if (Subtarget->isThumb1Only()) {
12417 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12418 MI.addOperand(MI.getOperand(1));
12419 MI.removeOperand(1);
12420 }
12421
12422 // Restore the ties
12423 for (unsigned i = MI.getNumOperands(); i--;) {
12424 const MachineOperand& op = MI.getOperand(i);
12425 if (op.isReg() && op.isUse()) {
12426 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12427 if (DefIdx != -1)
12428 MI.tieOperands(DefIdx, i);
12429 }
12430 }
12431
12433 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12434 ccOutIdx = 1;
12435 } else
12436 ccOutIdx = MCID->getNumOperands() - 1;
12437 } else
12438 ccOutIdx = MCID->getNumOperands() - 1;
12439
12440 // Any ARM instruction that sets the 's' bit should specify an optional
12441 // "cc_out" operand in the last operand position.
12442 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12443 assert(!NewOpc && "Optional cc_out operand required");
12444 return;
12445 }
12446 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12447 // since we already have an optional CPSR def.
12448 bool definesCPSR = false;
12449 bool deadCPSR = false;
12450 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12451 ++i) {
12452 const MachineOperand &MO = MI.getOperand(i);
12453 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12454 definesCPSR = true;
12455 if (MO.isDead())
12456 deadCPSR = true;
12457 MI.removeOperand(i);
12458 break;
12459 }
12460 }
12461 if (!definesCPSR) {
12462 assert(!NewOpc && "Optional cc_out operand required");
12463 return;
12464 }
12465 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12466 if (deadCPSR) {
12467 assert(!MI.getOperand(ccOutIdx).getReg() &&
12468 "expect uninitialized optional cc_out operand");
12469 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12470 if (!Subtarget->isThumb1Only())
12471 return;
12472 }
12473
12474 // If this instruction was defined with an optional CPSR def and its dag node
12475 // had a live implicit CPSR def, then activate the optional CPSR def.
12476 MachineOperand &MO = MI.getOperand(ccOutIdx);
12477 MO.setReg(ARM::CPSR);
12478 MO.setIsDef(true);
12479}
12480
12481//===----------------------------------------------------------------------===//
12482// ARM Optimization Hooks
12483//===----------------------------------------------------------------------===//
12484
12485// Helper function that checks if N is a null or all ones constant.
12486static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12488}
12489
12490// Return true if N is conditionally 0 or all ones.
12491// Detects these expressions where cc is an i1 value:
12492//
12493// (select cc 0, y) [AllOnes=0]
12494// (select cc y, 0) [AllOnes=0]
12495// (zext cc) [AllOnes=0]
12496// (sext cc) [AllOnes=0/1]
12497// (select cc -1, y) [AllOnes=1]
12498// (select cc y, -1) [AllOnes=1]
12499//
12500// Invert is set when N is the null/all ones constant when CC is false.
12501// OtherOp is set to the alternative value of N.
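// For example, for N = (select cc, 0, y) with AllOnes == false this sets
// Invert = false and OtherOp = y; for N = (zext cc) it sets Invert = true and
// OtherOp = 1, since (zext cc) is 0 exactly when cc is false and 1 otherwise.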
12503 SDValue &CC, bool &Invert,
12504 SDValue &OtherOp,
12505 SelectionDAG &DAG) {
12506 switch (N->getOpcode()) {
12507 default: return false;
12508 case ISD::SELECT: {
12509 CC = N->getOperand(0);
12510 SDValue N1 = N->getOperand(1);
12511 SDValue N2 = N->getOperand(2);
12512 if (isZeroOrAllOnes(N1, AllOnes)) {
12513 Invert = false;
12514 OtherOp = N2;
12515 return true;
12516 }
12517 if (isZeroOrAllOnes(N2, AllOnes)) {
12518 Invert = true;
12519 OtherOp = N1;
12520 return true;
12521 }
12522 return false;
12523 }
12524 case ISD::ZERO_EXTEND:
12525 // (zext cc) can never be the all ones value.
12526 if (AllOnes)
12527 return false;
12528 [[fallthrough]];
12529 case ISD::SIGN_EXTEND: {
12530 SDLoc dl(N);
12531 EVT VT = N->getValueType(0);
12532 CC = N->getOperand(0);
12533 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12534 return false;
12535 Invert = !AllOnes;
12536 if (AllOnes)
12537 // When looking for an AllOnes constant, N is an sext, and the 'other'
12538 // value is 0.
12539 OtherOp = DAG.getConstant(0, dl, VT);
12540 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12541 // When looking for a 0 constant, N can be zext or sext.
12542 OtherOp = DAG.getConstant(1, dl, VT);
12543 else
12544 OtherOp = DAG.getAllOnesConstant(dl, VT);
12545 return true;
12546 }
12547 }
12548}
12549
12550// Combine a constant select operand into its use:
12551//
12552 // (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
12553 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
12554 // (and (select cc, -1, c), x) -> (select cc, x, (and x, c)) [AllOnes=1]
12555 // (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
12556 // (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
12557//
12558// The transform is rejected if the select doesn't have a constant operand that
12559// is null, or all ones when AllOnes is set.
12560//
12561// Also recognize sext/zext from i1:
12562//
12563// (add (zext cc), x) -> (select cc (add x, 1), x)
12564// (add (sext cc), x) -> (select cc (add x, -1), x)
12565//
12566// These transformations eventually create predicated instructions.
12567//
12568// @param N The node to transform.
12569// @param Slct The N operand that is a select.
12570// @param OtherOp The other N operand (x above).
12571// @param DCI Context.
12572// @param AllOnes Require the select constant to be all ones instead of null.
12573// @returns The new node, or SDValue() on failure.
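// For example, (add (select cc, 0, 3), x) becomes (select cc, x, (add x, 3)),
// and (add (zext cc), x) becomes (select cc, (add x, 1), x); the selects are
// later turned into predicated instructions.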
12574static
12577 bool AllOnes = false) {
12578 SelectionDAG &DAG = DCI.DAG;
12579 EVT VT = N->getValueType(0);
12580 SDValue NonConstantVal;
12581 SDValue CCOp;
12582 bool SwapSelectOps;
12583 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12584 NonConstantVal, DAG))
12585 return SDValue();
12586
12587 // Slct is now known to be the desired identity constant when CC is true.
12588 SDValue TrueVal = OtherOp;
12589 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12590 OtherOp, NonConstantVal);
12591 // Unless SwapSelectOps says CC should be false.
12592 if (SwapSelectOps)
12593 std::swap(TrueVal, FalseVal);
12594
12595 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12596 CCOp, TrueVal, FalseVal);
12597}
12598
12599// Attempt combineSelectAndUse on each operand of a commutative operator N.
12600static
12603 SDValue N0 = N->getOperand(0);
12604 SDValue N1 = N->getOperand(1);
12605 if (N0.getNode()->hasOneUse())
12606 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12607 return Result;
12608 if (N1.getNode()->hasOneUse())
12609 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12610 return Result;
12611 return SDValue();
12612}
12613
12615 // VUZP shuffle node.
12616 if (N->getOpcode() == ARMISD::VUZP)
12617 return true;
12618
12619 // "VUZP" on i32 is an alias for VTRN.
12620 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12621 return true;
12622
12623 return false;
12624}
12625
12628 const ARMSubtarget *Subtarget) {
12629 // Look for ADD(VUZP.0, VUZP.1).
12630 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12631 N0 == N1)
12632 return SDValue();
12633
12634 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12635 if (!N->getValueType(0).is64BitVector())
12636 return SDValue();
12637
12638 // Generate vpadd.
12639 SelectionDAG &DAG = DCI.DAG;
12640 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12641 SDLoc dl(N);
12642 SDNode *Unzip = N0.getNode();
12643 EVT VT = N->getValueType(0);
12644
12646 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12647 TLI.getPointerTy(DAG.getDataLayout())));
12648 Ops.push_back(Unzip->getOperand(0));
12649 Ops.push_back(Unzip->getOperand(1));
12650
12651 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12652}
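// For example, (add (vuzp.0 a, b), (vuzp.1 a, b)) with a, b : v8i8 becomes a
// single int_arm_neon_vpadd of a and b, i.e. one 64-bit VPADD.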
12653
12656 const ARMSubtarget *Subtarget) {
12657 // Check for two extended operands.
12658 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12659 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12660 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12661 N1.getOpcode() == ISD::ZERO_EXTEND))
12662 return SDValue();
12663
12664 SDValue N00 = N0.getOperand(0);
12665 SDValue N10 = N1.getOperand(0);
12666
12667 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12668 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12669 N00 == N10)
12670 return SDValue();
12671
12672 // We only recognize Q register paddl here; this can't be reached until
12673 // after type legalization.
12674 if (!N00.getValueType().is64BitVector() ||
12676 return SDValue();
12677
12678 // Generate vpaddl.
12679 SelectionDAG &DAG = DCI.DAG;
12680 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12681 SDLoc dl(N);
12682 EVT VT = N->getValueType(0);
12683
12685 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12686 unsigned Opcode;
12687 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12688 Opcode = Intrinsic::arm_neon_vpaddls;
12689 else
12690 Opcode = Intrinsic::arm_neon_vpaddlu;
12691 Ops.push_back(DAG.getConstant(Opcode, dl,
12692 TLI.getPointerTy(DAG.getDataLayout())));
12693 EVT ElemTy = N00.getValueType().getVectorElementType();
12694 unsigned NumElts = VT.getVectorNumElements();
12695 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12696 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12697 N00.getOperand(0), N00.getOperand(1));
12698 Ops.push_back(Concat);
12699
12700 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12701}
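// For example, (add (sext (vuzp.0 a, b)), (sext (vuzp.1 a, b))) with a and b
// of type v8i8 and a v8i16 result becomes int_arm_neon_vpaddls applied to
// (concat_vectors a, b), i.e. a single vpaddl.s8.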
12702
12703// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12704// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12705// much easier to match.
12706static SDValue
12707 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12708 TargetLowering::DAGCombinerInfo &DCI,
12709 const ARMSubtarget *Subtarget) {
12710 // Only perform the optimization after legalization, and only if NEON is
12711 // available. We also expect both operands to be BUILD_VECTORs.
12712 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12713 || N0.getOpcode() != ISD::BUILD_VECTOR
12714 || N1.getOpcode() != ISD::BUILD_VECTOR)
12715 return SDValue();
12716
12717 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12718 EVT VT = N->getValueType(0);
12719 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12720 return SDValue();
12721
12722 // Check that the vector operands are of the right form.
12723 // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR
12724 // operands, where N is the size of the formed vector.
12725 // Each EXTRACT_VECTOR should have the same input vector and an odd or even
12726 // index such that we have a pairwise add pattern.
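// For illustration (lanes of a single input vector v, N == 4):
//   N0 = BUILD_VECTOR(v[0], v[2], v[4], v[6])
//   N1 = BUILD_VECTOR(v[1], v[3], v[5], v[7])
// Adding N0 and N1 then performs a pairwise add over v, i.e. a vpaddl.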
12727
12728 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12729 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12730 return SDValue();
12731 SDValue Vec = N0->getOperand(0)->getOperand(0);
12732 SDNode *V = Vec.getNode();
12733 unsigned nextIndex = 0;
12734
12735 // For each operands to the ADD which are BUILD_VECTORs,
12736 // check to see if each of their operands are an EXTRACT_VECTOR with
12737 // the same vector and appropriate index.
12738 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12739 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
12740 N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12741
12742 SDValue ExtVec0 = N0->getOperand(i);
12743 SDValue ExtVec1 = N1->getOperand(i);
12744
12745 // First operand is the vector, verify it's the same.
12746 if (V != ExtVec0->getOperand(0).getNode() ||
12747 V != ExtVec1->getOperand(0).getNode())
12748 return SDValue();
12749
12750 // Second is the constant, verify it's correct.
12751 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12752 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12753
12754 // For the constant, we want to see all the even or all the odd.
12755 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12756 || C1->getZExtValue() != nextIndex+1)
12757 return SDValue();
12758
12759 // Increment index.
12760 nextIndex+=2;
12761 } else
12762 return SDValue();
12763 }
12764
12765 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12766 // we're using the entire input vector, otherwise there's a size/legality
12767 // mismatch somewhere.
12768 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12769 Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
12770 return SDValue();
12771
12772 // Create VPADDL node.
12773 SelectionDAG &DAG = DCI.DAG;
12774 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12775
12776 SDLoc dl(N);
12777
12778 // Build operand list.
12779 SmallVector<SDValue, 8> Ops;
12780 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12781 TLI.getPointerTy(DAG.getDataLayout())));
12782
12783 // Input is the vector.
12784 Ops.push_back(Vec);
12785
12786 // Get widened type and narrowed type.
12787 MVT widenType;
12788 unsigned numElem = VT.getVectorNumElements();
12789
12790 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12791 switch (inputLaneType.getSimpleVT().SimpleTy) {
12792 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12793 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12794 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12795 default:
12796 llvm_unreachable("Invalid vector element type for padd optimization.");
12797 }
12798
12799 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12800 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12801 return DAG.getNode(ExtOp, dl, VT, tmp);
12802}
12803
12804 static SDValue findMUL_LOHI(SDValue V) {
12805 if (V->getOpcode() == ISD::UMUL_LOHI ||
12806 V->getOpcode() == ISD::SMUL_LOHI)
12807 return V;
12808 return SDValue();
12809}
12810
12811static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12812 TargetLowering::DAGCombinerInfo &DCI,
12813 const ARMSubtarget *Subtarget) {
12814 if (!Subtarget->hasBaseDSP())
12815 return SDValue();
12816
12817 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12818 // accumulates the product into a 64-bit value. The 16-bit values will
12819 // be sign extended somehow or SRA'd into 32-bit values
12820 // (addc (adde (mul 16bit, 16bit), lo), hi)
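// For illustration: if one multiplicand is (sra t, 16) its top half is used
// (the T forms), while a value already known to fit in 16 bits uses its
// bottom half (the B forms); e.g. sra(x, 16) * y selects SMLALTB below.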
12821 SDValue Mul = AddcNode->getOperand(0);
12822 SDValue Lo = AddcNode->getOperand(1);
12823 if (Mul.getOpcode() != ISD::MUL) {
12824 Lo = AddcNode->getOperand(0);
12825 Mul = AddcNode->getOperand(1);
12826 if (Mul.getOpcode() != ISD::MUL)
12827 return SDValue();
12828 }
12829
12830 SDValue SRA = AddeNode->getOperand(0);
12831 SDValue Hi = AddeNode->getOperand(1);
12832 if (SRA.getOpcode() != ISD::SRA) {
12833 SRA = AddeNode->getOperand(1);
12834 Hi = AddeNode->getOperand(0);
12835 if (SRA.getOpcode() != ISD::SRA)
12836 return SDValue();
12837 }
12838 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12839 if (Const->getZExtValue() != 31)
12840 return SDValue();
12841 } else
12842 return SDValue();
12843
12844 if (SRA.getOperand(0) != Mul)
12845 return SDValue();
12846
12847 SelectionDAG &DAG = DCI.DAG;
12848 SDLoc dl(AddcNode);
12849 unsigned Opcode = 0;
12850 SDValue Op0;
12851 SDValue Op1;
12852
12853 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12854 Opcode = ARMISD::SMLALBB;
12855 Op0 = Mul.getOperand(0);
12856 Op1 = Mul.getOperand(1);
12857 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12858 Opcode = ARMISD::SMLALBT;
12859 Op0 = Mul.getOperand(0);
12860 Op1 = Mul.getOperand(1).getOperand(0);
12861 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12862 Opcode = ARMISD::SMLALTB;
12863 Op0 = Mul.getOperand(0).getOperand(0);
12864 Op1 = Mul.getOperand(1);
12865 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12866 Opcode = ARMISD::SMLALTT;
12867 Op0 = Mul->getOperand(0).getOperand(0);
12868 Op1 = Mul->getOperand(1).getOperand(0);
12869 }
12870
12871 if (!Op0 || !Op1)
12872 return SDValue();
12873
12874 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12875 Op0, Op1, Lo, Hi);
12876 // Replace the ADD nodes' uses with the SMLAL node's values.
12877 SDValue HiMLALResult(SMLAL.getNode(), 1);
12878 SDValue LoMLALResult(SMLAL.getNode(), 0);
12879
12880 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12881 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12882
12883 // Return original node to notify the driver to stop replacing.
12884 SDValue resNode(AddcNode, 0);
12885 return resNode;
12886}
12887
12888 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12889 TargetLowering::DAGCombinerInfo &DCI,
12890 const ARMSubtarget *Subtarget) {
12891 // Look for multiply add opportunities.
12892 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12893 // each add node consumes a value from ISD::UMUL_LOHI and there is
12894 // a glue link from the first add to the second add.
12895 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12896 // a S/UMLAL instruction.
12897 //                  UMUL_LOHI
12898 //                 / :lo    \ :hi
12899 //                V          \          [no multiline comment]
12900 //   loAdd ->  ADDC         |
12901 //                 \ :carry /
12902 //                  V      V
12903 //                    ADDE <- hiAdd
12904 //
12905 // In the special case where only the higher part of a signed result is used
12906 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12907 // a constant with the exact value of 0x80000000, we recognize we are dealing
12908 // with a "rounded multiply and add" (or subtract) and transform it into
12909 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
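// For example, with (lo, hi) = UMUL_LOHI(a, b):
//   addc = ADDC(lo, x)            (also produces the carry)
//   adde = ADDE(hi, y, addc:carry)
// is replaced by UMLAL/SMLAL(a, b, x, y), whose low result replaces the ADDC
// and whose high result replaces the ADDE.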
12910
12911 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12912 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12913 "Expect an ADDE or SUBE");
12914
12915 assert(AddeSubeNode->getNumOperands() == 3 &&
12916 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12917 "ADDE node has the wrong inputs");
12918
12919 // Check that we are chained to the right ADDC or SUBC node.
12920 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12921 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12922 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12923 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12924 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12925 return SDValue();
12926
12927 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12928 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12929
12930 // Check if the two operands are from the same mul_lohi node.
12931 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12932 return SDValue();
12933
12934 assert(AddcSubcNode->getNumValues() == 2 &&
12935 AddcSubcNode->getValueType(0) == MVT::i32 &&
12936 "Expect ADDC with two result values. First: i32");
12937
12938 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12939 // may be an SMLAL which multiplies two 16-bit values.
12940 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12941 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12942 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12943 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12944 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12945 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12946
12947 // Check for the triangle shape.
12948 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12949 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12950
12951 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12952 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12953 return SDValue();
12954
12955 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12956 bool IsLeftOperandMUL = false;
12957 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12958 if (MULOp == SDValue())
12959 MULOp = findMUL_LOHI(AddeSubeOp1);
12960 else
12961 IsLeftOperandMUL = true;
12962 if (MULOp == SDValue())
12963 return SDValue();
12964
12965 // Figure out the right opcode.
12966 unsigned Opc = MULOp->getOpcode();
12967 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12968
12969 // Figure out the high and low input values to the MLAL node.
12970 SDValue *HiAddSub = nullptr;
12971 SDValue *LoMul = nullptr;
12972 SDValue *LowAddSub = nullptr;
12973
12974 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12975 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12976 return SDValue();
12977
12978 if (IsLeftOperandMUL)
12979 HiAddSub = &AddeSubeOp1;
12980 else
12981 HiAddSub = &AddeSubeOp0;
12982
12983 // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI node
12984 // whose low result is fed to the ADDC/SUBC we are checking.
12985
12986 if (AddcSubcOp0 == MULOp.getValue(0)) {
12987 LoMul = &AddcSubcOp0;
12988 LowAddSub = &AddcSubcOp1;
12989 }
12990 if (AddcSubcOp1 == MULOp.getValue(0)) {
12991 LoMul = &AddcSubcOp1;
12992 LowAddSub = &AddcSubcOp0;
12993 }
12994
12995 if (!LoMul)
12996 return SDValue();
12997
12998 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12999 // the replacement below will create a cycle.
13000 if (AddcSubcNode == HiAddSub->getNode() ||
13001 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
13002 return SDValue();
13003
13004 // Create the merged node.
13005 SelectionDAG &DAG = DCI.DAG;
13006
13007 // Start building operand list.
13008 SmallVector<SDValue, 8> Ops;
13009 Ops.push_back(LoMul->getOperand(0));
13010 Ops.push_back(LoMul->getOperand(1));
13011
13012 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
13013 // the case, we must be doing signed multiplication and only use the higher
13014 // part of the result of the MLAL; furthermore, the LowAddSub must be a
13015 // constant addition or subtraction with the value of 0x80000000.
13016 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
13017 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
13018 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
13019 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
13020 0x80000000) {
13021 Ops.push_back(*HiAddSub);
13022 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
13023 FinalOpc = ARMISD::SMMLSR;
13024 } else {
13025 FinalOpc = ARMISD::SMMLAR;
13026 }
13027 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
13028 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
13029
13030 return SDValue(AddeSubeNode, 0);
13031 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
13032 // SMMLS is generated during instruction selection and the rest of this
13033 // function can not handle the case where AddcSubcNode is a SUBC.
13034 return SDValue();
13035
13036 // Finish building the operand list for {U/S}MLAL
13037 Ops.push_back(*LowAddSub);
13038 Ops.push_back(*HiAddSub);
13039
13040 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13041 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13042
13043 // Replace the ADD nodes' uses with the MLAL node's values.
13044 SDValue HiMLALResult(MLALNode.getNode(), 1);
13045 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
13046
13047 SDValue LoMLALResult(MLALNode.getNode(), 0);
13048 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
13049
13050 // Return original node to notify the driver to stop replacing.
13051 return SDValue(AddeSubeNode, 0);
13052}
13053
13054 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
13055 TargetLowering::DAGCombinerInfo &DCI,
13056 const ARMSubtarget *Subtarget) {
13057 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13058 // While trying to combine for the other MLAL nodes, first search for the
13059 // chance to use UMAAL. Check if Addc uses a node which has already
13060 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13061 // as the addend, and it's handled in PerformUMLALCombine.
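// (UMAAL computes RdLo:RdHi = Rn * Rm + RdLo + RdHi, i.e. a 32x32->64-bit
// multiply plus two independent 32-bit addends; the checks below ensure the
// existing UMLAL leaves room for the extra addend.)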
13062
13063 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13064 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13065
13066 // Check that we have a glued ADDC node.
13067 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13068 if (AddcNode->getOpcode() != ARMISD::ADDC)
13069 return SDValue();
13070
13071 // Find the converted UMAAL or quit if it doesn't exist.
13072 SDNode *UmlalNode = nullptr;
13073 SDValue AddHi;
13074 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13075 UmlalNode = AddcNode->getOperand(0).getNode();
13076 AddHi = AddcNode->getOperand(1);
13077 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13078 UmlalNode = AddcNode->getOperand(1).getNode();
13079 AddHi = AddcNode->getOperand(0);
13080 } else {
13081 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13082 }
13083
13084 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13085 // the ADDC as well as Zero.
13086 if (!isNullConstant(UmlalNode->getOperand(3)))
13087 return SDValue();
13088
13089 if ((isNullConstant(AddeNode->getOperand(0)) &&
13090 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13091 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13092 isNullConstant(AddeNode->getOperand(1)))) {
13093 SelectionDAG &DAG = DCI.DAG;
13094 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13095 UmlalNode->getOperand(2), AddHi };
13096 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13097 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13098
13099 // Replace the ADD nodes' uses with the UMAAL node's values.
13100 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13101 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13102
13103 // Return original node to notify the driver to stop replacing.
13104 return SDValue(AddeNode, 0);
13105 }
13106 return SDValue();
13107}
13108
13109 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
13110 const ARMSubtarget *Subtarget) {
13111 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13112 return SDValue();
13113
13114 // Check that we have a pair of ADDC and ADDE as operands.
13115 // Both addends of the ADDE must be zero.
13116 SDNode* AddcNode = N->getOperand(2).getNode();
13117 SDNode* AddeNode = N->getOperand(3).getNode();
13118 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13119 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13120 isNullConstant(AddeNode->getOperand(0)) &&
13121 isNullConstant(AddeNode->getOperand(1)) &&
13122 (AddeNode->getOperand(2).getNode() == AddcNode))
13123 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13124 DAG.getVTList(MVT::i32, MVT::i32),
13125 {N->getOperand(0), N->getOperand(1),
13126 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13127 else
13128 return SDValue();
13129}
13130
13131 static SDValue PerformAddcSubcCombine(SDNode *N,
13132 TargetLowering::DAGCombinerInfo &DCI,
13133 const ARMSubtarget *Subtarget) {
13134 SelectionDAG &DAG(DCI.DAG);
13135
13136 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13137 // (SUBC (ADDE 0, 0, C), 1) -> C
13138 SDValue LHS = N->getOperand(0);
13139 SDValue RHS = N->getOperand(1);
13140 if (LHS->getOpcode() == ARMISD::ADDE &&
13141 isNullConstant(LHS->getOperand(0)) &&
13142 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13143 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13144 }
13145 }
13146
13147 if (Subtarget->isThumb1Only()) {
13148 SDValue RHS = N->getOperand(1);
13149 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13150 int32_t imm = C->getSExtValue();
13151 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13152 SDLoc DL(N);
13153 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13154 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13155 : ARMISD::ADDC;
13156 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13157 }
13158 }
13159 }
13160
13161 return SDValue();
13162}
13163
13164 static SDValue PerformAddeSubeCombine(SDNode *N,
13165 TargetLowering::DAGCombinerInfo &DCI,
13166 const ARMSubtarget *Subtarget) {
13167 if (Subtarget->isThumb1Only()) {
13168 SelectionDAG &DAG = DCI.DAG;
13169 SDValue RHS = N->getOperand(1);
13170 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
13171 int64_t imm = C->getSExtValue();
13172 if (imm < 0) {
13173 SDLoc DL(N);
13174
13175 // The with-carry-in form matches bitwise not instead of the negation.
13176 // Effectively, the inverse interpretation of the carry flag already
13177 // accounts for part of the negation.
13178 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13179
13180 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13181 : ARMISD::ADDE;
13182 return DAG.getNode(Opcode, DL, N->getVTList(),
13183 N->getOperand(0), RHS, N->getOperand(2));
13184 }
13185 }
13186 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13187 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13188 }
13189 return SDValue();
13190}
13191
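// Turn a scalar select between a value and a vector reduction of that value,
// e.g. select(setcc(x, vecreduce_umin(v), ult), x, vecreduce_umin(v)), into
// the MVE across-vector reduction nodes VMINV/VMAXV, which take the scalar
// as an extra operand.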
13192 static SDValue PerformSELECTCombine(SDNode *N,
13193 TargetLowering::DAGCombinerInfo &DCI,
13194 const ARMSubtarget *Subtarget) {
13195 if (!Subtarget->hasMVEIntegerOps())
13196 return SDValue();
13197
13198 SDLoc dl(N);
13199 SDValue SetCC;
13200 SDValue LHS;
13201 SDValue RHS;
13202 ISD::CondCode CC;
13203 SDValue TrueVal;
13204 SDValue FalseVal;
13205
13206 if (N->getOpcode() == ISD::SELECT &&
13207 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13208 SetCC = N->getOperand(0);
13209 LHS = SetCC->getOperand(0);
13210 RHS = SetCC->getOperand(1);
13211 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13212 TrueVal = N->getOperand(1);
13213 FalseVal = N->getOperand(2);
13214 } else if (N->getOpcode() == ISD::SELECT_CC) {
13215 LHS = N->getOperand(0);
13216 RHS = N->getOperand(1);
13217 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13218 TrueVal = N->getOperand(2);
13219 FalseVal = N->getOperand(3);
13220 } else {
13221 return SDValue();
13222 }
13223
13224 unsigned int Opcode = 0;
13225 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13226 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13227 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13228 Opcode = ARMISD::VMINVu;
13229 if (CC == ISD::SETUGT)
13230 std::swap(TrueVal, FalseVal);
13231 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13232 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13233 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13234 Opcode = ARMISD::VMINVs;
13235 if (CC == ISD::SETGT)
13236 std::swap(TrueVal, FalseVal);
13237 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13238 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13239 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13240 Opcode = ARMISD::VMAXVu;
13241 if (CC == ISD::SETULT)
13242 std::swap(TrueVal, FalseVal);
13243 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13244 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13245 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13246 Opcode = ARMISD::VMAXVs;
13247 if (CC == ISD::SETLT)
13248 std::swap(TrueVal, FalseVal);
13249 } else
13250 return SDValue();
13251
13252 // Normalise to the right hand side being the vector reduction
13253 switch (TrueVal->getOpcode()) {
13254 case ISD::VECREDUCE_UMIN:
13255 case ISD::VECREDUCE_SMIN:
13256 case ISD::VECREDUCE_UMAX:
13257 case ISD::VECREDUCE_SMAX:
13258 std::swap(LHS, RHS);
13259 std::swap(TrueVal, FalseVal);
13260 break;
13261 }
13262
13263 EVT VectorType = FalseVal->getOperand(0).getValueType();
13264
13265 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13266 VectorType != MVT::v4i32)
13267 return SDValue();
13268
13269 EVT VectorScalarType = VectorType.getVectorElementType();
13270
13271 // The values being selected must also be the ones being compared
13272 if (TrueVal != LHS || FalseVal != RHS)
13273 return SDValue();
13274
13275 EVT LeftType = LHS->getValueType(0);
13276 EVT RightType = RHS->getValueType(0);
13277
13278 // The types must match the reduced type too
13279 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13280 return SDValue();
13281
13282 // Legalise the scalar to an i32
13283 if (VectorScalarType != MVT::i32)
13284 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13285
13286 // Generate the reduction as an i32 for legalisation purposes
13287 auto Reduction =
13288 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13289
13290 // The result isn't actually an i32 so truncate it back to its original type
13291 if (VectorScalarType != MVT::i32)
13292 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13293
13294 return Reduction;
13295}
13296
13297// A special combine for the vqdmulh family of instructions. This is one of the
13298// potential set of patterns that could patch this instruction. The base pattern
13299// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13300// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13301// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13302// the max is unnecessary.
13303 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13304 EVT VT = N->getValueType(0);
13305 SDValue Shft;
13306 ConstantSDNode *Clamp;
13307
13308 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13309 return SDValue();
13310
13311 if (N->getOpcode() == ISD::SMIN) {
13312 Shft = N->getOperand(0);
13313 Clamp = isConstOrConstSplat(N->getOperand(1));
13314 } else if (N->getOpcode() == ISD::VSELECT) {
13315 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13316 SDValue Cmp = N->getOperand(0);
13317 if (Cmp.getOpcode() != ISD::SETCC ||
13318 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13319 Cmp.getOperand(0) != N->getOperand(1) ||
13320 Cmp.getOperand(1) != N->getOperand(2))
13321 return SDValue();
13322 Shft = N->getOperand(1);
13323 Clamp = isConstOrConstSplat(N->getOperand(2));
13324 } else
13325 return SDValue();
13326
13327 if (!Clamp)
13328 return SDValue();
13329
13330 MVT ScalarType;
13331 int ShftAmt = 0;
13332 switch (Clamp->getSExtValue()) {
13333 case (1 << 7) - 1:
13334 ScalarType = MVT::i8;
13335 ShftAmt = 7;
13336 break;
13337 case (1 << 15) - 1:
13338 ScalarType = MVT::i16;
13339 ShftAmt = 15;
13340 break;
13341 case (1ULL << 31) - 1:
13342 ScalarType = MVT::i32;
13343 ShftAmt = 31;
13344 break;
13345 default:
13346 return SDValue();
13347 }
13348
13349 if (Shft.getOpcode() != ISD::SRA)
13350 return SDValue();
13351 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13352 if (!N1 || N1->getSExtValue() != ShftAmt)
13353 return SDValue();
13354
13355 SDValue Mul = Shft.getOperand(0);
13356 if (Mul.getOpcode() != ISD::MUL)
13357 return SDValue();
13358
13359 SDValue Ext0 = Mul.getOperand(0);
13360 SDValue Ext1 = Mul.getOperand(1);
13361 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13362 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13363 return SDValue();
13364 EVT VecVT = Ext0.getOperand(0).getValueType();
13365 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13366 return SDValue();
13367 if (Ext1.getOperand(0).getValueType() != VecVT ||
13368 VecVT.getScalarType() != ScalarType ||
13369 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13370 return SDValue();
13371
13372 SDLoc DL(Mul);
13373 unsigned LegalLanes = 128 / (ShftAmt + 1);
13374 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13375 // For types smaller than legal vectors extend to be legal and only use needed
13376 // lanes.
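// For example, a v4i16 input is any-extended to v4i32, reinterpreted as the
// legal v8i16 so the VQDMULH operates on a full 128-bit vector, and the
// result is cast and truncated back to v4i16; only the originally populated
// lanes are meaningful.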
13377 if (VecVT.getSizeInBits() < 128) {
13378 EVT ExtVecVT =
13379 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13380 VecVT.getVectorNumElements());
13381 SDValue Inp0 =
13382 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13383 SDValue Inp1 =
13384 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13385 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13386 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13387 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13388 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13389 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13390 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13391 }
13392
13393 // For larger types, split into legal sized chunks.
13394 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13395 unsigned NumParts = VecVT.getSizeInBits() / 128;
13396 SmallVector<SDValue> Parts;
13397 for (unsigned I = 0; I < NumParts; ++I) {
13398 SDValue Inp0 =
13399 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13400 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13401 SDValue Inp1 =
13402 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13403 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13404 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13405 Parts.push_back(VQDMULH);
13406 }
13407 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13408 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13409}
13410
13411 static SDValue PerformVSELECTCombine(SDNode *N,
13412 TargetLowering::DAGCombinerInfo &DCI,
13413 const ARMSubtarget *Subtarget) {
13414 if (!Subtarget->hasMVEIntegerOps())
13415 return SDValue();
13416
13417 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13418 return V;
13419
13420 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13421 //
13422 // We need to re-implement this optimization here as the implementation in the
13423 // Target-Independent DAGCombiner does not handle the kind of constant we make
13424 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13425 // good reason, allowing truncation there would break other targets).
13426 //
13427 // Currently, this is only done for MVE, as it's the only target that benefits
13428 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13429 if (N->getOperand(0).getOpcode() != ISD::XOR)
13430 return SDValue();
13431 SDValue XOR = N->getOperand(0);
13432
13433 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13434 // It is important to check with truncation allowed as the BUILD_VECTORs we
13435 // generate in those situations will truncate their operands.
13436 ConstantSDNode *Const =
13437 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13438 /*AllowTruncation*/ true);
13439 if (!Const || !Const->isOne())
13440 return SDValue();
13441
13442 // Rewrite into vselect(cond, rhs, lhs).
13443 SDValue Cond = XOR->getOperand(0);
13444 SDValue LHS = N->getOperand(1);
13445 SDValue RHS = N->getOperand(2);
13446 EVT Type = N->getValueType(0);
13447 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13448}
13449
13450// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
13451 static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13452 TargetLowering::DAGCombinerInfo &DCI,
13453 const ARMSubtarget *Subtarget) {
13454 SDValue Op0 = N->getOperand(0);
13455 SDValue Op1 = N->getOperand(1);
13456 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13457 EVT VT = N->getValueType(0);
13458
13459 if (!Subtarget->hasMVEIntegerOps() ||
13460 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13461 return SDValue();
13462
13463 if (CC == ISD::SETUGE) {
13464 std::swap(Op0, Op1);
13465 CC = ISD::SETULT;
13466 }
13467
13468 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13469 Op0.getOpcode() != ISD::BUILD_VECTOR)
13470 return SDValue();
13471
13472 // Check first operand is BuildVector of 0,1,2,...
13473 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13474 if (!Op0.getOperand(I).isUndef() &&
13475 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13476 Op0.getConstantOperandVal(I) == I))
13477 return SDValue();
13478 }
13479
13480 // The second operand must be a splat; Op1S is the splatted value.
13481 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13482 if (!Op1S)
13483 return SDValue();
13484
13485 unsigned Opc;
13486 switch (VT.getVectorNumElements()) {
13487 case 2:
13488 Opc = Intrinsic::arm_mve_vctp64;
13489 break;
13490 case 4:
13491 Opc = Intrinsic::arm_mve_vctp32;
13492 break;
13493 case 8:
13494 Opc = Intrinsic::arm_mve_vctp16;
13495 break;
13496 case 16:
13497 Opc = Intrinsic::arm_mve_vctp8;
13498 break;
13499 default:
13500 return SDValue();
13501 }
13502
13503 SDLoc DL(N);
13504 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13505 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13506 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13507}
13508
13509/// PerformADDECombine - Target-specific dag combine transform from
13510/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13511/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13512 static SDValue PerformADDECombine(SDNode *N,
13513 TargetLowering::DAGCombinerInfo &DCI,
13514 const ARMSubtarget *Subtarget) {
13515 // Only ARM and Thumb2 support UMLAL/SMLAL.
13516 if (Subtarget->isThumb1Only())
13517 return PerformAddeSubeCombine(N, DCI, Subtarget);
13518
13519 // Only perform the checks after legalize when the pattern is available.
13520 if (DCI.isBeforeLegalize()) return SDValue();
13521
13522 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13523}
13524
13525/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13526/// operands N0 and N1. This is a helper for PerformADDCombine that is
13527/// called with the default operands, and if that fails, with commuted
13528/// operands.
13529 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13530 TargetLowering::DAGCombinerInfo &DCI,
13531 const ARMSubtarget *Subtarget){
13532 // Attempt to create vpadd for this add.
13533 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13534 return Result;
13535
13536 // Attempt to create vpaddl for this add.
13537 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13538 return Result;
13539 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13540 Subtarget))
13541 return Result;
13542
13543 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13544 if (N0.getNode()->hasOneUse())
13545 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13546 return Result;
13547 return SDValue();
13548}
13549
13550 static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13551 EVT VT = N->getValueType(0);
13552 SDValue N0 = N->getOperand(0);
13553 SDValue N1 = N->getOperand(1);
13554 SDLoc dl(N);
13555
13556 auto IsVecReduce = [](SDValue Op) {
13557 switch (Op.getOpcode()) {
13558 case ISD::VECREDUCE_ADD:
13559 case ARMISD::VADDVs:
13560 case ARMISD::VADDVu:
13561 case ARMISD::VMLAVs:
13562 case ARMISD::VMLAVu:
13563 return true;
13564 }
13565 return false;
13566 };
13567
13568 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13569 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13570 // add(add(X, vecreduce(Y)), vecreduce(Z))
13571 // to make better use of vaddva style instructions.
13572 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13573 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13574 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13575 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13576 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13577 }
13578 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13579 // add(add(add(A, C), reduce(B)), reduce(D))
13580 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13581 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13582 unsigned N0RedOp = 0;
13583 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13584 N0RedOp = 1;
13585 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13586 return SDValue();
13587 }
13588
13589 unsigned N1RedOp = 0;
13590 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13591 N1RedOp = 1;
13592 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13593 return SDValue();
13594
13595 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13596 N1.getOperand(1 - N1RedOp));
13597 SDValue Add1 =
13598 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13599 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13600 }
13601 return SDValue();
13602 };
13603 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13604 return R;
13605 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13606 return R;
13607
13608 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13609 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13610 // by ascending load offsets. This can help cores prefetch if the order of
13611 // loads is more predictable.
13612 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13613 // Check if two reductions are known to load data where one is before/after
13614 // another. Return negative if N0 loads data before N1, positive if N1 is
13615 // before N0 and 0 otherwise if nothing is known.
13616 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13617 // Look through to the first operand of a MUL, for the VMLA case.
13618 // Currently only looks at the first operand, in the hope they are equal.
13619 if (N0.getOpcode() == ISD::MUL)
13620 N0 = N0.getOperand(0);
13621 if (N1.getOpcode() == ISD::MUL)
13622 N1 = N1.getOperand(0);
13623
13624 // Return true if the two operands are loads to the same object and the
13625 // offset of the first is known to be less than the offset of the second.
13626 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13627 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13628 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13629 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13630 Load1->isIndexed())
13631 return 0;
13632
13633 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13634 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13635
13636 if (!BaseLocDecomp0.getBase() ||
13637 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13638 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13639 return 0;
13640 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13641 return -1;
13642 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13643 return 1;
13644 return 0;
13645 };
13646
13647 SDValue X;
13648 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13649 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13650 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13651 N0.getOperand(1).getOperand(0));
13652 if (IsBefore < 0) {
13653 X = N0.getOperand(0);
13654 N0 = N0.getOperand(1);
13655 } else if (IsBefore > 0) {
13656 X = N0.getOperand(1);
13657 N0 = N0.getOperand(0);
13658 } else
13659 return SDValue();
13660 } else if (IsVecReduce(N0.getOperand(0))) {
13661 X = N0.getOperand(1);
13662 N0 = N0.getOperand(0);
13663 } else if (IsVecReduce(N0.getOperand(1))) {
13664 X = N0.getOperand(0);
13665 N0 = N0.getOperand(1);
13666 } else
13667 return SDValue();
13668 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13669 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13670 // Note this is backward to how you would expect. We create
13671 // add(reduce(load + 16), reduce(load + 0)) so that the
13672 // add(reduce(load + 16), X) is combined into VADDVA(X, load + 16), leaving
13673 // the X as VADDV(load + 0).
13674 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13675 } else
13676 return SDValue();
13677
13678 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13679 return SDValue();
13680
13681 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13682 return SDValue();
13683
13684 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13685 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13686 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13687 };
13688 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13689 return R;
13690 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13691 return R;
13692 return SDValue();
13693}
13694
13695 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13696 const ARMSubtarget *Subtarget) {
13697 if (!Subtarget->hasMVEIntegerOps())
13698 return SDValue();
13699
13700 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13701 return R;
13702
13703 EVT VT = N->getValueType(0);
13704 SDValue N0 = N->getOperand(0);
13705 SDValue N1 = N->getOperand(1);
13706 SDLoc dl(N);
13707
13708 if (VT != MVT::i64)
13709 return SDValue();
13710
13711 // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
13712 // will look like:
13713 // t1: i32,i32 = ARMISD::VADDLVs x
13714 // t2: i64 = build_pair t1, t1:1
13715 // t3: i64 = add t2, y
13716 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13717 // the add to be simplified separately.
13718 // We also need to check for sext / zext and commutative adds.
13719 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13720 SDValue NB) {
13721 if (NB->getOpcode() != ISD::BUILD_PAIR)
13722 return SDValue();
13723 SDValue VecRed = NB->getOperand(0);
13724 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13725 VecRed.getResNo() != 0 ||
13726 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13727 return SDValue();
13728
13729 if (VecRed->getOpcode() == OpcodeA) {
13730 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13731 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13732 VecRed.getOperand(0), VecRed.getOperand(1));
13733 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13734 }
13735
13736 SmallVector<SDValue, 4> Ops(2);
13737 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13738
13739 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13740 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13741 Ops.push_back(VecRed->getOperand(I));
13742 SDValue Red =
13743 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13744 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13745 SDValue(Red.getNode(), 1));
13746 };
13747
13748 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13749 return M;
13750 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13751 return M;
13752 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13753 return M;
13754 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13755 return M;
13756 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13757 return M;
13758 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13759 return M;
13760 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13761 return M;
13762 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13763 return M;
13764 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13765 return M;
13766 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13767 return M;
13768 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13769 return M;
13770 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13771 return M;
13772 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13773 return M;
13774 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13775 return M;
13776 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13777 return M;
13778 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13779 return M;
13780 return SDValue();
13781}
13782
13783bool
13784 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13785 CombineLevel Level) const {
13786 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13787 N->getOpcode() == ISD::SRL) &&
13788 "Expected shift op");
13789
13790 SDValue ShiftLHS = N->getOperand(0);
13791 if (!ShiftLHS->hasOneUse())
13792 return false;
13793
13794 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13795 !ShiftLHS.getOperand(0)->hasOneUse())
13796 return false;
13797
13798 if (Level == BeforeLegalizeTypes)
13799 return true;
13800
13801 if (N->getOpcode() != ISD::SHL)
13802 return true;
13803
13804 if (Subtarget->isThumb1Only()) {
13805 // Avoid making expensive immediates by commuting shifts. (This logic
13806 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13807 // for free.)
13808 if (N->getOpcode() != ISD::SHL)
13809 return true;
13810 SDValue N1 = N->getOperand(0);
13811 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13812 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13813 return true;
13814 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13815 if (Const->getAPIntValue().ult(256))
13816 return false;
13817 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13818 Const->getAPIntValue().sgt(-256))
13819 return false;
13820 }
13821 return true;
13822 }
13823
13824 // Turn off commute-with-shift transform after legalization, so it doesn't
13825 // conflict with PerformSHLSimplify. (We could try to detect when
13826 // PerformSHLSimplify would trigger more precisely, but it isn't
13827 // really necessary.)
13828 return false;
13829}
13830
13831 bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13832 const SDNode *N) const {
13833 assert(N->getOpcode() == ISD::XOR &&
13834 (N->getOperand(0).getOpcode() == ISD::SHL ||
13835 N->getOperand(0).getOpcode() == ISD::SRL) &&
13836 "Expected XOR(SHIFT) pattern");
13837
13838 // Only commute if the entire NOT mask is a hidden shifted mask.
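// For example, (xor (shl x, 8), 0xFFFFFF00): the NOT mask 0xFFFFFF00 is a
// shifted mask starting at bit 8 of length 24 == BitWidth - ShiftAmt, so the
// commute is allowed.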
13839 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13840 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13841 if (XorC && ShiftC) {
13842 unsigned MaskIdx, MaskLen;
13843 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13844 unsigned ShiftAmt = ShiftC->getZExtValue();
13845 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13846 if (N->getOperand(0).getOpcode() == ISD::SHL)
13847 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13848 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13849 }
13850 }
13851
13852 return false;
13853}
13854
13855 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13856 const SDNode *N) const {
13857 assert(((N->getOpcode() == ISD::SHL &&
13858 N->getOperand(0).getOpcode() == ISD::SRL) ||
13859 (N->getOpcode() == ISD::SRL &&
13860 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13861 "Expected shift-shift mask");
13862
13863 if (!Subtarget->isThumb1Only())
13864 return true;
13865
13866 EVT VT = N->getValueType(0);
13867 if (VT.getScalarSizeInBits() > 32)
13868 return true;
13869
13870 return false;
13871}
13872
13873 bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13874 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13875 SDValue Y) const {
13876 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13877 SelectOpcode == ISD::VSELECT;
13878}
13879
13880 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13881 if (!Subtarget->hasNEON()) {
13882 if (Subtarget->isThumb1Only())
13883 return VT.getScalarSizeInBits() <= 32;
13884 return true;
13885 }
13886 return VT.isScalarInteger();
13887}
13888
13889 bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13890 EVT VT) const {
13891 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13892 return false;
13893
13894 switch (FPVT.getSimpleVT().SimpleTy) {
13895 case MVT::f16:
13896 return Subtarget->hasVFP2Base();
13897 case MVT::f32:
13898 return Subtarget->hasVFP2Base();
13899 case MVT::f64:
13900 return Subtarget->hasFP64();
13901 case MVT::v4f32:
13902 case MVT::v8f16:
13903 return Subtarget->hasMVEFloatOps();
13904 default:
13905 return false;
13906 }
13907}
13908
13909 static SDValue PerformSHLSimplify(SDNode *N,
13910 TargetLowering::DAGCombinerInfo &DCI,
13911 const ARMSubtarget *ST) {
13912 // Allow the generic combiner to identify potential bswaps.
13913 if (DCI.isBeforeLegalize())
13914 return SDValue();
13915
13916 // DAG combiner will fold:
13917 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13918 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
13919 // Other code patterns that can also be modified have the following form:
13920 // b + ((a << 1) | 510)
13921 // b + ((a << 1) & 510)
13922 // b + ((a << 1) ^ 510)
13923 // b + ((a << 1) + 510)
13924
13925 // Many instructions can perform the shift for free, but this requires both
13926 // of the operands to be registers. If c1 << c2 is too large, a mov immediate
13927 // instruction will be needed. So, unfold back to the original pattern if:
13928 // - c1 and c2 are small enough that they don't require mov imms, and
13929 // - the user(s) of the node can perform an shl.
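// For example, (add b, (or (shl a, 1), 510)) is unfolded back to
// (add b, (shl (or a, 255), 1)): 255 is cheap to materialize and the add can
// then consume the shl as a shifted-register operand.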
13930
13931 // No shifted operands for 16-bit instructions.
13932 if (ST->isThumb() && ST->isThumb1Only())
13933 return SDValue();
13934
13935 // Check that all the users could perform the shl themselves.
13936 for (auto *U : N->users()) {
13937 switch(U->getOpcode()) {
13938 default:
13939 return SDValue();
13940 case ISD::SUB:
13941 case ISD::ADD:
13942 case ISD::AND:
13943 case ISD::OR:
13944 case ISD::XOR:
13945 case ISD::SETCC:
13946 case ARMISD::CMP:
13947 // Check that the user isn't already using a constant because there
13948 // aren't any instructions that support an immediate operand and a
13949 // shifted operand.
13950 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13951 isa<ConstantSDNode>(U->getOperand(1)))
13952 return SDValue();
13953
13954 // Check that it's not already using a shift.
13955 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13956 U->getOperand(1).getOpcode() == ISD::SHL)
13957 return SDValue();
13958 break;
13959 }
13960 }
13961
13962 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13963 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13964 return SDValue();
13965
13966 if (N->getOperand(0).getOpcode() != ISD::SHL)
13967 return SDValue();
13968
13969 SDValue SHL = N->getOperand(0);
13970
13971 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13972 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13973 if (!C1ShlC2 || !C2)
13974 return SDValue();
13975
13976 APInt C2Int = C2->getAPIntValue();
13977 APInt C1Int = C1ShlC2->getAPIntValue();
13978 unsigned C2Width = C2Int.getBitWidth();
13979 if (C2Int.uge(C2Width))
13980 return SDValue();
13981 uint64_t C2Value = C2Int.getZExtValue();
13982
13983 // Check that performing a lshr will not lose any information.
13984 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13985 if ((C1Int & Mask) != C1Int)
13986 return SDValue();
13987
13988 // Shift the first constant.
13989 C1Int.lshrInPlace(C2Int);
13990
13991 // The immediates are encoded as an 8-bit value that can be rotated.
13992 auto LargeImm = [](const APInt &Imm) {
13993 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13994 return Imm.getBitWidth() - Zeros > 8;
13995 };
13996
13997 if (LargeImm(C1Int) || LargeImm(C2Int))
13998 return SDValue();
13999
14000 SelectionDAG &DAG = DCI.DAG;
14001 SDLoc dl(N);
14002 SDValue X = SHL.getOperand(0);
14003 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
14004 DAG.getConstant(C1Int, dl, MVT::i32));
14005 // Shift left to compensate for the lshr of C1Int.
14006 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
14007
14008 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
14009 SHL.dump(); N->dump());
14010 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
14011 return Res;
14012}
14013
14014
14015/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
14016///
14017 static SDValue PerformADDCombine(SDNode *N,
14018 TargetLowering::DAGCombinerInfo &DCI,
14019 const ARMSubtarget *Subtarget) {
14020 SDValue N0 = N->getOperand(0);
14021 SDValue N1 = N->getOperand(1);
14022
14023 // Only works one way, because it needs an immediate operand.
14024 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14025 return Result;
14026
14027 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
14028 return Result;
14029
14030 // First try with the default operand order.
14031 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
14032 return Result;
14033
14034 // If that didn't work, try again with the operands commuted.
14035 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
14036}
14037
14038// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14039// providing -X is as cheap as X (currently, just a constant).
14040 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
14041 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14042 return SDValue();
14043 SDValue CSINC = N->getOperand(1);
14044 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14045 return SDValue();
14046
14047 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
14048 if (!X)
14049 return SDValue();
14050
14051 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14052 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14053 CSINC.getOperand(0)),
14054 CSINC.getOperand(1), CSINC.getOperand(2),
14055 CSINC.getOperand(3));
14056}
14057
14058 static bool isNegatedInteger(SDValue Op) {
14059 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
14060}
14061
14062// Try to fold
14063//
14064// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
14065//
14066// The folding helps cmov to be matched with csneg without generating
14067// redundant neg instruction.
14068 static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG) {
14069 if (!isNegatedInteger(SDValue(N, 0)))
14070 return SDValue();
14071
14072 SDValue CMov = N->getOperand(1);
14073 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
14074 return SDValue();
14075
14076 SDValue N0 = CMov.getOperand(0);
14077 SDValue N1 = CMov.getOperand(1);
14078
14079 // If neither of them are negations, it's not worth the folding as it
14080 // introduces two additional negations while reducing one negation.
14081 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
14082 return SDValue();
14083
14084 SDLoc DL(N);
14085 EVT VT = CMov.getValueType();
14086
14087 SDValue N0N = DAG.getNegative(N0, DL, VT);
14088 SDValue N1N = DAG.getNegative(N1, DL, VT);
14089 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
14090 CMov.getOperand(3));
14091}
14092
14093/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14094///
14095 static SDValue PerformSUBCombine(SDNode *N,
14096 TargetLowering::DAGCombinerInfo &DCI,
14097 const ARMSubtarget *Subtarget) {
14098 SDValue N0 = N->getOperand(0);
14099 SDValue N1 = N->getOperand(1);
14100
14101 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14102 if (N1.getNode()->hasOneUse())
14103 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14104 return Result;
14105
14106 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14107 return R;
14108
14109 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
14110 return Val;
14111
14112 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14113 return SDValue();
14114
14115 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14116 // so that we can readily pattern match more mve instructions which can use
14117 // a scalar operand.
14118 SDValue VDup = N->getOperand(1);
14119 if (VDup->getOpcode() != ARMISD::VDUP)
14120 return SDValue();
14121
14122 SDValue VMov = N->getOperand(0);
14123 if (VMov->getOpcode() == ISD::BITCAST)
14124 VMov = VMov->getOperand(0);
14125
14126 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14127 return SDValue();
14128
14129 SDLoc dl(N);
14130 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14131 DCI.DAG.getConstant(0, dl, MVT::i32),
14132 VDup->getOperand(0));
14133 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14134}
14135
14136/// PerformVMULCombine
14137/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14138/// special multiplier accumulator forwarding.
14139/// vmul d3, d0, d2
14140/// vmla d3, d1, d2
14141/// is faster than
14142/// vadd d3, d0, d1
14143/// vmul d3, d3, d2
14144// However, for (A + B) * (A + B),
14145// vadd d2, d0, d1
14146// vmul d3, d0, d2
14147// vmla d3, d1, d2
14148// is slower than
14149// vadd d2, d0, d1
14150// vmul d3, d2, d2
14151 static SDValue PerformVMULCombine(SDNode *N,
14152 TargetLowering::DAGCombinerInfo &DCI,
14153 const ARMSubtarget *Subtarget) {
14154 if (!Subtarget->hasVMLxForwarding())
14155 return SDValue();
14156
14157 SelectionDAG &DAG = DCI.DAG;
14158 SDValue N0 = N->getOperand(0);
14159 SDValue N1 = N->getOperand(1);
14160 unsigned Opcode = N0.getOpcode();
14161 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14162 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14163 Opcode = N1.getOpcode();
14164 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14165 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14166 return SDValue();
14167 std::swap(N0, N1);
14168 }
14169
14170 if (N0 == N1)
14171 return SDValue();
14172
14173 EVT VT = N->getValueType(0);
14174 SDLoc DL(N);
14175 SDValue N00 = N0->getOperand(0);
14176 SDValue N01 = N0->getOperand(1);
14177 return DAG.getNode(Opcode, DL, VT,
14178 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14179 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14180}
14181
14182 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14183 const ARMSubtarget *Subtarget) {
14184 EVT VT = N->getValueType(0);
14185 if (VT != MVT::v2i64)
14186 return SDValue();
14187
14188 SDValue N0 = N->getOperand(0);
14189 SDValue N1 = N->getOperand(1);
14190
14191 auto IsSignExt = [&](SDValue Op) {
14192 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14193 return SDValue();
14194 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14195 if (VT.getScalarSizeInBits() == 32)
14196 return Op->getOperand(0);
14197 return SDValue();
14198 };
14199 auto IsZeroExt = [&](SDValue Op) {
14200 // Zero extends are a little more awkward. At the point we are matching
14201 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14202 // That might be before or after a bitcast depending on how the and is
14203 // placed. Because this has to look through bitcasts, it is currently only
14204 // supported on LE.
14205 if (!Subtarget->isLittle())
14206 return SDValue();
14207
14208 SDValue And = Op;
14209 if (And->getOpcode() == ISD::BITCAST)
14210 And = And->getOperand(0);
14211 if (And->getOpcode() != ISD::AND)
14212 return SDValue();
14213 SDValue Mask = And->getOperand(1);
14214 if (Mask->getOpcode() == ISD::BITCAST)
14215 Mask = Mask->getOperand(0);
14216
14217 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14218 Mask.getValueType() != MVT::v4i32)
14219 return SDValue();
14220 if (isAllOnesConstant(Mask->getOperand(0)) &&
14221 isNullConstant(Mask->getOperand(1)) &&
14222 isAllOnesConstant(Mask->getOperand(2)) &&
14223 isNullConstant(Mask->getOperand(3)))
14224 return And->getOperand(0);
14225 return SDValue();
14226 };
14227
14228 SDLoc dl(N);
14229 if (SDValue Op0 = IsSignExt(N0)) {
14230 if (SDValue Op1 = IsSignExt(N1)) {
14231 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14232 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14233 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14234 }
14235 }
14236 if (SDValue Op0 = IsZeroExt(N0)) {
14237 if (SDValue Op1 = IsZeroExt(N1)) {
14238 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14239 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14240 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14241 }
14242 }
14243
14244 return SDValue();
14245}
14246
14247 static SDValue PerformMULCombine(SDNode *N,
14248 TargetLowering::DAGCombinerInfo &DCI,
14249 const ARMSubtarget *Subtarget) {
14250 SelectionDAG &DAG = DCI.DAG;
14251
14252 EVT VT = N->getValueType(0);
14253 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14254 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14255
14256 if (Subtarget->isThumb1Only())
14257 return SDValue();
14258
14259 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14260 return SDValue();
14261
14262 if (VT.is64BitVector() || VT.is128BitVector())
14263 return PerformVMULCombine(N, DCI, Subtarget);
14264 if (VT != MVT::i32)
14265 return SDValue();
14266
14267 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14268 if (!C)
14269 return SDValue();
14270
14271 int64_t MulAmt = C->getSExtValue();
14272 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14273
14274 ShiftAmt = ShiftAmt & (32 - 1);
14275 SDValue V = N->getOperand(0);
14276 SDLoc DL(N);
14277
14278 SDValue Res;
14279 MulAmt >>= ShiftAmt;
14280
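// For example, MulAmt == 20: ShiftAmt == 2 and the reduced MulAmt == 5 ==
// 2^2 + 1, so the multiply becomes (shl (add (shl x, 2), x), 2).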
14281 if (MulAmt >= 0) {
14282 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14283 // (mul x, 2^N + 1) => (add (shl x, N), x)
14284 Res = DAG.getNode(ISD::ADD, DL, VT,
14285 V,
14286 DAG.getNode(ISD::SHL, DL, VT,
14287 V,
14288 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14289 MVT::i32)));
14290 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14291 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14292 Res = DAG.getNode(ISD::SUB, DL, VT,
14293 DAG.getNode(ISD::SHL, DL, VT,
14294 V,
14295 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14296 MVT::i32)),
14297 V);
14298 } else
14299 return SDValue();
14300 } else {
14301 uint64_t MulAmtAbs = -MulAmt;
14302 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14303 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14304 Res = DAG.getNode(ISD::SUB, DL, VT,
14305 V,
14306 DAG.getNode(ISD::SHL, DL, VT,
14307 V,
14308 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14309 MVT::i32)));
14310 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14311 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14312 Res = DAG.getNode(ISD::ADD, DL, VT,
14313 V,
14314 DAG.getNode(ISD::SHL, DL, VT,
14315 V,
14316 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14317 MVT::i32)));
14318 Res = DAG.getNode(ISD::SUB, DL, VT,
14319 DAG.getConstant(0, DL, MVT::i32), Res);
14320 } else
14321 return SDValue();
14322 }
14323
14324 if (ShiftAmt != 0)
14325 Res = DAG.getNode(ISD::SHL, DL, VT,
14326 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14327
14328 // Do not add new nodes to DAG combiner worklist.
14329 DCI.CombineTo(N, Res, false);
14330 return SDValue();
14331}
14332
14335 const ARMSubtarget *Subtarget) {
14336 // Allow DAGCombine to pattern-match before we touch the canonical form.
14337 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14338 return SDValue();
14339
14340 if (N->getValueType(0) != MVT::i32)
14341 return SDValue();
14342
14343 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14344 if (!N1C)
14345 return SDValue();
14346
14347 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14348 // Don't transform uxtb/uxth.
14349 if (C1 == 255 || C1 == 65535)
14350 return SDValue();
14351
14352 SDNode *N0 = N->getOperand(0).getNode();
14353 if (!N0->hasOneUse())
14354 return SDValue();
14355
14356 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14357 return SDValue();
14358
14359 bool LeftShift = N0->getOpcode() == ISD::SHL;
14360
14361 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14362 if (!N01C)
14363 return SDValue();
14364
14365 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14366 if (!C2 || C2 >= 32)
14367 return SDValue();
14368
14369 // Clear irrelevant bits in the mask.
14370 if (LeftShift)
14371 C1 &= (-1U << C2);
14372 else
14373 C1 &= (-1U >> C2);
14374
14375 SelectionDAG &DAG = DCI.DAG;
14376 SDLoc DL(N);
14377
14378 // We have a pattern of the form "(and (shl x, c2) c1)" or
14379 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14380 // transform to a pair of shifts, to save materializing c1.
14381
14382 // First pattern: right shift, then mask off leading bits.
14383 // FIXME: Use demanded bits?
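 // For example, (and (srl x, 4), 0x00ffffff): the mask has C3 = 8 leading
 // zero bits and C2 = 4 < C3, so this becomes (srl (shl x, 4), 8).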
14384 if (!LeftShift && isMask_32(C1)) {
14385 uint32_t C3 = llvm::countl_zero(C1);
14386 if (C2 < C3) {
14387 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14388 DAG.getConstant(C3 - C2, DL, MVT::i32));
14389 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14390 DAG.getConstant(C3, DL, MVT::i32));
14391 }
14392 }
14393
14394 // First pattern, reversed: left shift, then mask off trailing bits.
14395 if (LeftShift && isMask_32(~C1)) {
14396 uint32_t C3 = llvm::countr_zero(C1);
14397 if (C2 < C3) {
14398 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14399 DAG.getConstant(C3 - C2, DL, MVT::i32));
14400 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14401 DAG.getConstant(C3, DL, MVT::i32));
14402 }
14403 }
14404
14405 // Second pattern: left shift, then mask off leading bits.
14406 // FIXME: Use demanded bits?
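 // For example, (and (shl x, 4), 0x00fffff0): Trailing = 4 == C2 and C3 = 8,
 // so this becomes (srl (shl x, 12), 8).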
14407 if (LeftShift && isShiftedMask_32(C1)) {
14408 uint32_t Trailing = llvm::countr_zero(C1);
14409 uint32_t C3 = llvm::countl_zero(C1);
14410 if (Trailing == C2 && C2 + C3 < 32) {
14411 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14412 DAG.getConstant(C2 + C3, DL, MVT::i32));
14413 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14414 DAG.getConstant(C3, DL, MVT::i32));
14415 }
14416 }
14417
14418 // Second pattern, reversed: right shift, then mask off trailing bits.
14419 // FIXME: Handle other patterns of known/demanded bits.
14420 if (!LeftShift && isShiftedMask_32(C1)) {
14421 uint32_t Leading = llvm::countl_zero(C1);
14422 uint32_t C3 = llvm::countr_zero(C1);
14423 if (Leading == C2 && C2 + C3 < 32) {
14424 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14425 DAG.getConstant(C2 + C3, DL, MVT::i32));
14426 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14427 DAG.getConstant(C3, DL, MVT::i32));
14428 }
14429 }
14430
14431 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14432 // if "c1 >> c2" is a cheaper immediate than "c1"
14433 if (LeftShift &&
14434 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14435
14436 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14437 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14438 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14439 DAG.getConstant(C2, DL, MVT::i32));
14440 }
14441
14442 return SDValue();
14443}
14444
14447 const ARMSubtarget *Subtarget) {
14448 // Attempt to use immediate-form VBIC
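 // For example, (and x, splat(0xffffff00)) clears the low byte of each 32-bit
 // lane; since ~0xffffff00 = 0xff is encodable as a modified immediate, it
 // can be emitted as a VBIC with that immediate.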
14449 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14450 SDLoc dl(N);
14451 EVT VT = N->getValueType(0);
14452 SelectionDAG &DAG = DCI.DAG;
14453
14454 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14455 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14456 return SDValue();
14457
14458 APInt SplatBits, SplatUndef;
14459 unsigned SplatBitSize;
14460 bool HasAnyUndefs;
14461 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14462 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14463 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14464 SplatBitSize == 64) {
14465 EVT VbicVT;
14466 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14467 SplatUndef.getZExtValue(), SplatBitSize,
14468 DAG, dl, VbicVT, VT, OtherModImm);
14469 if (Val.getNode()) {
14470 SDValue Input =
14471 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14472 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14473 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14474 }
14475 }
14476 }
14477
14478 if (!Subtarget->isThumb1Only()) {
14479 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14480 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14481 return Result;
14482
14483 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14484 return Result;
14485 }
14486
14487 if (Subtarget->isThumb1Only())
14488 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14489 return Result;
14490
14491 return SDValue();
14492}
14493
14494// Try combining OR nodes to SMULWB, SMULWT.
14497 const ARMSubtarget *Subtarget) {
14498 if (!Subtarget->hasV6Ops() ||
14499 (Subtarget->isThumb() &&
14500 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14501 return SDValue();
14502
14503 SDValue SRL = OR->getOperand(0);
14504 SDValue SHL = OR->getOperand(1);
14505
14506 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14507 SRL = OR->getOperand(1);
14508 SHL = OR->getOperand(0);
14509 }
14510 if (!isSRL16(SRL) || !isSHL16(SHL))
14511 return SDValue();
14512
14513 // The first operands to the shifts need to be the two results from the
14514 // same smul_lohi node.
14515 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14516 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14517 return SDValue();
14518
14519 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14520 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14521 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14522 return SDValue();
14523
14524 // Now we have:
14525 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
14526 // For SMULW[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
14527 // For SMULWB the 16-bit value will be sign-extended somehow.
14528 // For SMULWT only the SRA is required.
14529 // Check both sides of SMUL_LOHI.
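 // SMULW[B|T] computes bits [47:16] of the 48-bit product of a 32-bit value
 // and a sign-extended 16-bit value, which is exactly what this srl/shl pair
 // reconstructs from the two smul_lohi halves.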
14530 SDValue OpS16 = SMULLOHI->getOperand(0);
14531 SDValue OpS32 = SMULLOHI->getOperand(1);
14532
14533 SelectionDAG &DAG = DCI.DAG;
14534 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14535 OpS16 = OpS32;
14536 OpS32 = SMULLOHI->getOperand(0);
14537 }
14538
14539 SDLoc dl(OR);
14540 unsigned Opcode = 0;
14541 if (isS16(OpS16, DAG))
14542 Opcode = ARMISD::SMULWB;
14543 else if (isSRA16(OpS16)) {
14544 Opcode = ARMISD::SMULWT;
14545 OpS16 = OpS16->getOperand(0);
14546 }
14547 else
14548 return SDValue();
14549
14550 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14551 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14552 return SDValue(OR, 0);
14553}
14554
14557 const ARMSubtarget *Subtarget) {
14558 // BFI is only available on V6T2+
14559 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14560 return SDValue();
14561
14562 EVT VT = N->getValueType(0);
14563 SDValue N0 = N->getOperand(0);
14564 SDValue N1 = N->getOperand(1);
14565 SelectionDAG &DAG = DCI.DAG;
14566 SDLoc DL(N);
14567 // 1) or (and A, mask), val => ARMbfi A, val, mask
14568 // iff (val & mask) == val
14569 //
14570 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14571 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14572 // && mask == ~mask2
14573 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14574 // && ~mask == mask2
14575 // (i.e., copy a bitfield value into another bitfield of the same width)
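 // For example (case 1): or (and A, 0xffff00ff), 0x00003400 becomes
 // ARMbfi A, 0x34, 0xffff00ff, i.e. insert 0x34 into bits 8-15 of A.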
14576
14577 if (VT != MVT::i32)
14578 return SDValue();
14579
14580 SDValue N00 = N0.getOperand(0);
14581
14582 // The value and the mask need to be constants so we can verify this is
14583 // actually a bitfield set. If the mask is 0xffff, we can do better
14584 // via a movt instruction, so don't use BFI in that case.
14585 SDValue MaskOp = N0.getOperand(1);
14586 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14587 if (!MaskC)
14588 return SDValue();
14589 unsigned Mask = MaskC->getZExtValue();
14590 if (Mask == 0xffff)
14591 return SDValue();
14592 SDValue Res;
14593 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14594 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14595 if (N1C) {
14596 unsigned Val = N1C->getZExtValue();
14597 if ((Val & ~Mask) != Val)
14598 return SDValue();
14599
14600 if (ARM::isBitFieldInvertedMask(Mask)) {
14601 Val >>= llvm::countr_zero(~Mask);
14602
14603 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14604 DAG.getConstant(Val, DL, MVT::i32),
14605 DAG.getConstant(Mask, DL, MVT::i32));
14606
14607 DCI.CombineTo(N, Res, false);
14608 // Return value from the original node to inform the combiner that N is
14609 // now dead.
14610 return SDValue(N, 0);
14611 }
14612 } else if (N1.getOpcode() == ISD::AND) {
14613 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14615 if (!N11C)
14616 return SDValue();
14617 unsigned Mask2 = N11C->getZExtValue();
14618
14619 // Mask and ~Mask2 (or the reverse) must be equivalent for the BFI
14620 // pattern to match as-is.
14621 if (ARM::isBitFieldInvertedMask(Mask) &&
14622 (Mask == ~Mask2)) {
14623 // The pack halfword instruction works better for masks that fit it,
14624 // so use that when it's available.
14625 if (Subtarget->hasDSP() &&
14626 (Mask == 0xffff || Mask == 0xffff0000))
14627 return SDValue();
14628 // 2a
14629 unsigned amt = llvm::countr_zero(Mask2);
14630 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14631 DAG.getConstant(amt, DL, MVT::i32));
14632 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14633 DAG.getConstant(Mask, DL, MVT::i32));
14634 DCI.CombineTo(N, Res, false);
14635 // Return value from the original node to inform the combiner that N is
14636 // now dead.
14637 return SDValue(N, 0);
14638 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14639 (~Mask == Mask2)) {
14640 // The pack halfword instruction works better for masks that fit it,
14641 // so use that when it's available.
14642 if (Subtarget->hasDSP() &&
14643 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14644 return SDValue();
14645 // 2b
14646 unsigned lsb = llvm::countr_zero(Mask);
14647 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14648 DAG.getConstant(lsb, DL, MVT::i32));
14649 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14650 DAG.getConstant(Mask2, DL, MVT::i32));
14651 DCI.CombineTo(N, Res, false);
14652 // Return value from the original node to inform the combiner that N is
14653 // now dead.
14654 return SDValue(N, 0);
14655 }
14656 }
14657
14658 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14659 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14661 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14662 // where lsb(mask) == #shamt and masked bits of B are known zero.
14663 SDValue ShAmt = N00.getOperand(1);
14664 unsigned ShAmtC = ShAmt->getAsZExtVal();
14665 unsigned LSB = llvm::countr_zero(Mask);
14666 if (ShAmtC != LSB)
14667 return SDValue();
14668
14669 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14670 DAG.getConstant(~Mask, DL, MVT::i32));
14671
14672 DCI.CombineTo(N, Res, false);
14673 // Return value from the original node to inform the combiner that N is
14674 // now dead.
14675 return SDValue(N, 0);
14676 }
14677
14678 return SDValue();
14679}
14680
14681static bool isValidMVECond(unsigned CC, bool IsFloat) {
14682 switch (CC) {
14683 case ARMCC::EQ:
14684 case ARMCC::NE:
14685 case ARMCC::LE:
14686 case ARMCC::GT:
14687 case ARMCC::GE:
14688 case ARMCC::LT:
14689 return true;
14690 case ARMCC::HS:
14691 case ARMCC::HI:
14692 return !IsFloat;
14693 default:
14694 return false;
14695 };
14696}
14697
14699 if (N->getOpcode() == ARMISD::VCMP)
14700 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14701 else if (N->getOpcode() == ARMISD::VCMPZ)
14702 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14703 else
14704 llvm_unreachable("Not a VCMP/VCMPZ!");
14705}
14706
14709 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14710}
14711
14713 const ARMSubtarget *Subtarget) {
14714 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14715 // together with predicates
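 // By De Morgan, (or A, B) == vpnot (and (vpnot A, vpnot B)); at least one of
 // the inner vpnots folds directly into an inverted VCMP condition, which is
 // why an invertible operand is required below.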
14716 EVT VT = N->getValueType(0);
14717 SDLoc DL(N);
14718 SDValue N0 = N->getOperand(0);
14719 SDValue N1 = N->getOperand(1);
14720
14721 auto IsFreelyInvertable = [&](SDValue V) {
14722 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14723 return CanInvertMVEVCMP(V);
14724 return false;
14725 };
14726
14727 // At least one operand must be freely invertible.
14728 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14729 return SDValue();
14730
14731 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14732 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14733 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14734 return DAG.getLogicalNOT(DL, And, VT);
14735}
14736
14737/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14740 const ARMSubtarget *Subtarget) {
14741 // Attempt to use immediate-form VORR
14742 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14743 SDLoc dl(N);
14744 EVT VT = N->getValueType(0);
14745 SelectionDAG &DAG = DCI.DAG;
14746
14747 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14748 return SDValue();
14749
14750 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14751 VT == MVT::v8i1 || VT == MVT::v16i1))
14752 return PerformORCombine_i1(N, DAG, Subtarget);
14753
14754 APInt SplatBits, SplatUndef;
14755 unsigned SplatBitSize;
14756 bool HasAnyUndefs;
14757 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14758 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14759 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14760 SplatBitSize == 64) {
14761 EVT VorrVT;
14762 SDValue Val =
14763 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14764 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14765 if (Val.getNode()) {
14766 SDValue Input =
14767 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14768 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14769 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14770 }
14771 }
14772 }
14773
14774 if (!Subtarget->isThumb1Only()) {
14775 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14776 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14777 return Result;
14778 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14779 return Result;
14780 }
14781
14782 SDValue N0 = N->getOperand(0);
14783 SDValue N1 = N->getOperand(1);
14784
14785 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14786 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14788
14789 // The code below optimizes (or (and X, Y), Z).
14790 // The AND operand needs to have a single user to make these optimizations
14791 // profitable.
14792 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14793 return SDValue();
14794
14795 APInt SplatUndef;
14796 unsigned SplatBitSize;
14797 bool HasAnyUndefs;
14798
14799 APInt SplatBits0, SplatBits1;
14802 // Ensure that the second operand of both ands are constants
14803 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14804 HasAnyUndefs) && !HasAnyUndefs) {
14805 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14806 HasAnyUndefs) && !HasAnyUndefs) {
14807 // Ensure that the bit width of the constants are the same and that
14808 // the splat arguments are logical inverses as per the pattern we
14809 // are trying to simplify.
14810 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14811 SplatBits0 == ~SplatBits1) {
14812 // Canonicalize the vector type to make instruction selection
14813 // simpler.
14814 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14815 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14816 N0->getOperand(1),
14817 N0->getOperand(0),
14818 N1->getOperand(0));
14819 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14820 }
14821 }
14822 }
14823 }
14824
14825 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14826 // reasonable.
14827 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14828 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14829 return Res;
14830 }
14831
14832 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14833 return Result;
14834
14835 return SDValue();
14836}
14837
14840 const ARMSubtarget *Subtarget) {
14841 EVT VT = N->getValueType(0);
14842 SelectionDAG &DAG = DCI.DAG;
14843
14844 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14845 return SDValue();
14846
14847 if (!Subtarget->isThumb1Only()) {
14848 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14849 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14850 return Result;
14851
14852 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14853 return Result;
14854 }
14855
14856 if (Subtarget->hasMVEIntegerOps()) {
14857 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14858 SDValue N0 = N->getOperand(0);
14859 SDValue N1 = N->getOperand(1);
14860 const TargetLowering *TLI = Subtarget->getTargetLowering();
14861 if (TLI->isConstTrueVal(N1) &&
14862 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14863 if (CanInvertMVEVCMP(N0)) {
14864 SDLoc DL(N0);
14866
14868 Ops.push_back(N0->getOperand(0));
14869 if (N0->getOpcode() == ARMISD::VCMP)
14870 Ops.push_back(N0->getOperand(1));
14871 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14872 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14873 }
14874 }
14875 }
14876
14877 return SDValue();
14878}
14879
14880// ParseBFI - Given a BFI instruction in N, extract the "from" value (Rn) and return it,
14881// filling in FromMask and ToMask with the (consecutive) bits in "from" to be extracted and
14882// their position in "to" (Rd).
14883static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14884 assert(N->getOpcode() == ARMISD::BFI);
14885
14886 SDValue From = N->getOperand(1);
14887 ToMask = ~N->getConstantOperandAPInt(2);
14888 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14889
14890 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14891 // #C in the base of the SHR.
14892 if (From->getOpcode() == ISD::SRL &&
14893 isa<ConstantSDNode>(From->getOperand(1))) {
14894 APInt Shift = From->getConstantOperandAPInt(1);
14895 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14896 FromMask <<= Shift.getLimitedValue(31);
14897 From = From->getOperand(0);
14898 }
14899
14900 return From;
14901}
14902
14903// If A and B each contain one contiguous run of set bits, does A | B form a
14904// single contiguous run with A's bits sitting immediately above B's?
14905// Neither A nor B may be zero.
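// For example, A = 0b1100 and B = 0b0011 concatenate properly: A's lowest set
// bit (2) is exactly one above B's highest set bit (1), so A | B = 0b1111.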
14906static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14907 unsigned LastActiveBitInA = A.countr_zero();
14908 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14909 return LastActiveBitInA - 1 == FirstActiveBitInB;
14910}
14911
14913 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14914 APInt ToMask, FromMask;
14915 SDValue From = ParseBFI(N, ToMask, FromMask);
14916 SDValue To = N->getOperand(0);
14917
14918 SDValue V = To;
14919 if (V.getOpcode() != ARMISD::BFI)
14920 return SDValue();
14921
14922 APInt NewToMask, NewFromMask;
14923 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14924 if (NewFrom != From)
14925 return SDValue();
14926
14927 // Do the written bits conflict with any we've seen so far?
14928 if ((NewToMask & ToMask).getBoolValue())
14929 // Conflicting bits.
14930 return SDValue();
14931
14932 // Are the new bits contiguous when combined with the old bits?
14933 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14934 BitsProperlyConcatenate(FromMask, NewFromMask))
14935 return V;
14936 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14937 BitsProperlyConcatenate(NewFromMask, FromMask))
14938 return V;
14939
14940 return SDValue();
14941}
14942
14944 SDValue N0 = N->getOperand(0);
14945 SDValue N1 = N->getOperand(1);
14946
14947 if (N1.getOpcode() == ISD::AND) {
14948 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14949 // the bits being cleared by the AND are not demanded by the BFI.
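 // For example, with Mask2 = 0xffff00ff the BFI only reads the low 8 bits of
 // B, so an AND with Mask1 = 0xff (which keeps those bits) is redundant.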
14950 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14951 if (!N11C)
14952 return SDValue();
14953 unsigned InvMask = N->getConstantOperandVal(2);
14954 unsigned LSB = llvm::countr_zero(~InvMask);
14955 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14956 assert(Width <
14957 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14958 "undefined behavior");
14959 unsigned Mask = (1u << Width) - 1;
14960 unsigned Mask2 = N11C->getZExtValue();
14961 if ((Mask & (~Mask2)) == 0)
14962 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14963 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14964 return SDValue();
14965 }
14966
14967 // Look for another BFI to combine with.
14968 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14969 // We've found a BFI.
14970 APInt ToMask1, FromMask1;
14971 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14972
14973 APInt ToMask2, FromMask2;
14974 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14975 assert(From1 == From2);
14976 (void)From2;
14977
14978 // Create a new BFI, combining the two together.
14979 APInt NewFromMask = FromMask1 | FromMask2;
14980 APInt NewToMask = ToMask1 | ToMask2;
14981
14982 EVT VT = N->getValueType(0);
14983 SDLoc dl(N);
14984
14985 if (NewFromMask[0] == 0)
14986 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14987 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14988 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14989 DAG.getConstant(~NewToMask, dl, VT));
14990 }
14991
14992 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14993 // that lower bit insertions are performed first, provided that M1 and M2
14994 // do not overlap. This can allow multiple BFI instructions to be combined
14995 // together by the other folds above.
14996 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14997 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14998 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14999
15000 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
15001 ToMask1.countl_zero() < ToMask2.countl_zero())
15002 return SDValue();
15003
15004 EVT VT = N->getValueType(0);
15005 SDLoc dl(N);
15006 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
15007 N->getOperand(1), N->getOperand(2));
15008 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
15009 N0.getOperand(2));
15010 }
15011
15012 return SDValue();
15013}
15014
15015// Check that N is CMPZ(CSINC(0, 0, CC, X)),
15016// or CMPZ(CMOV(1, 0, CC, X))
15017// and return X if valid.
15019 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
15020 return SDValue();
15021 SDValue CSInc = Cmp->getOperand(0);
15022
15023 // Ignore any `And 1` nodes that may not yet have been removed. We are
15024 // looking for a value that produces 1/0, so these have no effect on the
15025 // code.
15026 while (CSInc.getOpcode() == ISD::AND &&
15027 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
15028 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
15029 CSInc = CSInc.getOperand(0);
15030
15031 if (CSInc.getOpcode() == ARMISD::CSINC &&
15032 isNullConstant(CSInc.getOperand(0)) &&
15033 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15035 return CSInc.getOperand(3);
15036 }
15037 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15038 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15040 return CSInc.getOperand(3);
15041 }
15042 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15043 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15046 return CSInc.getOperand(3);
15047 }
15048 return SDValue();
15049}
15050
15052 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15053 // t92: flags = ARMISD::CMPZ t74, 0
15054 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15055 // t96: flags = ARMISD::CMPZ t93, 0
15056 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15058 if (SDValue C = IsCMPZCSINC(N, Cond))
15059 if (Cond == ARMCC::EQ)
15060 return C;
15061 return SDValue();
15062}
15063
15065 // Fold away an unnecessary CMPZ/CSINC
15066 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15067 // if C1==EQ -> CSXYZ A, B, C2, D
15068 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15070 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15071 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15072 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15073 N->getOperand(1),
15074 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15075 if (N->getConstantOperandVal(2) == ARMCC::NE)
15076 return DAG.getNode(
15077 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15078 N->getOperand(1),
15080 }
15081 return SDValue();
15082}
15083
15084/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15085/// ARMISD::VMOVRRD.
15088 const ARMSubtarget *Subtarget) {
15089 // vmovrrd(vmovdrr x, y) -> x,y
15090 SDValue InDouble = N->getOperand(0);
15091 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15092 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15093
15094 // vmovrrd(load f64) -> (load i32), (load i32)
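 // The f64 load is split into two i32 loads at offsets 0 and 4 so that each
 // half lands directly in a GPR; the two results are swapped below on
 // big-endian targets to preserve the VMOVRRD result order.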
15095 SDNode *InNode = InDouble.getNode();
15096 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15097 InNode->getValueType(0) == MVT::f64 &&
15098 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15099 !cast<LoadSDNode>(InNode)->isVolatile()) {
15100 // TODO: Should this be done for non-FrameIndex operands?
15101 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15102
15103 SelectionDAG &DAG = DCI.DAG;
15104 SDLoc DL(LD);
15105 SDValue BasePtr = LD->getBasePtr();
15106 SDValue NewLD1 =
15107 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15108 LD->getAlign(), LD->getMemOperand()->getFlags());
15109
15110 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15111 DAG.getConstant(4, DL, MVT::i32));
15112
15113 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15114 LD->getPointerInfo().getWithOffset(4),
15115 commonAlignment(LD->getAlign(), 4),
15116 LD->getMemOperand()->getFlags());
15117
15118 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15119 if (DCI.DAG.getDataLayout().isBigEndian())
15120 std::swap (NewLD1, NewLD2);
15121 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15122 return Result;
15123 }
15124
15125 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15126 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15127 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15128 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15129 SDValue BV = InDouble.getOperand(0);
15130 // Look through any nop bitcasts and vector_reg_casts. Bitcasts may
15131 // change the lane order under big endian.
15132 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15133 while (
15134 (BV.getOpcode() == ISD::BITCAST ||
15136 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15137 BVSwap = BV.getOpcode() == ISD::BITCAST;
15138 BV = BV.getOperand(0);
15139 }
15140 if (BV.getValueType() != MVT::v4i32)
15141 return SDValue();
15142
15143 // Handle buildvectors, pulling out the correct lane depending on
15144 // endianness.
15145 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15146 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15147 SDValue Op0 = BV.getOperand(Offset);
15148 SDValue Op1 = BV.getOperand(Offset + 1);
15149 if (!Subtarget->isLittle() && BVSwap)
15150 std::swap(Op0, Op1);
15151
15152 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15153 }
15154
15155 // A chain of insert_vectors, grabbing the correct value of the chain of
15156 // inserts.
15157 SDValue Op0, Op1;
15158 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15159 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15160 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15161 Op0 = BV.getOperand(1);
15162 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15163 Op1 = BV.getOperand(1);
15164 }
15165 BV = BV.getOperand(0);
15166 }
15167 if (!Subtarget->isLittle() && BVSwap)
15168 std::swap(Op0, Op1);
15169 if (Op0 && Op1)
15170 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15171 }
15172
15173 return SDValue();
15174}
15175
15176/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15177/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15179 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15180 SDValue Op0 = N->getOperand(0);
15181 SDValue Op1 = N->getOperand(1);
15182 if (Op0.getOpcode() == ISD::BITCAST)
15183 Op0 = Op0.getOperand(0);
15184 if (Op1.getOpcode() == ISD::BITCAST)
15185 Op1 = Op1.getOperand(0);
15186 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15187 Op0.getNode() == Op1.getNode() &&
15188 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15189 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15190 N->getValueType(0), Op0.getOperand(0));
15191 return SDValue();
15192}
15193
15196 SDValue Op0 = N->getOperand(0);
15197
15198 // VMOVhr (VMOVrh (X)) -> X
15199 if (Op0->getOpcode() == ARMISD::VMOVrh)
15200 return Op0->getOperand(0);
15201
15202 // FullFP16: half values are passed in S-registers, and we don't
15203 // need any of the bitcast and moves:
15204 //
15205 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15206 // t5: i32 = bitcast t2
15207 // t18: f16 = ARMISD::VMOVhr t5
15208 // =>
15209 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15210 if (Op0->getOpcode() == ISD::BITCAST) {
15211 SDValue Copy = Op0->getOperand(0);
15212 if (Copy.getValueType() == MVT::f32 &&
15213 Copy->getOpcode() == ISD::CopyFromReg) {
15214 bool HasGlue = Copy->getNumOperands() == 3;
15215 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15216 HasGlue ? Copy->getOperand(2) : SDValue()};
15217 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15218 SDValue NewCopy =
15220 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15221 ArrayRef(Ops, HasGlue ? 3 : 2));
15222
15223 // Update Users, Chains, and Potential Glue.
15224 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15225 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15226 if (HasGlue)
15227 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15228 NewCopy.getValue(2));
15229
15230 return NewCopy;
15231 }
15232 }
15233
15234 // fold (VMOVhr (load x)) -> (load (f16*)x)
15235 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15236 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15237 LN0->getMemoryVT() == MVT::i16) {
15238 SDValue Load =
15239 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15240 LN0->getBasePtr(), LN0->getMemOperand());
15241 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15242 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15243 return Load;
15244 }
15245 }
15246
15247 // Only the bottom 16 bits of the source register are used.
15248 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15249 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15250 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15251 return SDValue(N, 0);
15252
15253 return SDValue();
15254}
15255
15257 SDValue N0 = N->getOperand(0);
15258 EVT VT = N->getValueType(0);
15259
15260 // fold (VMOVrh (fpconst x)) -> const x
15262 APFloat V = C->getValueAPF();
15263 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15264 }
15265
15266 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15267 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15268 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15269
15270 SDValue Load =
15271 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15272 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15273 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15274 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15275 return Load;
15276 }
15277
15278 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15279 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15281 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15282 N0->getOperand(1));
15283
15284 return SDValue();
15285}
15286
15287/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15288/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15289/// i64 vector to have f64 elements, since the value can then be loaded
15290/// directly into a VFP register.
15292 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15293 for (unsigned i = 0; i < NumElts; ++i) {
15294 SDNode *Elt = N->getOperand(i).getNode();
15295 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15296 return true;
15297 }
15298 return false;
15299}
15300
15301/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15302/// ISD::BUILD_VECTOR.
15305 const ARMSubtarget *Subtarget) {
15306 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15307 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15308 // into a pair of GPRs, which is fine when the value is used as a scalar,
15309 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15310 SelectionDAG &DAG = DCI.DAG;
15311 if (N->getNumOperands() == 2)
15312 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15313 return RV;
15314
15315 // Load i64 elements as f64 values so that type legalization does not split
15316 // them up into i32 values.
15317 EVT VT = N->getValueType(0);
15318 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15319 return SDValue();
15320 SDLoc dl(N);
15322 unsigned NumElts = VT.getVectorNumElements();
15323 for (unsigned i = 0; i < NumElts; ++i) {
15324 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15325 Ops.push_back(V);
15326 // Make the DAGCombiner fold the bitcast.
15327 DCI.AddToWorklist(V.getNode());
15328 }
15329 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15330 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15331 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15332}
15333
15334/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15335static SDValue
15337 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15338 // At that time, we may have inserted bitcasts from integer to float.
15339 // If these bitcasts have survived DAGCombine, change the lowering of this
15340 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15341 // force to use floating point types.
15342
15343 // Make sure we can change the type of the vector.
15344 // This is possible iff:
15345 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15346 // 1.1. Vector is used only once.
15347 // 1.2. Use is a bit convert to an integer type.
15348 // 2. The size of its operands is 32 bits (64 bits are not legal).
15349 EVT VT = N->getValueType(0);
15350 EVT EltVT = VT.getVectorElementType();
15351
15352 // Check 1.1. and 2.
15353 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15354 return SDValue();
15355
15356 // By construction, the input type must be float.
15357 assert(EltVT == MVT::f32 && "Unexpected type!");
15358
15359 // Check 1.2.
15360 SDNode *Use = *N->user_begin();
15361 if (Use->getOpcode() != ISD::BITCAST ||
15362 Use->getValueType(0).isFloatingPoint())
15363 return SDValue();
15364
15365 // Check profitability.
15366 // The model is: if more than half of the relevant operands are bitcast from
15367 // i32, turn the build_vector into a sequence of insert_vector_elt.
15368 // Relevant operands are everything that is not statically
15369 // (i.e., at compile time) bitcast.
15370 unsigned NumOfBitCastedElts = 0;
15371 unsigned NumElts = VT.getVectorNumElements();
15372 unsigned NumOfRelevantElts = NumElts;
15373 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15374 SDValue Elt = N->getOperand(Idx);
15375 if (Elt->getOpcode() == ISD::BITCAST) {
15376 // Assume only bit cast to i32 will go away.
15377 if (Elt->getOperand(0).getValueType() == MVT::i32)
15378 ++NumOfBitCastedElts;
15379 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15380 // Constants are statically cast, thus do not count them as
15381 // relevant operands.
15382 --NumOfRelevantElts;
15383 }
15384
15385 // Check if more than half of the elements require a non-free bitcast.
15386 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15387 return SDValue();
15388
15389 SelectionDAG &DAG = DCI.DAG;
15390 // Create the new vector type.
15391 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15392 // Check if the type is legal.
15393 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15394 if (!TLI.isTypeLegal(VecVT))
15395 return SDValue();
15396
15397 // Combine:
15398 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15399 // => BITCAST INSERT_VECTOR_ELT
15400 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15401 // (BITCAST EN), N.
15402 SDValue Vec = DAG.getUNDEF(VecVT);
15403 SDLoc dl(N);
15404 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15405 SDValue V = N->getOperand(Idx);
15406 if (V.isUndef())
15407 continue;
15408 if (V.getOpcode() == ISD::BITCAST &&
15409 V->getOperand(0).getValueType() == MVT::i32)
15410 // Fold obvious case.
15411 V = V.getOperand(0);
15412 else {
15413 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15414 // Make the DAGCombiner fold the bitcasts.
15415 DCI.AddToWorklist(V.getNode());
15416 }
15417 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15418 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15419 }
15420 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15421 // Make the DAGCombiner fold the bitcasts.
15422 DCI.AddToWorklist(Vec.getNode());
15423 return Vec;
15424}
15425
15426static SDValue
15428 EVT VT = N->getValueType(0);
15429 SDValue Op = N->getOperand(0);
15430 SDLoc dl(N);
15431
15432 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15433 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15434 // If the valuetypes are the same, we can remove the cast entirely.
15435 if (Op->getOperand(0).getValueType() == VT)
15436 return Op->getOperand(0);
15437 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15438 }
15439
15440 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15441 // more VPNOT which might get folded as else predicates.
15442 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15443 SDValue X =
15444 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15446 DCI.DAG.getConstant(65535, dl, MVT::i32));
15447 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15448 }
15449
15450 // Only the bottom 16 bits of the source register are used.
15451 if (Op.getValueType() == MVT::i32) {
15452 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15453 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15454 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15455 return SDValue(N, 0);
15456 }
15457 return SDValue();
15458}
15459
15461 const ARMSubtarget *ST) {
15462 EVT VT = N->getValueType(0);
15463 SDValue Op = N->getOperand(0);
15464 SDLoc dl(N);
15465
15466 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15467 if (ST->isLittle())
15468 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15469
15470 // VT VECTOR_REG_CAST (VT Op) -> Op
15471 if (Op.getValueType() == VT)
15472 return Op;
15473 // VECTOR_REG_CAST undef -> undef
15474 if (Op.isUndef())
15475 return DAG.getUNDEF(VT);
15476
15477 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15478 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15479 // If the valuetypes are the same, we can remove the cast entirely.
15480 if (Op->getOperand(0).getValueType() == VT)
15481 return Op->getOperand(0);
15482 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15483 }
15484
15485 return SDValue();
15486}
15487
15489 const ARMSubtarget *Subtarget) {
15490 if (!Subtarget->hasMVEIntegerOps())
15491 return SDValue();
15492
15493 EVT VT = N->getValueType(0);
15494 SDValue Op0 = N->getOperand(0);
15495 SDValue Op1 = N->getOperand(1);
15496 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15497 SDLoc dl(N);
15498
15499 // vcmp X, 0, cc -> vcmpz X, cc
15500 if (isZeroVector(Op1))
15501 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15502
15503 unsigned SwappedCond = getSwappedCondition(Cond);
15504 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15505 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15506 if (isZeroVector(Op0))
15507 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15508 DAG.getConstant(SwappedCond, dl, MVT::i32));
15509 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15510 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15511 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15512 DAG.getConstant(SwappedCond, dl, MVT::i32));
15513 }
15514
15515 return SDValue();
15516}
15517
15518/// PerformInsertEltCombine - Target-specific dag combine xforms for
15519/// ISD::INSERT_VECTOR_ELT.
15522 // Bitcast an i64 load inserted into a vector to f64.
15523 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15524 EVT VT = N->getValueType(0);
15525 SDNode *Elt = N->getOperand(1).getNode();
15526 if (VT.getVectorElementType() != MVT::i64 ||
15527 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15528 return SDValue();
15529
15530 SelectionDAG &DAG = DCI.DAG;
15531 SDLoc dl(N);
15532 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15534 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15535 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15536 // Make the DAGCombiner fold the bitcasts.
15537 DCI.AddToWorklist(Vec.getNode());
15538 DCI.AddToWorklist(V.getNode());
15539 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15540 Vec, V, N->getOperand(2));
15541 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15542}
15543
15544// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15545// directly or bitcast to an integer if the original is a float vector.
15546// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15547// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15548static SDValue
15550 EVT VT = N->getValueType(0);
15551 SDLoc dl(N);
15552
15553 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15554 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15555 return SDValue();
15556
15557 SDValue Ext = SDValue(N, 0);
15558 if (Ext.getOpcode() == ISD::BITCAST &&
15559 Ext.getOperand(0).getValueType() == MVT::f32)
15560 Ext = Ext.getOperand(0);
15561 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15562 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15563 Ext.getConstantOperandVal(1) % 2 != 0)
15564 return SDValue();
15565 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15566 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15567 return SDValue();
15568
15569 SDValue Op0 = Ext.getOperand(0);
15570 EVT VecVT = Op0.getValueType();
15571 unsigned ResNo = Op0.getResNo();
15572 unsigned Lane = Ext.getConstantOperandVal(1);
15573 if (VecVT.getVectorNumElements() != 4)
15574 return SDValue();
15575
15576 // Find another extract, of Lane + 1
15577 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15578 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15579 isa<ConstantSDNode>(V->getOperand(1)) &&
15580 V->getConstantOperandVal(1) == Lane + 1 &&
15581 V->getOperand(0).getResNo() == ResNo;
15582 });
15583 if (OtherIt == Op0->users().end())
15584 return SDValue();
15585
15586 // For float extracts, we need to convert to an i32 for both vector
15587 // lanes.
15588 SDValue OtherExt(*OtherIt, 0);
15589 if (OtherExt.getValueType() != MVT::i32) {
15590 if (!OtherExt->hasOneUse() ||
15591 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15592 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15593 return SDValue();
15594 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15595 }
15596
15597 // Convert the type to a f64 and extract with a VMOVRRD.
15598 SDValue F64 = DCI.DAG.getNode(
15599 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15600 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15601 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15602 SDValue VMOVRRD =
15603 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15604
15605 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15606 return VMOVRRD;
15607}
15608
15611 const ARMSubtarget *ST) {
15612 SDValue Op0 = N->getOperand(0);
15613 EVT VT = N->getValueType(0);
15614 SDLoc dl(N);
15615
15616 // extract (vdup x) -> x
15617 if (Op0->getOpcode() == ARMISD::VDUP) {
15618 SDValue X = Op0->getOperand(0);
15619 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15620 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15621 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15622 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15623 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15624 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15625
15626 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15627 X = X->getOperand(0);
15628 if (X.getValueType() == VT)
15629 return X;
15630 }
15631
15632 // extract ARM_BUILD_VECTOR -> x
15633 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15634 isa<ConstantSDNode>(N->getOperand(1)) &&
15635 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15636 return Op0.getOperand(N->getConstantOperandVal(1));
15637 }
15638
15639 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15640 if (Op0.getValueType() == MVT::v4i32 &&
15641 isa<ConstantSDNode>(N->getOperand(1)) &&
15642 Op0.getOpcode() == ISD::BITCAST &&
15644 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15645 SDValue BV = Op0.getOperand(0);
15646 unsigned Offset = N->getConstantOperandVal(1);
15647 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15648 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15649 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15650 }
15651
15652 // extract x, n; extract x, n+1 -> VMOVRRD x
15653 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15654 return R;
15655
15656 // extract (MVETrunc(x)) -> extract x
15657 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15658 unsigned Idx = N->getConstantOperandVal(1);
15659 unsigned Vec =
15661 unsigned SubIdx =
15663 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15664 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15665 }
15666
15667 return SDValue();
15668}
15669
15671 SDValue Op = N->getOperand(0);
15672 EVT VT = N->getValueType(0);
15673
15674 // sext_inreg(VGETLANEu) -> VGETLANEs
15675 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15676 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15677 Op.getOperand(0).getValueType().getScalarType())
15678 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15679 Op.getOperand(1));
15680
15681 return SDValue();
15682}
15683
15684static SDValue
15686 SDValue Vec = N->getOperand(0);
15687 SDValue SubVec = N->getOperand(1);
15688 uint64_t IdxVal = N->getConstantOperandVal(2);
15689 EVT VecVT = Vec.getValueType();
15690 EVT SubVT = SubVec.getValueType();
15691
15692 // Only do this for legal fixed vector types.
15693 if (!VecVT.isFixedLengthVector() ||
15694 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15696 return SDValue();
15697
15698 // Ignore widening patterns.
15699 if (IdxVal == 0 && Vec.isUndef())
15700 return SDValue();
15701
15702 // Subvector must be half the width and an "aligned" insertion.
15703 unsigned NumSubElts = SubVT.getVectorNumElements();
15704 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15705 (IdxVal != 0 && IdxVal != NumSubElts))
15706 return SDValue();
15707
15708 // Fold insert_subvector -> concat_vectors
15709 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15710 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15711 SDLoc DL(N);
15712 SDValue Lo, Hi;
15713 if (IdxVal == 0) {
15714 Lo = SubVec;
15715 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15716 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15717 } else {
15718 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15719 DCI.DAG.getVectorIdxConstant(0, DL));
15720 Hi = SubVec;
15721 }
15722 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15723}
15724
15725// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15727 SelectionDAG &DAG) {
15728 SDValue Trunc = N->getOperand(0);
15729 EVT VT = Trunc.getValueType();
15730 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15731 return SDValue();
15732
15733 SDLoc DL(Trunc);
15734 if (isVMOVNTruncMask(N->getMask(), VT, false))
15735 return DAG.getNode(
15736 ARMISD::VMOVN, DL, VT,
15737 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15738 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15739 DAG.getConstant(1, DL, MVT::i32));
15740 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15741 return DAG.getNode(
15742 ARMISD::VMOVN, DL, VT,
15743 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15744 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15745 DAG.getConstant(1, DL, MVT::i32));
15746 return SDValue();
15747}
15748
15749/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15750/// ISD::VECTOR_SHUFFLE.
15753 return R;
15754
15755 // The LLVM shufflevector instruction does not require the shuffle mask
15756 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15757 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15758 // operands do not match the mask length, they are extended by concatenating
15759 // them with undef vectors. That is probably the right thing for other
15760 // targets, but for NEON it is better to concatenate two double-register
15761 // size vector operands into a single quad-register size vector. Do that
15762 // transformation here:
15763 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15764 // shuffle(concat(v1, v2), undef)
15765 SDValue Op0 = N->getOperand(0);
15766 SDValue Op1 = N->getOperand(1);
15767 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15768 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15769 Op0.getNumOperands() != 2 ||
15770 Op1.getNumOperands() != 2)
15771 return SDValue();
15772 SDValue Concat0Op1 = Op0.getOperand(1);
15773 SDValue Concat1Op1 = Op1.getOperand(1);
15774 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15775 return SDValue();
15776 // Skip the transformation if any of the types are illegal.
15777 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15778 EVT VT = N->getValueType(0);
15779 if (!TLI.isTypeLegal(VT) ||
15780 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15781 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15782 return SDValue();
15783
15784 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15785 Op0.getOperand(0), Op1.getOperand(0));
15786 // Translate the shuffle mask.
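 // For example, with NumElts = 8 and HalfElts = 4, a mask element of 9
 // (lane 1 of the second operand) becomes 4 + 9 - 8 = 5 in the new shuffle.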
15787 SmallVector<int, 16> NewMask;
15788 unsigned NumElts = VT.getVectorNumElements();
15789 unsigned HalfElts = NumElts/2;
15791 for (unsigned n = 0; n < NumElts; ++n) {
15792 int MaskElt = SVN->getMaskElt(n);
15793 int NewElt = -1;
15794 if (MaskElt < (int)HalfElts)
15795 NewElt = MaskElt;
15796 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15797 NewElt = HalfElts + MaskElt - NumElts;
15798 NewMask.push_back(NewElt);
15799 }
15800 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15801 DAG.getUNDEF(VT), NewMask);
15802}
15803
15804/// Load/store instruction that can be merged with a base address
15805/// update
15810 unsigned AddrOpIdx;
15811};
15812
15814 /// Instruction that updates a pointer
15816 /// Pointer increment operand
15818 /// Pointer increment value if it is a constant, or 0 otherwise
15819 unsigned ConstInc;
15820};
15821
15823 // Check that the add is independent of the load/store.
15824 // Otherwise, folding it would create a cycle. Search through Addr
15825 // as well, since the User may not be a direct user of Addr and
15826 // may only share a base pointer.
15829 Worklist.push_back(N);
15830 Worklist.push_back(User);
15831 const unsigned MaxSteps = 1024;
15832 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15833 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15834 return false;
15835 return true;
15836}
15837
15839 struct BaseUpdateUser &User,
15840 bool SimpleConstIncOnly,
15842 SelectionDAG &DAG = DCI.DAG;
15843 SDNode *N = Target.N;
15844 MemSDNode *MemN = cast<MemSDNode>(N);
15845 SDLoc dl(N);
15846
15847 // Find the new opcode for the updating load/store.
15848 bool isLoadOp = true;
15849 bool isLaneOp = false;
15850 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15851 // as an operand.
15852 bool hasAlignment = true;
15853 unsigned NewOpc = 0;
15854 unsigned NumVecs = 0;
15855 if (Target.isIntrinsic) {
15856 unsigned IntNo = N->getConstantOperandVal(1);
15857 switch (IntNo) {
15858 default:
15859 llvm_unreachable("unexpected intrinsic for Neon base update");
15860 case Intrinsic::arm_neon_vld1:
15861 NewOpc = ARMISD::VLD1_UPD;
15862 NumVecs = 1;
15863 break;
15864 case Intrinsic::arm_neon_vld2:
15865 NewOpc = ARMISD::VLD2_UPD;
15866 NumVecs = 2;
15867 break;
15868 case Intrinsic::arm_neon_vld3:
15869 NewOpc = ARMISD::VLD3_UPD;
15870 NumVecs = 3;
15871 break;
15872 case Intrinsic::arm_neon_vld4:
15873 NewOpc = ARMISD::VLD4_UPD;
15874 NumVecs = 4;
15875 break;
15876 case Intrinsic::arm_neon_vld1x2:
15877 NewOpc = ARMISD::VLD1x2_UPD;
15878 NumVecs = 2;
15879 hasAlignment = false;
15880 break;
15881 case Intrinsic::arm_neon_vld1x3:
15882 NewOpc = ARMISD::VLD1x3_UPD;
15883 NumVecs = 3;
15884 hasAlignment = false;
15885 break;
15886 case Intrinsic::arm_neon_vld1x4:
15887 NewOpc = ARMISD::VLD1x4_UPD;
15888 NumVecs = 4;
15889 hasAlignment = false;
15890 break;
15891 case Intrinsic::arm_neon_vld2dup:
15892 NewOpc = ARMISD::VLD2DUP_UPD;
15893 NumVecs = 2;
15894 break;
15895 case Intrinsic::arm_neon_vld3dup:
15896 NewOpc = ARMISD::VLD3DUP_UPD;
15897 NumVecs = 3;
15898 break;
15899 case Intrinsic::arm_neon_vld4dup:
15900 NewOpc = ARMISD::VLD4DUP_UPD;
15901 NumVecs = 4;
15902 break;
15903 case Intrinsic::arm_neon_vld2lane:
15904 NewOpc = ARMISD::VLD2LN_UPD;
15905 NumVecs = 2;
15906 isLaneOp = true;
15907 break;
15908 case Intrinsic::arm_neon_vld3lane:
15909 NewOpc = ARMISD::VLD3LN_UPD;
15910 NumVecs = 3;
15911 isLaneOp = true;
15912 break;
15913 case Intrinsic::arm_neon_vld4lane:
15914 NewOpc = ARMISD::VLD4LN_UPD;
15915 NumVecs = 4;
15916 isLaneOp = true;
15917 break;
15918 case Intrinsic::arm_neon_vst1:
15919 NewOpc = ARMISD::VST1_UPD;
15920 NumVecs = 1;
15921 isLoadOp = false;
15922 break;
15923 case Intrinsic::arm_neon_vst2:
15924 NewOpc = ARMISD::VST2_UPD;
15925 NumVecs = 2;
15926 isLoadOp = false;
15927 break;
15928 case Intrinsic::arm_neon_vst3:
15929 NewOpc = ARMISD::VST3_UPD;
15930 NumVecs = 3;
15931 isLoadOp = false;
15932 break;
15933 case Intrinsic::arm_neon_vst4:
15934 NewOpc = ARMISD::VST4_UPD;
15935 NumVecs = 4;
15936 isLoadOp = false;
15937 break;
15938 case Intrinsic::arm_neon_vst2lane:
15939 NewOpc = ARMISD::VST2LN_UPD;
15940 NumVecs = 2;
15941 isLoadOp = false;
15942 isLaneOp = true;
15943 break;
15944 case Intrinsic::arm_neon_vst3lane:
15945 NewOpc = ARMISD::VST3LN_UPD;
15946 NumVecs = 3;
15947 isLoadOp = false;
15948 isLaneOp = true;
15949 break;
15950 case Intrinsic::arm_neon_vst4lane:
15951 NewOpc = ARMISD::VST4LN_UPD;
15952 NumVecs = 4;
15953 isLoadOp = false;
15954 isLaneOp = true;
15955 break;
15956 case Intrinsic::arm_neon_vst1x2:
15957 NewOpc = ARMISD::VST1x2_UPD;
15958 NumVecs = 2;
15959 isLoadOp = false;
15960 hasAlignment = false;
15961 break;
15962 case Intrinsic::arm_neon_vst1x3:
15963 NewOpc = ARMISD::VST1x3_UPD;
15964 NumVecs = 3;
15965 isLoadOp = false;
15966 hasAlignment = false;
15967 break;
15968 case Intrinsic::arm_neon_vst1x4:
15969 NewOpc = ARMISD::VST1x4_UPD;
15970 NumVecs = 4;
15971 isLoadOp = false;
15972 hasAlignment = false;
15973 break;
15974 }
15975 } else {
15976 isLaneOp = true;
15977 switch (N->getOpcode()) {
15978 default:
15979 llvm_unreachable("unexpected opcode for Neon base update");
15980 case ARMISD::VLD1DUP:
15981 NewOpc = ARMISD::VLD1DUP_UPD;
15982 NumVecs = 1;
15983 break;
15984 case ARMISD::VLD2DUP:
15985 NewOpc = ARMISD::VLD2DUP_UPD;
15986 NumVecs = 2;
15987 break;
15988 case ARMISD::VLD3DUP:
15989 NewOpc = ARMISD::VLD3DUP_UPD;
15990 NumVecs = 3;
15991 break;
15992 case ARMISD::VLD4DUP:
15993 NewOpc = ARMISD::VLD4DUP_UPD;
15994 NumVecs = 4;
15995 break;
15996 case ISD::LOAD:
15997 NewOpc = ARMISD::VLD1_UPD;
15998 NumVecs = 1;
15999 isLaneOp = false;
16000 break;
16001 case ISD::STORE:
16002 NewOpc = ARMISD::VST1_UPD;
16003 NumVecs = 1;
16004 isLaneOp = false;
16005 isLoadOp = false;
16006 break;
16007 }
16008 }
16009
16010 // Find the size of memory referenced by the load/store.
16011 EVT VecTy;
16012 if (isLoadOp) {
16013 VecTy = N->getValueType(0);
16014 } else if (Target.isIntrinsic) {
16015 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
16016 } else {
16017 assert(Target.isStore &&
16018 "Node has to be a load, a store, or an intrinsic!");
16019 VecTy = N->getOperand(1).getValueType();
16020 }
16021
16022 bool isVLDDUPOp =
16023 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
16024 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
16025
16026 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16027 if (isLaneOp || isVLDDUPOp)
16028 NumBytes /= VecTy.getVectorNumElements();
16029
16030 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
16031 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
16032 // separate instructions that make it harder to use a non-constant update.
16033 return false;
16034 }
16035
16036 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
16037 return false;
16038
16039 if (!isValidBaseUpdate(N, User.N))
16040 return false;
16041
16042 // OK, we found an ADD we can fold into the base update.
16043 // Now, create a _UPD node, taking care of not breaking alignment.
16044
16045 EVT AlignedVecTy = VecTy;
16046 Align Alignment = MemN->getAlign();
16047
16048 // If this is a less-than-standard-aligned load/store, change the type to
16049 // match the standard alignment.
16050 // The alignment is overlooked when selecting _UPD variants; and it's
16051 // easier to introduce bitcasts here than fix that.
16052 // There are 3 ways to get to this base-update combine:
16053 // - intrinsics: they are assumed to be properly aligned (to the standard
16054 // alignment of the memory type), so we don't need to do anything.
16055 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16056 // intrinsics, so, likewise, there's nothing to do.
16057 // - generic load/store instructions: the alignment is specified as an
16058 // explicit operand, rather than implicitly as the standard alignment
16059 // of the memory type (like the intrinsics). We need to change the
16060 // memory type to match the explicit alignment. That way, we don't
16061 // generate non-standard-aligned ARMISD::VLDx nodes.
16062 if (isa<LSBaseSDNode>(N)) {
16063 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
16064 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16065 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16066 assert(!isLaneOp && "Unexpected generic load/store lane.");
16067 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16068 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16069 }
16070 // Don't set an explicit alignment on regular load/stores that we want
16071 // to transform to VLD/VST 1_UPD nodes.
16072 // This matches the behavior of regular load/stores, which only get an
16073 // explicit alignment if the MMO alignment is larger than the standard
16074 // alignment of the memory type.
16075 // Intrinsics, however, always get an explicit alignment, set to the
16076 // alignment of the MMO.
16077 Alignment = Align(1);
16078 }
16079
16080 // Create the new updating load/store node.
16081 // First, create an SDVTList for the new updating node's results.
16082 EVT Tys[6];
16083 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16084 unsigned n;
16085 for (n = 0; n < NumResultVecs; ++n)
16086 Tys[n] = AlignedVecTy;
16087 Tys[n++] = MVT::i32;
16088 Tys[n] = MVT::Other;
16089 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16090
16091 // Then, gather the new node's operands.
16092 SmallVector<SDValue, 8> Ops;
16093 Ops.push_back(N->getOperand(0)); // incoming chain
16094 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16095 Ops.push_back(User.Inc);
16096
16097 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16098 // Try to match the intrinsic's signature
16099 Ops.push_back(StN->getValue());
16100 } else {
16101 // Loads (and of course intrinsics) match the intrinsics' signature,
16102 // so just add all but the alignment operand.
16103 unsigned LastOperand =
16104 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16105 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16106 Ops.push_back(N->getOperand(i));
16107 }
16108
16109 // For all node types, the alignment operand is always the last one.
16110 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16111
16112 // If this is a non-standard-aligned STORE, the penultimate operand is the
16113 // stored value. Bitcast it to the aligned type.
16114 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16115 SDValue &StVal = Ops[Ops.size() - 2];
16116 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16117 }
16118
16119 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16120 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16121 MemN->getMemOperand());
16122
16123 // Update the uses.
16124 SmallVector<SDValue, 5> NewResults;
16125 for (unsigned i = 0; i < NumResultVecs; ++i)
16126 NewResults.push_back(SDValue(UpdN.getNode(), i));
16127
16128 // If this is a non-standard-aligned LOAD, the first result is the loaded
16129 // value. Bitcast it to the expected result type.
16130 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16131 SDValue &LdVal = NewResults[0];
16132 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16133 }
16134
16135 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16136 DCI.CombineTo(N, NewResults);
16137 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16138
16139 return true;
16140}
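// Illustrative sketch of the transform performed above (assumed example,
// not from the original source): a NEON load whose address is also
// incremented by exactly the access size,
//   %v = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %p, i32 4)
//   %p.next = getelementptr i8, ptr %p, i32 16
// is rewritten into a single post-indexed VLD1_UPD node that produces both
// the loaded value and the updated pointer, and selects to
//   vld1.32 {d16, d17}, [r0]!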
16141
16142// If (opcode ptr inc) is an ADD-like instruction, return the
16143// increment value. Otherwise return 0.
16144static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16145 SDValue Inc, const SelectionDAG &DAG) {
16146 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
16147 if (!CInc)
16148 return 0;
16149
16150 switch (Opcode) {
16151 case ARMISD::VLD1_UPD:
16152 case ISD::ADD:
16153 return CInc->getZExtValue();
16154 case ISD::OR: {
16155 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16156 // (OR ptr inc) is the same as (ADD ptr inc)
16157 return CInc->getZExtValue();
16158 }
16159 return 0;
16160 }
16161 default:
16162 return 0;
16163 }
16164}
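// Example of the OR-as-ADD case handled above (illustrative only): if %p is
// known to be 16-byte aligned, then (or %p, 8) sets only bits that are
// already zero in %p, so DAG.haveNoCommonBitsSet(Ptr, Inc) holds and the OR
// can be treated as a constant increment of 8 for post-indexing purposes.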
16165
16166static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
16167 switch (N->getOpcode()) {
16168 case ISD::ADD:
16169 case ISD::OR: {
16170 if (isa<ConstantSDNode>(N->getOperand(1))) {
16171 *Ptr = N->getOperand(0);
16172 *CInc = N->getOperand(1);
16173 return true;
16174 }
16175 return false;
16176 }
16177 case ARMISD::VLD1_UPD: {
16178 if (isa<ConstantSDNode>(N->getOperand(2))) {
16179 *Ptr = N->getOperand(1);
16180 *CInc = N->getOperand(2);
16181 return true;
16182 }
16183 return false;
16184 }
16185 default:
16186 return false;
16187 }
16188}
16189
16190/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16191/// NEON load/store intrinsics, and generic vector load/stores, to merge
16192/// base address updates.
16193/// For generic load/stores, the memory type is assumed to be a vector.
16194/// The caller is assumed to have checked legality.
16195static SDValue CombineBaseUpdate(SDNode *N,
16196 TargetLowering::DAGCombinerInfo &DCI) {
16197 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16198 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16199 const bool isStore = N->getOpcode() == ISD::STORE;
16200 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16201 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16202
16203 // Limit the number of possible base-updates we look at to prevent degenerate
16204 // cases.
16205 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16206
16207 SDValue Addr = N->getOperand(AddrOpIdx);
16208
16209 SmallVector<BaseUpdateUser, 8> BaseUpdates;
16210
16211 // Search for a use of the address operand that is an increment.
16212 for (SDUse &Use : Addr->uses()) {
16213 SDNode *User = Use.getUser();
16214 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16215 continue;
16216
16217 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16218 unsigned ConstInc =
16219 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16220
16221 if (ConstInc || User->getOpcode() == ISD::ADD) {
16222 BaseUpdates.push_back({User, Inc, ConstInc});
16223 if (BaseUpdates.size() >= MaxBaseUpdates)
16224 break;
16225 }
16226 }
16227
16228 // If the address is a constant pointer increment itself, find
16229 // another constant increment that has the same base operand
16230 SDValue Base;
16231 SDValue CInc;
16232 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16233 unsigned Offset =
16234 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16235 for (SDUse &Use : Base->uses()) {
16236
16237 SDNode *User = Use.getUser();
16238 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16239 User->getNumOperands() != 2)
16240 continue;
16241
16242 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16243 unsigned UserOffset =
16244 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16245
16246 if (!UserOffset || UserOffset <= Offset)
16247 continue;
16248
16249 unsigned NewConstInc = UserOffset - Offset;
16250 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16251 BaseUpdates.push_back({User, NewInc, NewConstInc});
16252 if (BaseUpdates.size() >= MaxBaseUpdates)
16253 break;
16254 }
16255 }
16256
16257 // Try to fold the load/store with an update that matches memory
16258 // access size. This should work well for sequential loads.
16259 unsigned NumValidUpd = BaseUpdates.size();
16260 for (unsigned I = 0; I < NumValidUpd; I++) {
16261 BaseUpdateUser &User = BaseUpdates[I];
16262 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16263 return SDValue();
16264 }
16265
16266 // Try to fold with other users. Non-constant updates are considered
16267 // first, and constant updates are sorted to not break a sequence of
16268 // strided accesses (if there is any).
16269 llvm::stable_sort(BaseUpdates,
16270 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16271 return LHS.ConstInc < RHS.ConstInc;
16272 });
16273 for (BaseUpdateUser &User : BaseUpdates) {
16274 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16275 return SDValue();
16276 }
16277 return SDValue();
16278}
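// Illustrative note on the two passes above (assumed example, not from the
// original source): given
//   %a = load <2 x i32>, ptr %p
//   %p8 = getelementptr i8, ptr %p, i32 8
//   %b = load <2 x i32>, ptr %p8
//   %p16 = getelementptr i8, ptr %p, i32 16
// the first pass only folds increments equal to the 8-byte access size, so
// %a pairs with the %p -> %p8 step and %b pairs with the %p8 -> %p16 step
// (found via findPointerConstIncrement on the base pointer), producing two
// post-indexed vld1.32 loads. Other constant or register increments are
// only considered in the second, sorted pass.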
16279
16280static SDValue PerformVLDCombine(SDNode *N,
16281 TargetLowering::DAGCombinerInfo &DCI) {
16282 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16283 return SDValue();
16284
16285 return CombineBaseUpdate(N, DCI);
16286}
16287
16288static SDValue PerformMVEVLDCombine(SDNode *N,
16289 TargetLowering::DAGCombinerInfo &DCI) {
16290 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16291 return SDValue();
16292
16293 SelectionDAG &DAG = DCI.DAG;
16294 SDValue Addr = N->getOperand(2);
16295 MemSDNode *MemN = cast<MemSDNode>(N);
16296 SDLoc dl(N);
16297
16298 // For the stores, where there are multiple intrinsics we only actually want
16299 // to post-inc the last of them.
16300 unsigned IntNo = N->getConstantOperandVal(1);
16301 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16302 return SDValue();
16303 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16304 return SDValue();
16305
16306 // Search for a use of the address operand that is an increment.
16307 for (SDUse &Use : Addr->uses()) {
16308 SDNode *User = Use.getUser();
16309 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16310 continue;
16311
16312 // Check that the add is independent of the load/store. Otherwise, folding
16313 // it would create a cycle. We can avoid searching through Addr as it's a
16314 // predecessor to both.
16315 SmallPtrSet<const SDNode *, 32> Visited;
16316 SmallVector<const SDNode *, 16> Worklist;
16317 Visited.insert(Addr.getNode());
16318 Worklist.push_back(N);
16319 Worklist.push_back(User);
16320 const unsigned MaxSteps = 1024;
16321 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16322 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16323 continue;
16324
16325 // Find the new opcode for the updating load/store.
16326 bool isLoadOp = true;
16327 unsigned NewOpc = 0;
16328 unsigned NumVecs = 0;
16329 switch (IntNo) {
16330 default:
16331 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16332 case Intrinsic::arm_mve_vld2q:
16333 NewOpc = ARMISD::VLD2_UPD;
16334 NumVecs = 2;
16335 break;
16336 case Intrinsic::arm_mve_vld4q:
16337 NewOpc = ARMISD::VLD4_UPD;
16338 NumVecs = 4;
16339 break;
16340 case Intrinsic::arm_mve_vst2q:
16341 NewOpc = ARMISD::VST2_UPD;
16342 NumVecs = 2;
16343 isLoadOp = false;
16344 break;
16345 case Intrinsic::arm_mve_vst4q:
16346 NewOpc = ARMISD::VST4_UPD;
16347 NumVecs = 4;
16348 isLoadOp = false;
16349 break;
16350 }
16351
16352 // Find the size of memory referenced by the load/store.
16353 EVT VecTy;
16354 if (isLoadOp) {
16355 VecTy = N->getValueType(0);
16356 } else {
16357 VecTy = N->getOperand(3).getValueType();
16358 }
16359
16360 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16361
16362 // If the increment is a constant, it must match the memory ref size.
16363 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16364 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
16365 if (!CInc || CInc->getZExtValue() != NumBytes)
16366 continue;
16367
16368 // Create the new updating load/store node.
16369 // First, create an SDVTList for the new updating node's results.
16370 EVT Tys[6];
16371 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16372 unsigned n;
16373 for (n = 0; n < NumResultVecs; ++n)
16374 Tys[n] = VecTy;
16375 Tys[n++] = MVT::i32;
16376 Tys[n] = MVT::Other;
16377 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16378
16379 // Then, gather the new node's operands.
16380 SmallVector<SDValue, 8> Ops;
16381 Ops.push_back(N->getOperand(0)); // incoming chain
16382 Ops.push_back(N->getOperand(2)); // ptr
16383 Ops.push_back(Inc);
16384
16385 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16386 Ops.push_back(N->getOperand(i));
16387
16388 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16389 MemN->getMemOperand());
16390
16391 // Update the uses.
16392 SmallVector<SDValue, 5> NewResults;
16393 for (unsigned i = 0; i < NumResultVecs; ++i)
16394 NewResults.push_back(SDValue(UpdN.getNode(), i));
16395
16396 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16397 DCI.CombineTo(N, NewResults);
16398 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16399
16400 break;
16401 }
16402
16403 return SDValue();
16404}
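// Illustrative sketch (assumed example, not from the original source): an
// MVE interleaving load followed by an increment of the full access size,
//   %s = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0(ptr %p)
//   %p.next = getelementptr i8, ptr %p, i32 32
// becomes a VLD2_UPD node, i.e. the write-back form
//   vld20.32 {q0, q1}, [r0]
//   vld21.32 {q0, q1}, [r0]!
// Increments that do not match the 32-byte memory size are left alone.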
16405
16406/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16407/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16408/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16409/// return true.
16410static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16411 SelectionDAG &DAG = DCI.DAG;
16412 EVT VT = N->getValueType(0);
16413 // vldN-dup instructions only support 64-bit vectors for N > 1.
16414 if (!VT.is64BitVector())
16415 return false;
16416
16417 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16418 SDNode *VLD = N->getOperand(0).getNode();
16419 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16420 return false;
16421 unsigned NumVecs = 0;
16422 unsigned NewOpc = 0;
16423 unsigned IntNo = VLD->getConstantOperandVal(1);
16424 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16425 NumVecs = 2;
16426 NewOpc = ARMISD::VLD2DUP;
16427 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16428 NumVecs = 3;
16429 NewOpc = ARMISD::VLD3DUP;
16430 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16431 NumVecs = 4;
16432 NewOpc = ARMISD::VLD4DUP;
16433 } else {
16434 return false;
16435 }
16436
16437 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16438 // numbers match the load.
16439 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16440 for (SDUse &Use : VLD->uses()) {
16441 // Ignore uses of the chain result.
16442 if (Use.getResNo() == NumVecs)
16443 continue;
16444 SDNode *User = Use.getUser();
16445 if (User->getOpcode() != ARMISD::VDUPLANE ||
16446 VLDLaneNo != User->getConstantOperandVal(1))
16447 return false;
16448 }
16449
16450 // Create the vldN-dup node.
16451 EVT Tys[5];
16452 unsigned n;
16453 for (n = 0; n < NumVecs; ++n)
16454 Tys[n] = VT;
16455 Tys[n] = MVT::Other;
16456 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16457 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16458 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16459 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16460 Ops, VLDMemInt->getMemoryVT(),
16461 VLDMemInt->getMemOperand());
16462
16463 // Update the uses.
16464 for (SDUse &Use : VLD->uses()) {
16465 unsigned ResNo = Use.getResNo();
16466 // Ignore uses of the chain result.
16467 if (ResNo == NumVecs)
16468 continue;
16469 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16470 }
16471
16472 // Now the vldN-lane intrinsic is dead except for its chain result.
16473 // Update uses of the chain.
16474 std::vector<SDValue> VLDDupResults;
16475 for (unsigned n = 0; n < NumVecs; ++n)
16476 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16477 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16478 DCI.CombineTo(VLD, VLDDupResults);
16479
16480 return true;
16481}
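// Illustrative sketch of the combine above (not from the original source):
// if a vld2lane intrinsic is only used by VDUPLANE nodes that all broadcast
// the very lane it loaded, the load-then-duplicate pair collapses to a
// single "load one element and replicate" instruction, e.g.
//   vld2.8 {d16[], d17[]}, [r0]
// instead of a lane load followed by two separate vdup.8 instructions.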
16482
16483/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16484/// ARMISD::VDUPLANE.
16485static SDValue PerformVDUPLANECombine(SDNode *N,
16486 TargetLowering::DAGCombinerInfo &DCI,
16487 const ARMSubtarget *Subtarget) {
16488 SDValue Op = N->getOperand(0);
16489 EVT VT = N->getValueType(0);
16490
16491 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16492 if (Subtarget->hasMVEIntegerOps()) {
16493 EVT ExtractVT = VT.getVectorElementType();
16494 // We need to ensure we are creating a legal type.
16495 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16496 ExtractVT = MVT::i32;
16497 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16498 N->getOperand(0), N->getOperand(1));
16499 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16500 }
16501
16502 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16503 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16504 if (CombineVLDDUP(N, DCI))
16505 return SDValue(N, 0);
16506
16507 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16508 // redundant. Ignore bit_converts for now; element sizes are checked below.
16509 while (Op.getOpcode() == ISD::BITCAST)
16510 Op = Op.getOperand(0);
16511 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16512 return SDValue();
16513
16514 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16515 unsigned EltSize = Op.getScalarValueSizeInBits();
16516 // The canonical VMOV for a zero vector uses a 32-bit element size.
16517 unsigned Imm = Op.getConstantOperandVal(0);
16518 unsigned EltBits;
16519 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16520 EltSize = 8;
16521 if (EltSize > VT.getScalarSizeInBits())
16522 return SDValue();
16523
16524 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16525}
16526
16527/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16528static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16529 const ARMSubtarget *Subtarget) {
16530 SDValue Op = N->getOperand(0);
16531 SDLoc dl(N);
16532
16533 if (Subtarget->hasMVEIntegerOps()) {
16534 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16535 // need to come from a GPR.
16536 if (Op.getValueType() == MVT::f32)
16537 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16538 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16539 else if (Op.getValueType() == MVT::f16)
16540 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16541 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16542 }
16543
16544 if (!Subtarget->hasNEON())
16545 return SDValue();
16546
16547 // Match VDUP(LOAD) -> VLD1DUP.
16548 // We match this pattern here rather than waiting for isel because the
16549 // transform is only legal for unindexed loads.
16550 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16551 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16552 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16553 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16554 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16555 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16556 SDValue VLDDup =
16557 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16558 LD->getMemoryVT(), LD->getMemOperand());
16559 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16560 return VLDDup;
16561 }
16562
16563 return SDValue();
16564}
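// Illustrative example of the VDUP(LOAD) -> VLD1DUP match above (assumed,
// not from the original source): a scalar load that is only used to splat a
// <4 x float> value,
//   %s = load float, ptr %p
//   %v = splat of %s to <4 x float>
// becomes a single load-and-replicate instruction,
//   vld1.32 {d16[], d17[]}, [r0]
// provided the load is unindexed and has no other users.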
16565
16566static SDValue PerformLOADCombine(SDNode *N,
16567 TargetLowering::DAGCombinerInfo &DCI,
16568 const ARMSubtarget *Subtarget) {
16569 EVT VT = N->getValueType(0);
16570
16571 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16572 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16573 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16574 return CombineBaseUpdate(N, DCI);
16575
16576 return SDValue();
16577}
16578
16579// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16580// pack all of the elements in one place. Next, store to memory in fewer
16581// chunks.
16582static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16583 SelectionDAG &DAG) {
16584 SDValue StVal = St->getValue();
16585 EVT VT = StVal.getValueType();
16586 if (!St->isTruncatingStore() || !VT.isVector())
16587 return SDValue();
16588 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16589 EVT StVT = St->getMemoryVT();
16590 unsigned NumElems = VT.getVectorNumElements();
16591 assert(StVT != VT && "Cannot truncate to the same type");
16592 unsigned FromEltSz = VT.getScalarSizeInBits();
16593 unsigned ToEltSz = StVT.getScalarSizeInBits();
16594
16595 // From, To sizes and ElemCount must be pow of two
16596 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16597 return SDValue();
16598
16599 // We are going to use the original vector elt for storing.
16600 // Accumulated smaller vector elements must be a multiple of the store size.
16601 if (0 != (NumElems * FromEltSz) % ToEltSz)
16602 return SDValue();
16603
16604 unsigned SizeRatio = FromEltSz / ToEltSz;
16605 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16606
16607 // Create a type on which we perform the shuffle.
16608 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16609 NumElems * SizeRatio);
16610 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16611
16612 SDLoc DL(St);
16613 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16614 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16615 for (unsigned i = 0; i < NumElems; ++i)
16616 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16617 : i * SizeRatio;
16618
16619 // Can't shuffle using an illegal type.
16620 if (!TLI.isTypeLegal(WideVecVT))
16621 return SDValue();
16622
16623 SDValue Shuff = DAG.getVectorShuffle(
16624 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16625 // At this point all of the data is stored at the bottom of the
16626 // register. We now need to save it to mem.
16627
16628 // Find the largest store unit
16629 MVT StoreType = MVT::i8;
16630 for (MVT Tp : MVT::integer_valuetypes()) {
16631 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16632 StoreType = Tp;
16633 }
16634 // Didn't find a legal store type.
16635 if (!TLI.isTypeLegal(StoreType))
16636 return SDValue();
16637
16638 // Bitcast the original vector into a vector of store-size units
16639 EVT StoreVecVT =
16640 EVT::getVectorVT(*DAG.getContext(), StoreType,
16641 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16642 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16643 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16644 SmallVector<SDValue, 8> Chains;
16645 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16646 TLI.getPointerTy(DAG.getDataLayout()));
16647 SDValue BasePtr = St->getBasePtr();
16648
16649 // Perform one or more big stores into memory.
16650 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16651 for (unsigned I = 0; I < E; I++) {
16652 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16653 ShuffWide, DAG.getIntPtrConstant(I, DL));
16654 SDValue Ch =
16655 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16656 St->getAlign(), St->getMemOperand()->getFlags());
16657 BasePtr =
16658 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16659 Chains.push_back(Ch);
16660 }
16661 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16662}
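// Worked example for the truncating-store rewrite above (illustrative only,
// not from the original source): a truncating store of <4 x i32> to
// <4 x i8> first bitcasts the source to <16 x i8>, shuffles bytes
// 0, 4, 8 and 12 (on little-endian) into the low lanes, and then emits a
// single i32 store of the packed result instead of four separate byte
// stores.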
16663
16664// Try taking a single vector store from an fpround (which would otherwise turn
16665// into an expensive buildvector) and splitting it into a series of narrowing
16666// stores.
16667static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16668 SelectionDAG &DAG) {
16669 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16670 return SDValue();
16671 SDValue Trunc = St->getValue();
16672 if (Trunc->getOpcode() != ISD::FP_ROUND)
16673 return SDValue();
16674 EVT FromVT = Trunc->getOperand(0).getValueType();
16675 EVT ToVT = Trunc.getValueType();
16676 if (!ToVT.isVector())
16677 return SDValue();
16679 EVT ToEltVT = ToVT.getVectorElementType();
16680 EVT FromEltVT = FromVT.getVectorElementType();
16681
16682 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16683 return SDValue();
16684
16685 unsigned NumElements = 4;
16686 if (FromVT.getVectorNumElements() % NumElements != 0)
16687 return SDValue();
16688
16689 // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
16690 // use the VMOVN over splitting the store. We are looking for patterns of:
16691 // !rev: 0 N 1 N+1 2 N+2 ...
16692 // rev: N 0 N+1 1 N+2 2 ...
16693 // The shuffle may either be a single source (in which case N = NumElts/2) or
16694 // two inputs extended with concat to the same size (in which case N =
16695 // NumElts).
16696 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16697 ArrayRef<int> M = SVN->getMask();
16698 unsigned NumElts = ToVT.getVectorNumElements();
16699 if (SVN->getOperand(1).isUndef())
16700 NumElts /= 2;
16701
16702 unsigned Off0 = Rev ? NumElts : 0;
16703 unsigned Off1 = Rev ? 0 : NumElts;
16704
16705 for (unsigned I = 0; I < NumElts; I += 2) {
16706 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16707 return false;
16708 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16709 return false;
16710 }
16711
16712 return true;
16713 };
16714
16715 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16716 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16717 return SDValue();
16718
16719 LLVMContext &C = *DAG.getContext();
16720 SDLoc DL(St);
16721 // Details about the old store
16722 SDValue Ch = St->getChain();
16723 SDValue BasePtr = St->getBasePtr();
16724 Align Alignment = St->getBaseAlign();
16725 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16726 AAMDNodes AAInfo = St->getAAInfo();
16727
16728 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16729 // and then stored as truncating integer stores.
16730 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16731 EVT NewToVT = EVT::getVectorVT(
16732 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16733
16734 SmallVector<SDValue, 4> Stores;
16735 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16736 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16737 SDValue NewPtr =
16738 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16739
16740 SDValue Extract =
16741 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16742 DAG.getConstant(i * NumElements, DL, MVT::i32));
16743
16744 SDValue FPTrunc =
16745 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16746 Extract, DAG.getConstant(0, DL, MVT::i32));
16747 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16748
16749 SDValue Store = DAG.getTruncStore(
16750 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16751 NewToVT, Alignment, MMOFlags, AAInfo);
16752 Stores.push_back(Store);
16753 }
16754 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16755}
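// Illustrative sketch (assumed example, not from the original source): a
// store of fptrunc <8 x float> to <8 x half> is split into two slices of
// four elements; each slice is narrowed with MVE VCVTN (f32 -> f16 into the
// bottom lanes of a v8f16), reinterpreted as v4i32, and stored as a
// truncating <4 x i32> -> <4 x i16> integer store at byte offsets 0 and 8,
// avoiding an expensive build_vector of the converted halves.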
16756
16757// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16758// into an expensive buildvector) and splitting it into a series of narrowing
16759// stores.
16760static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16761 SelectionDAG &DAG) {
16762 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16763 return SDValue();
16764 SDValue Trunc = St->getValue();
16765 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16766 return SDValue();
16767 EVT FromVT = Trunc->getOperand(0).getValueType();
16768 EVT ToVT = Trunc.getValueType();
16769
16770 LLVMContext &C = *DAG.getContext();
16771 SDLoc DL(St);
16772 // Details about the old store
16773 SDValue Ch = St->getChain();
16774 SDValue BasePtr = St->getBasePtr();
16775 Align Alignment = St->getBaseAlign();
16776 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16777 AAMDNodes AAInfo = St->getAAInfo();
16778
16779 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16780 FromVT.getVectorNumElements());
16781
16782 SmallVector<SDValue, 4> Stores;
16783 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16784 unsigned NewOffset =
16785 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16786 SDValue NewPtr =
16787 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16788
16789 SDValue Extract = Trunc.getOperand(i);
16790 SDValue Store = DAG.getTruncStore(
16791 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16792 NewToVT, Alignment, MMOFlags, AAInfo);
16793 Stores.push_back(Store);
16794 }
16795 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16796}
16797
16798// Given a floating point store from an extracted vector, with an integer
16799// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16800// help reduce fp register pressure, doesn't require the fp extract and allows
16801// use of more integer post-inc stores not available with vstr.
16802static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16803 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16804 return SDValue();
16805 SDValue Extract = St->getValue();
16806 EVT VT = Extract.getValueType();
16807 // For now only uses f16. This may be useful for f32 too, but that will
16808 // be bitcast(extract), not the VGETLANEu we currently check here.
16809 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16810 return SDValue();
16811
16812 SDNode *GetLane =
16813 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16814 {Extract.getOperand(0), Extract.getOperand(1)});
16815 if (!GetLane)
16816 return SDValue();
16817
16818 LLVMContext &C = *DAG.getContext();
16819 SDLoc DL(St);
16820 // Create a new integer store to replace the existing floating point version.
16821 SDValue Ch = St->getChain();
16822 SDValue BasePtr = St->getBasePtr();
16823 Align Alignment = St->getBaseAlign();
16824 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16825 AAMDNodes AAInfo = St->getAAInfo();
16826 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16827 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16828 St->getPointerInfo(), NewToVT, Alignment,
16829 MMOFlags, AAInfo);
16830
16831 return Store;
16832}
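// Illustrative example (not from the original source): if the DAG already
// contains VGETLANEu(%q, lane) for the same vector and lane, then
//   store (extract_vector_elt <8 x half> %q, i32 lane), ptr %p
// is emitted as a 16-bit truncating integer store of that existing
// VGETLANEu result (an strh from a GPR), so no separate fp-register extract
// is needed and integer post-increment stores remain usable.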
16833
16834/// PerformSTORECombine - Target-specific dag combine xforms for
16835/// ISD::STORE.
16836static SDValue PerformSTORECombine(SDNode *N,
16837 TargetLowering::DAGCombinerInfo &DCI,
16838 const ARMSubtarget *Subtarget) {
16839 StoreSDNode *St = cast<StoreSDNode>(N);
16840 if (St->isVolatile())
16841 return SDValue();
16842 SDValue StVal = St->getValue();
16843 EVT VT = StVal.getValueType();
16844
16845 if (Subtarget->hasNEON())
16846 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16847 return Store;
16848
16849 if (Subtarget->hasMVEFloatOps())
16850 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16851 return NewToken;
16852
16853 if (Subtarget->hasMVEIntegerOps()) {
16854 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16855 return NewChain;
16856 if (SDValue NewToken =
16857 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16858 return NewToken;
16859 }
16860
16861 if (!ISD::isNormalStore(St))
16862 return SDValue();
16863
16864 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16865 // ARM stores of arguments in the same cache line.
16866 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16867 StVal.getNode()->hasOneUse()) {
16868 SelectionDAG &DAG = DCI.DAG;
16869 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16870 SDLoc DL(St);
16871 SDValue BasePtr = St->getBasePtr();
16872 SDValue NewST1 = DAG.getStore(
16873 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16874 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16875 St->getMemOperand()->getFlags());
16876
16877 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16878 DAG.getConstant(4, DL, MVT::i32));
16879 return DAG.getStore(NewST1.getValue(0), DL,
16880 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16881 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16882 St->getBaseAlign(), St->getMemOperand()->getFlags());
16883 }
16884
16885 if (StVal.getValueType() == MVT::i64 &&
16886 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16887
16888 // Bitcast an i64 store extracted from a vector to f64.
16889 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16890 SelectionDAG &DAG = DCI.DAG;
16891 SDLoc dl(StVal);
16892 SDValue IntVec = StVal.getOperand(0);
16893 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16894 IntVec.getValueType().getVectorNumElements());
16895 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16896 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16897 Vec, StVal.getOperand(1));
16898 dl = SDLoc(N);
16899 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16900 // Make the DAGCombiner fold the bitcasts.
16901 DCI.AddToWorklist(Vec.getNode());
16902 DCI.AddToWorklist(ExtElt.getNode());
16903 DCI.AddToWorklist(V.getNode());
16904 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16905 St->getPointerInfo(), St->getAlign(),
16906 St->getMemOperand()->getFlags(), St->getAAInfo());
16907 }
16908
16909 // If this is a legal vector store, try to combine it into a VST1_UPD.
16910 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16911 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16912 return CombineBaseUpdate(N, DCI);
16913
16914 return SDValue();
16915}
16916
16917/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16918/// can replace combinations of VMUL and VCVT (floating-point to integer)
16919/// when the VMUL has a constant operand that is a power of 2.
16920///
16921/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16922/// vmul.f32 d16, d17, d16
16923/// vcvt.s32.f32 d16, d16
16924/// becomes:
16925/// vcvt.s32.f32 d16, d16, #3
16926static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16927 const ARMSubtarget *Subtarget) {
16928 if (!Subtarget->hasNEON())
16929 return SDValue();
16930
16931 SDValue Op = N->getOperand(0);
16932 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16933 Op.getOpcode() != ISD::FMUL)
16934 return SDValue();
16935
16936 SDValue ConstVec = Op->getOperand(1);
16937 if (!isa<BuildVectorSDNode>(ConstVec))
16938 return SDValue();
16939
16940 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16941 uint32_t FloatBits = FloatTy.getSizeInBits();
16942 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16943 uint32_t IntBits = IntTy.getSizeInBits();
16944 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16945 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16946 // These instructions only exist converting from f32 to i32. We can handle
16947 // smaller integers by generating an extra truncate, but larger ones would
16948 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16949 // these instructions only support v2i32/v4i32 types.
16950 return SDValue();
16951 }
16952
16953 BitVector UndefElements;
16954 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16955 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16956 if (C == -1 || C == 0 || C > 32)
16957 return SDValue();
16958
16959 SDLoc dl(N);
16960 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16961 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16962 Intrinsic::arm_neon_vcvtfp2fxu;
16963 SDValue FixConv = DAG.getNode(
16964 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16965 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16966 DAG.getConstant(C, dl, MVT::i32));
16967
16968 if (IntBits < FloatBits)
16969 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16970
16971 return FixConv;
16972}
16973
16974static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16975 const ARMSubtarget *Subtarget) {
16976 if (!Subtarget->hasMVEFloatOps())
16977 return SDValue();
16978
16979 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16980 // The second form can be more easily turned into a predicated vadd, and
16981 // possibly combined into a fma to become a predicated vfma.
16982 SDValue Op0 = N->getOperand(0);
16983 SDValue Op1 = N->getOperand(1);
16984 EVT VT = N->getValueType(0);
16985 SDLoc DL(N);
16986
16987 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16988 // which these VMOV's represent.
16989 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16990 if (Op.getOpcode() != ISD::BITCAST ||
16991 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16992 return false;
16993 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16994 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16995 return true;
16996 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16997 return true;
16998 return false;
16999 };
17000
17001 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
17002 std::swap(Op0, Op1);
17003
17004 if (Op1.getOpcode() != ISD::VSELECT)
17005 return SDValue();
17006
17007 SDNodeFlags FaddFlags = N->getFlags();
17008 bool NSZ = FaddFlags.hasNoSignedZeros();
17009 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
17010 return SDValue();
17011
17012 SDValue FAdd =
17013 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
17014 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
17015}
17016
17017static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
17018 SDValue LHS = N->getOperand(0);
17019 SDValue RHS = N->getOperand(1);
17020 EVT VT = N->getValueType(0);
17021 SDLoc DL(N);
17022
17023 if (!N->getFlags().hasAllowReassociation())
17024 return SDValue();
17025
17026 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
17027 auto ReassocComplex = [&](SDValue A, SDValue B) {
17028 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17029 return SDValue();
17030 unsigned Opc = A.getConstantOperandVal(0);
17031 if (Opc != Intrinsic::arm_mve_vcmlaq)
17032 return SDValue();
17033 SDValue VCMLA = DAG.getNode(
17034 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
17035 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
17036 A.getOperand(3), A.getOperand(4));
17037 VCMLA->setFlags(A->getFlags());
17038 return VCMLA;
17039 };
17040 if (SDValue R = ReassocComplex(LHS, RHS))
17041 return R;
17042 if (SDValue R = ReassocComplex(RHS, LHS))
17043 return R;
17044
17045 return SDValue();
17046}
17047
17048static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17049 const ARMSubtarget *Subtarget) {
17050 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17051 return S;
17052 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17053 return S;
17054 return SDValue();
17055}
17056
17057/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17058/// can replace combinations of VCVT (integer to floating-point) and VMUL
17059/// when the VMUL has a constant operand that is a power of 2.
17060///
17061/// Example (assume d17 = <float 0.125, float 0.125>):
17062/// vcvt.f32.s32 d16, d16
17063/// vmul.f32 d16, d16, d17
17064/// becomes:
17065/// vcvt.f32.s32 d16, d16, #3
17066static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG,
17067 const ARMSubtarget *Subtarget) {
17068 if (!Subtarget->hasNEON())
17069 return SDValue();
17070
17071 SDValue Op = N->getOperand(0);
17072 unsigned OpOpcode = Op.getNode()->getOpcode();
17073 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17074 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17075 return SDValue();
17076
17077 SDValue ConstVec = N->getOperand(1);
17078 if (!isa<BuildVectorSDNode>(ConstVec))
17079 return SDValue();
17080
17081 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17082 uint32_t FloatBits = FloatTy.getSizeInBits();
17083 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17084 uint32_t IntBits = IntTy.getSizeInBits();
17085 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17086 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17087 // These instructions only exist converting from i32 to f32. We can handle
17088 // smaller integers by generating an extra extend, but larger ones would
17089 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17090 // these instructions only support v2i32/v4i32 types.
17091 return SDValue();
17092 }
17093
17094 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17095 APFloat Recip(0.0f);
17096 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17097 return SDValue();
17098
17099 bool IsExact;
17100 APSInt IntVal(33);
17101 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17102 APFloat::opOK ||
17103 !IsExact)
17104 return SDValue();
17105
17106 int32_t C = IntVal.exactLogBase2();
17107 if (C == -1 || C == 0 || C > 32)
17108 return SDValue();
17109
17110 SDLoc DL(N);
17111 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17112 SDValue ConvInput = Op.getOperand(0);
17113 if (IntBits < FloatBits)
17114 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17115 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17116
17117 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17118 : Intrinsic::arm_neon_vcvtfxu2fp;
17119 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17120 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17121 DAG.getConstant(C, DL, MVT::i32));
17122}
17123
17124static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
17125 const ARMSubtarget *ST) {
17126 if (!ST->hasMVEIntegerOps())
17127 return SDValue();
17128
17129 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17130 EVT ResVT = N->getValueType(0);
17131 SDValue N0 = N->getOperand(0);
17132 SDLoc dl(N);
17133
17134 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17135 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17136 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17137 N0.getValueType() == MVT::v16i8)) {
17138 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17139 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17140 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17141 }
17142
17143 // We are looking for something that will have illegal types if left alone,
17144 // but that we can convert to a single instruction under MVE. For example
17145 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17146 // or
17147 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17148
17149 // The legal cases are:
17150 // VADDV u/s 8/16/32
17151 // VMLAV u/s 8/16/32
17152 // VADDLV u/s 32
17153 // VMLALV u/s 16/32
17154
17155 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17156 // extend it and use v4i32 instead.
17157 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17158 EVT AVT = A.getValueType();
17159 return any_of(ExtTypes, [&](MVT Ty) {
17160 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17161 AVT.bitsLE(Ty);
17162 });
17163 };
17164 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17165 EVT AVT = A.getValueType();
17166 if (!AVT.is128BitVector())
17167 A = DAG.getNode(ExtendCode, dl,
17168 AVT.changeVectorElementType(MVT::getIntegerVT(
17169 128 / AVT.getVectorMinNumElements())),
17170 A);
17171 return A;
17172 };
17173 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17174 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17175 return SDValue();
17176 SDValue A = N0->getOperand(0);
17177 if (ExtTypeMatches(A, ExtTypes))
17178 return ExtendIfNeeded(A, ExtendCode);
17179 return SDValue();
17180 };
17181 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17182 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17183 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17184 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17185 return SDValue();
17186 Mask = N0->getOperand(0);
17187 SDValue Ext = N0->getOperand(1);
17188 if (Ext->getOpcode() != ExtendCode)
17189 return SDValue();
17190 SDValue A = Ext->getOperand(0);
17191 if (ExtTypeMatches(A, ExtTypes))
17192 return ExtendIfNeeded(A, ExtendCode);
17193 return SDValue();
17194 };
17195 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17196 SDValue &A, SDValue &B) {
17197 // For a vmla we are trying to match a larger pattern:
17198 // ExtA = sext/zext A
17199 // ExtB = sext/zext B
17200 // Mul = mul ExtA, ExtB
17201 // vecreduce.add Mul
17202 // There might also be an extra extend between the mul and the addreduce, so
17203 // long as the bitwidth is high enough to make them equivalent (for example
17204 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17205 if (ResVT != RetTy)
17206 return false;
17207 SDValue Mul = N0;
17208 if (Mul->getOpcode() == ExtendCode &&
17209 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17210 ResVT.getScalarSizeInBits())
17211 Mul = Mul->getOperand(0);
17212 if (Mul->getOpcode() != ISD::MUL)
17213 return false;
17214 SDValue ExtA = Mul->getOperand(0);
17215 SDValue ExtB = Mul->getOperand(1);
17216 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17217 return false;
17218 A = ExtA->getOperand(0);
17219 B = ExtB->getOperand(0);
17220 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17221 A = ExtendIfNeeded(A, ExtendCode);
17222 B = ExtendIfNeeded(B, ExtendCode);
17223 return true;
17224 }
17225 return false;
17226 };
17227 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17228 SDValue &A, SDValue &B, SDValue &Mask) {
17229 // Same as the pattern above with a select for the zero predicated lanes
17230 // ExtA = sext/zext A
17231 // ExtB = sext/zext B
17232 // Mul = mul ExtA, ExtB
17233 // N0 = select Mask, Mul, 0
17234 // vecreduce.add N0
17235 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17236 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17237 return false;
17238 Mask = N0->getOperand(0);
17239 SDValue Mul = N0->getOperand(1);
17240 if (Mul->getOpcode() == ExtendCode &&
17241 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17242 ResVT.getScalarSizeInBits())
17243 Mul = Mul->getOperand(0);
17244 if (Mul->getOpcode() != ISD::MUL)
17245 return false;
17246 SDValue ExtA = Mul->getOperand(0);
17247 SDValue ExtB = Mul->getOperand(1);
17248 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17249 return false;
17250 A = ExtA->getOperand(0);
17251 B = ExtB->getOperand(0);
17252 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17253 A = ExtendIfNeeded(A, ExtendCode);
17254 B = ExtendIfNeeded(B, ExtendCode);
17255 return true;
17256 }
17257 return false;
17258 };
17259 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17260 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17261 // reductions. The operands are extended with MVEEXT, but as they are
17262 // reductions the lane orders do not matter. MVEEXT may be combined with
17263 // loads to produce two extending loads, or else they will be expanded to
17264 // VREV/VMOVL.
17265 EVT VT = Ops[0].getValueType();
17266 if (VT == MVT::v16i8) {
17267 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17268 "Unexpected illegal long reduction opcode");
17269 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17270
17271 SDValue Ext0 =
17272 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17273 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17274 SDValue Ext1 =
17275 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17276 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17277
17278 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17279 Ext0, Ext1);
17280 SDValue MLA1 =
17281 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17282 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17283 Ext0.getValue(1), Ext1.getValue(1));
17284 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17285 }
17286 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17287 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17288 SDValue(Node.getNode(), 1));
17289 };
17290
17291 SDValue A, B;
17292 SDValue Mask;
17293 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17294 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17295 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17296 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17297 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17298 A, B))
17299 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17300 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17301 A, B))
17302 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17303 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17304 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17305 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17306 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17307 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17308 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17309
17310 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17311 Mask))
17312 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17313 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17314 Mask))
17315 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17316 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17317 Mask))
17318 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17319 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17320 Mask))
17321 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17322 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17323 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17324 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17325 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17326 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17327 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17328
17329 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17330 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17331 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17332 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17333 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17334 return Create64bitNode(ARMISD::VADDLVs, {A});
17335 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17336 return Create64bitNode(ARMISD::VADDLVu, {A});
17337 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17338 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17339 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17340 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17341 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17342 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17343
17344 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17345 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17346 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17347 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17348 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17349 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17350 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17351 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17352 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17353 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17354 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17355 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17356 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17357 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17358
17359 // Some complications. We can get a case where the two inputs of the mul are
17360 // the same, then the output sext will have been helpfully converted to a
17361 // zext. Turn it back.
17362 SDValue Op = N0;
17363 if (Op->getOpcode() == ISD::VSELECT)
17364 Op = Op->getOperand(1);
17365 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17366 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17367 SDValue Mul = Op->getOperand(0);
17368 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17369 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17370 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17371 if (Op != N0)
17372 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17373 N0->getOperand(0), Ext, N0->getOperand(2));
17374 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17375 }
17376 }
17377
17378 return SDValue();
17379}
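// Worked example for the reduction matching above (illustrative only, not
// from the original source):
//   %xa = sext <16 x i8> %a to <16 x i32>
//   %xb = sext <16 x i8> %b to <16 x i32>
//   %m  = mul <16 x i32> %xa, %xb
//   %r  = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
// would need illegal v16i32 arithmetic if left alone, but it matches the
// IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {v8i16, v16i8}, ...) pattern and is
// selected as a single
//   vmladav.s8 r0, q0, q1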
17380
17381// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17382// the lanes are used. Due to the reduction being commutative the shuffle can be
17383// removed.
17384static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17385 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17386 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17387 if (!Shuf || !Shuf->getOperand(1).isUndef())
17388 return SDValue();
17389
17390 // Check all elements are used once in the mask.
17391 ArrayRef<int> Mask = Shuf->getMask();
17392 APInt SetElts(Mask.size(), 0);
17393 for (int E : Mask) {
17394 if (E < 0 || E >= (int)Mask.size())
17395 return SDValue();
17396 SetElts.setBit(E);
17397 }
17398 if (!SetElts.isAllOnes())
17399 return SDValue();
17400
17401 if (N->getNumOperands() != VecOp + 1) {
17402 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17403 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17404 return SDValue();
17405 }
17406
17407 SmallVector<SDValue> Ops;
17408 for (SDValue Op : N->ops()) {
17409 if (Op.getValueType().isVector())
17410 Ops.push_back(Op.getOperand(0));
17411 else
17412 Ops.push_back(Op);
17413 }
17414 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17415}
17416
17417static SDValue PerformVMOVNCombine(SDNode *N,
17418 TargetLowering::DAGCombinerInfo &DCI) {
17419 SDValue Op0 = N->getOperand(0);
17420 SDValue Op1 = N->getOperand(1);
17421 unsigned IsTop = N->getConstantOperandVal(2);
17422
17423 // VMOVNT a undef -> a
17424 // VMOVNB a undef -> a
17425 // VMOVNB undef a -> a
17426 if (Op1->isUndef())
17427 return Op0;
17428 if (Op0->isUndef() && !IsTop)
17429 return Op1;
17430
17431 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17432 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
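// This holds because VMOVN only reads the bottom lanes of its second operand,
// and those are exactly the lanes the inner VQMOVNb has just written with its
// saturated, narrowed result; narrowing b directly into c gives the same
// lanes without the intermediate node.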
17433 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17434 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17435 Op1->getConstantOperandVal(2) == 0)
17436 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17437 Op0, Op1->getOperand(1), N->getOperand(2));
17438
17439 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17440 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17441 // into the top or bottom lanes.
17442 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17443 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17444 APInt Op0DemandedElts =
17445 IsTop ? Op1DemandedElts
17446 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17447
17448 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17449 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17450 return SDValue(N, 0);
17451 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17452 return SDValue(N, 0);
17453
17454 return SDValue();
17455}
17456
17457static SDValue PerformVQMOVNCombine(SDNode *N,
17458 TargetLowering::DAGCombinerInfo &DCI) {
17459 SDValue Op0 = N->getOperand(0);
17460 unsigned IsTop = N->getConstantOperandVal(2);
17461
17462 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17463 APInt Op0DemandedElts =
17464 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17465 : APInt::getHighBitsSet(2, 1));
17466
17467 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17468 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17469 return SDValue(N, 0);
17470 return SDValue();
17471}
17472
17473static SDValue PerformVQDMULHCombine(SDNode *N,
17474 TargetLowering::DAGCombinerInfo &DCI) {
17475 EVT VT = N->getValueType(0);
17476 SDValue LHS = N->getOperand(0);
17477 SDValue RHS = N->getOperand(1);
17478
17479 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17480 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17481 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17482 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17483 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17484 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17485 SDLoc DL(N);
17486 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17487 LHS.getOperand(0), RHS.getOperand(0));
17488 SDValue UndefV = LHS.getOperand(1);
17489 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17490 }
17491 return SDValue();
17492}
17493
17494static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17495 SDLoc DL(N);
17496 SDValue Op0 = N->getOperand(0);
17497 SDValue Op1 = N->getOperand(1);
17498
17499 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17500 // uses of the intrinsics.
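// For example LSLL(x, -3), which the MVE shift intrinsics can produce, is
// rewritten as LSRL(x, 3); a shift amount of 0 simply forwards both inputs.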
17501 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17502 int ShiftAmt = C->getSExtValue();
17503 if (ShiftAmt == 0) {
17504 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17505 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17506 return SDValue();
17507 }
17508
17509 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17510 unsigned NewOpcode =
17511 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17512 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17513 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17514 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17515 return NewShift;
17516 }
17517 }
17518
17519 return SDValue();
17520}
17521
17522/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17523SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
17524 DAGCombinerInfo &DCI) const {
17525 SelectionDAG &DAG = DCI.DAG;
17526 unsigned IntNo = N->getConstantOperandVal(0);
17527 switch (IntNo) {
17528 default:
17529 // Don't do anything for most intrinsics.
17530 break;
17531
17532 // Vector shifts: check for immediate versions and lower them.
17533 // Note: This is done during DAG combining instead of DAG legalizing because
17534 // the build_vectors for 64-bit vector element shift counts are generally
17535 // not legal, and it is hard to see their values after they get legalized to
17536 // loads from a constant pool.
17537 case Intrinsic::arm_neon_vshifts:
17538 case Intrinsic::arm_neon_vshiftu:
17539 case Intrinsic::arm_neon_vrshifts:
17540 case Intrinsic::arm_neon_vrshiftu:
17541 case Intrinsic::arm_neon_vrshiftn:
17542 case Intrinsic::arm_neon_vqshifts:
17543 case Intrinsic::arm_neon_vqshiftu:
17544 case Intrinsic::arm_neon_vqshiftsu:
17545 case Intrinsic::arm_neon_vqshiftns:
17546 case Intrinsic::arm_neon_vqshiftnu:
17547 case Intrinsic::arm_neon_vqshiftnsu:
17548 case Intrinsic::arm_neon_vqrshiftns:
17549 case Intrinsic::arm_neon_vqrshiftnu:
17550 case Intrinsic::arm_neon_vqrshiftnsu: {
17551 EVT VT = N->getOperand(1).getValueType();
17552 int64_t Cnt;
17553 unsigned VShiftOpc = 0;
17554
17555 switch (IntNo) {
17556 case Intrinsic::arm_neon_vshifts:
17557 case Intrinsic::arm_neon_vshiftu:
17558 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17559 VShiftOpc = ARMISD::VSHLIMM;
17560 break;
17561 }
17562 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17563 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17564 : ARMISD::VSHRuIMM);
17565 break;
17566 }
17567 return SDValue();
17568
17569 case Intrinsic::arm_neon_vrshifts:
17570 case Intrinsic::arm_neon_vrshiftu:
17571 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17572 break;
17573 return SDValue();
17574
17575 case Intrinsic::arm_neon_vqshifts:
17576 case Intrinsic::arm_neon_vqshiftu:
17577 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17578 break;
17579 return SDValue();
17580
17581 case Intrinsic::arm_neon_vqshiftsu:
17582 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17583 break;
17584 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17585
17586 case Intrinsic::arm_neon_vrshiftn:
17587 case Intrinsic::arm_neon_vqshiftns:
17588 case Intrinsic::arm_neon_vqshiftnu:
17589 case Intrinsic::arm_neon_vqshiftnsu:
17590 case Intrinsic::arm_neon_vqrshiftns:
17591 case Intrinsic::arm_neon_vqrshiftnu:
17592 case Intrinsic::arm_neon_vqrshiftnsu:
17593 // Narrowing shifts require an immediate right shift.
17594 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17595 break;
17596 llvm_unreachable("invalid shift count for narrowing vector shift "
17597 "intrinsic");
17598
17599 default:
17600 llvm_unreachable("unhandled vector shift");
17601 }
17602
17603 switch (IntNo) {
17604 case Intrinsic::arm_neon_vshifts:
17605 case Intrinsic::arm_neon_vshiftu:
17606 // Opcode already set above.
17607 break;
17608 case Intrinsic::arm_neon_vrshifts:
17609 VShiftOpc = ARMISD::VRSHRsIMM;
17610 break;
17611 case Intrinsic::arm_neon_vrshiftu:
17612 VShiftOpc = ARMISD::VRSHRuIMM;
17613 break;
17614 case Intrinsic::arm_neon_vrshiftn:
17615 VShiftOpc = ARMISD::VRSHRNIMM;
17616 break;
17617 case Intrinsic::arm_neon_vqshifts:
17618 VShiftOpc = ARMISD::VQSHLsIMM;
17619 break;
17620 case Intrinsic::arm_neon_vqshiftu:
17621 VShiftOpc = ARMISD::VQSHLuIMM;
17622 break;
17623 case Intrinsic::arm_neon_vqshiftsu:
17624 VShiftOpc = ARMISD::VQSHLsuIMM;
17625 break;
17626 case Intrinsic::arm_neon_vqshiftns:
17627 VShiftOpc = ARMISD::VQSHRNsIMM;
17628 break;
17629 case Intrinsic::arm_neon_vqshiftnu:
17630 VShiftOpc = ARMISD::VQSHRNuIMM;
17631 break;
17632 case Intrinsic::arm_neon_vqshiftnsu:
17633 VShiftOpc = ARMISD::VQSHRNsuIMM;
17634 break;
17635 case Intrinsic::arm_neon_vqrshiftns:
17636 VShiftOpc = ARMISD::VQRSHRNsIMM;
17637 break;
17638 case Intrinsic::arm_neon_vqrshiftnu:
17639 VShiftOpc = ARMISD::VQRSHRNuIMM;
17640 break;
17641 case Intrinsic::arm_neon_vqrshiftnsu:
17642 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17643 break;
17644 }
17645
17646 SDLoc dl(N);
17647 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17648 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17649 }
17650
17651 case Intrinsic::arm_neon_vshiftins: {
17652 EVT VT = N->getOperand(1).getValueType();
17653 int64_t Cnt;
17654 unsigned VShiftOpc = 0;
17655
17656 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17657 VShiftOpc = ARMISD::VSLIIMM;
17658 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17659 VShiftOpc = ARMISD::VSRIIMM;
17660 else {
17661 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17662 }
17663
17664 SDLoc dl(N);
17665 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17666 N->getOperand(1), N->getOperand(2),
17667 DAG.getConstant(Cnt, dl, MVT::i32));
17668 }
17669
17670 case Intrinsic::arm_neon_vqrshifts:
17671 case Intrinsic::arm_neon_vqrshiftu:
17672 // No immediate versions of these to check for.
17673 break;
17674
17675 case Intrinsic::arm_neon_vbsl: {
17676 SDLoc dl(N);
17677 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17678 N->getOperand(2), N->getOperand(3));
17679 }
17680 case Intrinsic::arm_mve_vqdmlah:
17681 case Intrinsic::arm_mve_vqdmlash:
17682 case Intrinsic::arm_mve_vqrdmlah:
17683 case Intrinsic::arm_mve_vqrdmlash:
17684 case Intrinsic::arm_mve_vmla_n_predicated:
17685 case Intrinsic::arm_mve_vmlas_n_predicated:
17686 case Intrinsic::arm_mve_vqdmlah_predicated:
17687 case Intrinsic::arm_mve_vqdmlash_predicated:
17688 case Intrinsic::arm_mve_vqrdmlah_predicated:
17689 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17690 // These intrinsics all take an i32 scalar operand which is narrowed to the
17691 // size of a single lane of the vector type they return. So we don't need
17692 // any bits of that operand above that point, which allows us to eliminate
17693 // uxth/sxth.
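// For example, with a v8i16 result only the low 16 bits of the i32 scalar are
// demanded, so a preceding sxth/uxth of that scalar becomes dead and is
// stripped by SimplifyDemandedBits.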
17694 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17695 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17696 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17697 return SDValue();
17698 break;
17699 }
17700
17701 case Intrinsic::arm_mve_minv:
17702 case Intrinsic::arm_mve_maxv:
17703 case Intrinsic::arm_mve_minav:
17704 case Intrinsic::arm_mve_maxav:
17705 case Intrinsic::arm_mve_minv_predicated:
17706 case Intrinsic::arm_mve_maxv_predicated:
17707 case Intrinsic::arm_mve_minav_predicated:
17708 case Intrinsic::arm_mve_maxav_predicated: {
17709 // These intrinsics all take an i32 scalar operand which is narrowed to the
17710 // size of a single lane of the vector type they take as the other input.
17711 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17712 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17713 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17714 return SDValue();
17715 break;
17716 }
17717
17718 case Intrinsic::arm_mve_addv: {
17719 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17720 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17721 bool Unsigned = N->getConstantOperandVal(2);
17722 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17723 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17724 }
17725
17726 case Intrinsic::arm_mve_addlv:
17727 case Intrinsic::arm_mve_addlv_predicated: {
17728 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17729 // which recombines the two outputs into an i64
17730 bool Unsigned = N->getConstantOperandVal(2);
17731 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17732 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17733 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17734
17735 SmallVector<SDValue, 4> Ops;
17736 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17737 if (i != 2) // skip the unsigned flag
17738 Ops.push_back(N->getOperand(i));
17739
17740 SDLoc dl(N);
17741 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17742 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17743 val.getValue(1));
17744 }
17745 }
17746
17747 return SDValue();
17748}
17749
17750/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17751/// lowers them. As with the vector shift intrinsics, this is done during DAG
17752/// combining instead of DAG legalizing because the build_vectors for 64-bit
17753/// vector element shift counts are generally not legal, and it is hard to see
17754/// their values after they get legalized to loads from a constant pool.
17755static SDValue PerformShiftCombine(SDNode *N,
17756 TargetLowering::DAGCombinerInfo &DCI,
17757 const ARMSubtarget *ST) {
17758 SelectionDAG &DAG = DCI.DAG;
17759 EVT VT = N->getValueType(0);
17760
17761 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17762 N->getOperand(0)->getOpcode() == ISD::AND &&
17763 N->getOperand(0)->hasOneUse()) {
17764 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17765 return SDValue();
17766 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17767 // usually show up because instcombine prefers to canonicalize it to
17768 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17769 // out of GEP lowering in some cases.
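// For example (shl (and x, 0x3ff), 2) becomes (srl (shl x, 22), 20): the same
// value, but computed with two shifts rather than materializing the mask
// constant for the AND.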
17770 SDValue N0 = N->getOperand(0);
17771 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17772 if (!ShiftAmtNode)
17773 return SDValue();
17774 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17775 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17776 if (!AndMaskNode)
17777 return SDValue();
17778 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17779 // Don't transform uxtb/uxth.
17780 if (AndMask == 255 || AndMask == 65535)
17781 return SDValue();
17782 if (isMask_32(AndMask)) {
17783 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17784 if (MaskedBits > ShiftAmt) {
17785 SDLoc DL(N);
17786 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17787 DAG.getConstant(MaskedBits, DL, MVT::i32));
17788 return DAG.getNode(
17789 ISD::SRL, DL, MVT::i32, SHL,
17790 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17791 }
17792 }
17793 }
17794
17795 // Nothing to be done for scalar shifts.
17796 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17797 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17798 return SDValue();
17799 if (ST->hasMVEIntegerOps())
17800 return SDValue();
17801
17802 int64_t Cnt;
17803
17804 switch (N->getOpcode()) {
17805 default: llvm_unreachable("unexpected shift opcode");
17806
17807 case ISD::SHL:
17808 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17809 SDLoc dl(N);
17810 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17811 DAG.getConstant(Cnt, dl, MVT::i32));
17812 }
17813 break;
17814
17815 case ISD::SRA:
17816 case ISD::SRL:
17817 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17818 unsigned VShiftOpc =
17819 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17820 SDLoc dl(N);
17821 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17822 DAG.getConstant(Cnt, dl, MVT::i32));
17823 }
17824 }
17825 return SDValue();
17826}
17827
17828// Look for a sign, zero or fp extend of a larger-than-legal load. This can be
17829// split into multiple extending loads, which are simpler to deal with than an
17830// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17831// to convert the type to an f32.
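// For example a zext of a v8i8 load to v8i32 becomes two v4i8->v4i32
// zextloads at byte offsets 0 and 4, concatenated back together; the f16 case
// additionally converts each narrow load to f32 with a VCVTL of the bottom
// lanes.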
17832static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
17833 SDValue N0 = N->getOperand(0);
17834 if (N0.getOpcode() != ISD::LOAD)
17835 return SDValue();
17836 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
17837 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17838 LD->getExtensionType() != ISD::NON_EXTLOAD)
17839 return SDValue();
17840 EVT FromVT = LD->getValueType(0);
17841 EVT ToVT = N->getValueType(0);
17842 if (!ToVT.isVector())
17843 return SDValue();
17845 EVT ToEltVT = ToVT.getVectorElementType();
17846 EVT FromEltVT = FromVT.getVectorElementType();
17847
17848 unsigned NumElements = 0;
17849 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17850 NumElements = 4;
17851 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17852 NumElements = 4;
17853 if (NumElements == 0 ||
17854 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17855 FromVT.getVectorNumElements() % NumElements != 0 ||
17856 !isPowerOf2_32(NumElements))
17857 return SDValue();
17858
17859 LLVMContext &C = *DAG.getContext();
17860 SDLoc DL(LD);
17861 // Details about the old load
17862 SDValue Ch = LD->getChain();
17863 SDValue BasePtr = LD->getBasePtr();
17864 Align Alignment = LD->getBaseAlign();
17865 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17866 AAMDNodes AAInfo = LD->getAAInfo();
17867
17868 ISD::LoadExtType NewExtType =
17869 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17870 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17871 EVT NewFromVT = EVT::getVectorVT(
17872 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17873 EVT NewToVT = EVT::getVectorVT(
17874 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17875
17876 SmallVector<SDValue, 4> Loads;
17877 SmallVector<SDValue, 4> Chains;
17878 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17879 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17880 SDValue NewPtr =
17881 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17882
17883 SDValue NewLoad =
17884 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17885 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17886 Alignment, MMOFlags, AAInfo);
17887 Loads.push_back(NewLoad);
17888 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17889 }
17890
17891 // Float truncs need to be extended with VCVTBs into their floating point types.
17892 if (FromEltVT == MVT::f16) {
17893 SmallVector<SDValue, 4> Extends;
17894
17895 for (unsigned i = 0; i < Loads.size(); i++) {
17896 SDValue LoadBC =
17897 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17898 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17899 DAG.getConstant(0, DL, MVT::i32));
17900 Extends.push_back(FPExt);
17901 }
17902
17903 Loads = Extends;
17904 }
17905
17906 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17907 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17908 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17909}
17910
17911/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17912/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17913static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
17914 const ARMSubtarget *ST) {
17915 SDValue N0 = N->getOperand(0);
17916
17917 // Check for sign- and zero-extensions of vector extract operations of 8- and
17918 // 16-bit vector elements. NEON and MVE support these directly. They are
17919 // handled during DAG combining because type legalization will promote them
17920 // to 32-bit types and it is messy to recognize the operations after that.
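// For example sext(extract_vector_elt(v8i16 x, lane)) becomes a single
// VGETLANEs node, a lane move that also performs the sign extension, instead
// of an extract followed by a separate sxth.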
17921 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17922 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17923 SDValue Vec = N0.getOperand(0);
17924 SDValue Lane = N0.getOperand(1);
17925 EVT VT = N->getValueType(0);
17926 EVT EltVT = N0.getValueType();
17927 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17928
17929 if (VT == MVT::i32 &&
17930 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17931 TLI.isTypeLegal(Vec.getValueType()) &&
17932 isa<ConstantSDNode>(Lane)) {
17933
17934 unsigned Opc = 0;
17935 switch (N->getOpcode()) {
17936 default: llvm_unreachable("unexpected opcode");
17937 case ISD::SIGN_EXTEND:
17938 Opc = ARMISD::VGETLANEs;
17939 break;
17940 case ISD::ZERO_EXTEND:
17941 case ISD::ANY_EXTEND:
17942 Opc = ARMISD::VGETLANEu;
17943 break;
17944 }
17945 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17946 }
17947 }
17948
17949 if (ST->hasMVEIntegerOps())
17950 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17951 return NewLoad;
17952
17953 return SDValue();
17954}
17955
17956static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
17957 const ARMSubtarget *ST) {
17958 if (ST->hasMVEFloatOps())
17959 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17960 return NewLoad;
17961
17962 return SDValue();
17963}
17964
17965// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17966// constant bounds.
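// For example smin(smax(x, -128), 127) clamps x to a signed 8-bit range and
// becomes an SSAT, while smin(smax(x, 0), 255) clamps to an unsigned 8-bit
// range and becomes a USAT.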
17967static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17968 const ARMSubtarget *Subtarget) {
17969 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17970 !Subtarget->isThumb2())
17971 return SDValue();
17972
17973 EVT VT = Op.getValueType();
17974 SDValue Op0 = Op.getOperand(0);
17975
17976 if (VT != MVT::i32 ||
17977 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17978 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17979 !isa<ConstantSDNode>(Op0.getOperand(1)))
17980 return SDValue();
17981
17982 SDValue Min = Op;
17983 SDValue Max = Op0;
17984 SDValue Input = Op0.getOperand(0);
17985 if (Min.getOpcode() == ISD::SMAX)
17986 std::swap(Min, Max);
17987
17988 APInt MinC = Min.getConstantOperandAPInt(1);
17989 APInt MaxC = Max.getConstantOperandAPInt(1);
17990
17991 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17992 !(MinC + 1).isPowerOf2())
17993 return SDValue();
17994
17995 SDLoc DL(Op);
17996 if (MinC == ~MaxC)
17997 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17998 DAG.getConstant(MinC.countr_one(), DL, VT));
17999 if (MaxC == 0)
18000 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
18001 DAG.getConstant(MinC.countr_one(), DL, VT));
18002
18003 return SDValue();
18004}
18005
18006/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
18007/// saturates.
18008static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
18009 const ARMSubtarget *ST) {
18010 EVT VT = N->getValueType(0);
18011 SDValue N0 = N->getOperand(0);
18012
18013 if (VT == MVT::i32)
18014 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
18015
18016 if (!ST->hasMVEIntegerOps())
18017 return SDValue();
18018
18019 if (SDValue V = PerformVQDMULHCombine(N, DAG))
18020 return V;
18021
18022 if (VT != MVT::v4i32 && VT != MVT::v8i16)
18023 return SDValue();
18024
18025 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
18026 // Check one is a smin and the other is a smax
18027 if (Min->getOpcode() != ISD::SMIN)
18028 std::swap(Min, Max);
18029 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
18030 return false;
18031
18032 APInt SaturateC;
18033 if (VT == MVT::v4i32)
18034 SaturateC = APInt(32, (1 << 15) - 1, true);
18035 else //if (VT == MVT::v8i16)
18036 SaturateC = APInt(16, (1 << 7) - 1, true);
18037
18038 APInt MinC, MaxC;
18039 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18040 MinC != SaturateC)
18041 return false;
18042 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18043 MaxC != ~SaturateC)
18044 return false;
18045 return true;
18046 };
18047
18048 if (IsSignedSaturate(N, N0.getNode())) {
18049 SDLoc DL(N);
18050 MVT ExtVT, HalfVT;
18051 if (VT == MVT::v4i32) {
18052 HalfVT = MVT::v8i16;
18053 ExtVT = MVT::v4i16;
18054 } else { // if (VT == MVT::v8i16)
18055 HalfVT = MVT::v16i8;
18056 ExtVT = MVT::v8i8;
18057 }
18058
18059 // Create a VQMOVNB with undef top lanes, then sign extend it into the top
18060 // half. That extend will hopefully be removed if only the bottom bits are
18061 // demanded (through a truncating store, for example).
18062 SDValue VQMOVN =
18063 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18064 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18065 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18066 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18067 DAG.getValueType(ExtVT));
18068 }
18069
18070 auto IsUnsignedSaturate = [&](SDNode *Min) {
18071 // For unsigned, we just need to check for <= 0xffff
18072 if (Min->getOpcode() != ISD::UMIN)
18073 return false;
18074
18075 APInt SaturateC;
18076 if (VT == MVT::v4i32)
18077 SaturateC = APInt(32, (1 << 16) - 1, true);
18078 else //if (VT == MVT::v8i16)
18079 SaturateC = APInt(16, (1 << 8) - 1, true);
18080
18081 APInt MinC;
18082 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18083 MinC != SaturateC)
18084 return false;
18085 return true;
18086 };
18087
18088 if (IsUnsignedSaturate(N)) {
18089 SDLoc DL(N);
18090 MVT HalfVT;
18091 unsigned ExtConst;
18092 if (VT == MVT::v4i32) {
18093 HalfVT = MVT::v8i16;
18094 ExtConst = 0x0000FFFF;
18095 } else { //if (VT == MVT::v8i16)
18096 HalfVT = MVT::v16i8;
18097 ExtConst = 0x00FF;
18098 }
18099
18100 // Create a VQMOVNB with undef top lanes, then zero extend it into the top
18101 // half with an AND. That extend will hopefully be removed if only the bottom
18102 // bits are demanded (through a truncating store, for example).
18103 SDValue VQMOVN =
18104 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18105 DAG.getConstant(0, DL, MVT::i32));
18106 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18107 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18108 DAG.getConstant(ExtConst, DL, VT));
18109 }
18110
18111 return SDValue();
18112}
18113
18114static const APInt *isPowerOf2Constant(SDValue V) {
18115 const auto *C = dyn_cast<ConstantSDNode>(V);
18116 if (!C)
18117 return nullptr;
18118 const APInt *CV = &C->getAPIntValue();
18119 return CV->isPowerOf2() ? CV : nullptr;
18120}
18121
18122SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
18123 // If we have a CMOV, OR and AND combination such as:
18124 // if (x & CN)
18125 // y |= CM;
18126 //
18127 // And:
18128 // * CN is a single bit;
18129 // * All bits covered by CM are known zero in y
18130 //
18131 // Then we can convert this into a sequence of BFI instructions. This will
18132 // always be a win if CM is a single bit, will always be no worse than the
18133 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18134 // three bits (due to the extra IT instruction).
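// For example, for "if (x & 4) y |= 0x30;" with bits 4 and 5 of y known zero,
// x is shifted right by 2 so the tested bit lands in bit 0, and two BFIs then
// copy that bit into bits 4 and 5 of y.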
18135
18136 SDValue Op0 = CMOV->getOperand(0);
18137 SDValue Op1 = CMOV->getOperand(1);
18138 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18139 SDValue CmpZ = CMOV->getOperand(3);
18140
18141 // The compare must be against zero.
18142 if (!isNullConstant(CmpZ->getOperand(1)))
18143 return SDValue();
18144
18145 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18146 SDValue And = CmpZ->getOperand(0);
18147 if (And->getOpcode() != ISD::AND)
18148 return SDValue();
18149 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18150 if (!AndC)
18151 return SDValue();
18152 SDValue X = And->getOperand(0);
18153
18154 if (CC == ARMCC::EQ) {
18155 // We're performing an "equal to zero" compare. Swap the operands so we
18156 // canonicalize on a "not equal to zero" compare.
18157 std::swap(Op0, Op1);
18158 } else {
18159 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18160 }
18161
18162 if (Op1->getOpcode() != ISD::OR)
18163 return SDValue();
18164
18165 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
18166 if (!OrC)
18167 return SDValue();
18168 SDValue Y = Op1->getOperand(0);
18169
18170 if (Op0 != Y)
18171 return SDValue();
18172
18173 // Now, is it profitable to continue?
18174 APInt OrCI = OrC->getAPIntValue();
18175 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18176 if (OrCI.popcount() > Heuristic)
18177 return SDValue();
18178
18179 // Lastly, can we determine that the bits defined by OrCI
18180 // are zero in Y?
18181 KnownBits Known = DAG.computeKnownBits(Y);
18182 if ((OrCI & Known.Zero) != OrCI)
18183 return SDValue();
18184
18185 // OK, we can do the combine.
18186 SDValue V = Y;
18187 SDLoc dl(X);
18188 EVT VT = X.getValueType();
18189 unsigned BitInX = AndC->logBase2();
18190
18191 if (BitInX != 0) {
18192 // We must shift X first.
18193 X = DAG.getNode(ISD::SRL, dl, VT, X,
18194 DAG.getConstant(BitInX, dl, VT));
18195 }
18196
18197 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18198 BitInY < NumActiveBits; ++BitInY) {
18199 if (OrCI[BitInY] == 0)
18200 continue;
18201 APInt Mask(VT.getSizeInBits(), 0);
18202 Mask.setBit(BitInY);
18203 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18204 // Confusingly, the operand is an *inverted* mask.
18205 DAG.getConstant(~Mask, dl, VT));
18206 }
18207
18208 return V;
18209}
18210
18211// Given N, the value controlling the conditional branch, search for the loop
18212// intrinsic, returning it, along with how the value is used. We need to handle
18213// patterns such as the following:
18214// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18215// (brcond (setcc (loop.decrement), 0, eq), exit)
18216// (brcond (setcc (loop.decrement), 0, ne), header)
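// For example, in (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) the
// xor with 1 inverts the comparison; this is recorded in Negate so the caller
// can flip the condition code before deciding which way to branch.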
18217static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
18218 bool &Negate) {
18219 switch (N->getOpcode()) {
18220 default:
18221 break;
18222 case ISD::XOR: {
18223 if (!isa<ConstantSDNode>(N.getOperand(1)))
18224 return SDValue();
18225 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18226 return SDValue();
18227 Negate = !Negate;
18228 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18229 }
18230 case ISD::SETCC: {
18231 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18232 if (!Const)
18233 return SDValue();
18234 if (Const->isZero())
18235 Imm = 0;
18236 else if (Const->isOne())
18237 Imm = 1;
18238 else
18239 return SDValue();
18240 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18241 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18242 }
18243 case ISD::INTRINSIC_W_CHAIN: {
18244 unsigned IntOp = N.getConstantOperandVal(1);
18245 if (IntOp != Intrinsic::test_start_loop_iterations &&
18246 IntOp != Intrinsic::loop_decrement_reg)
18247 return SDValue();
18248 return N;
18249 }
18250 }
18251 return SDValue();
18252}
18253
18254static SDValue PerformHWLoopCombine(SDNode *N,
18255 TargetLowering::DAGCombinerInfo &DCI,
18256 const ARMSubtarget *ST) {
18257
18258 // The hwloop intrinsics that we're interested are used for control-flow,
18259 // either for entering or exiting the loop:
18260 // - test.start.loop.iterations will test whether its operand is zero. If it
18261 // is zero, the branch that follows should not enter the loop.
18262 // - loop.decrement.reg also tests whether its operand is zero. If it is
18263 // zero, the branch that follows should not branch back to the beginning of
18264 // the loop.
18265 // So here we need to check how the brcond uses the result of each of the
18266 // intrinsics, to ensure that we branch to the right place at the right
18267 // time.
18268
18269 ISD::CondCode CC;
18270 SDValue Cond;
18271 int Imm = 1;
18272 bool Negate = false;
18273 SDValue Chain = N->getOperand(0);
18274 SDValue Dest;
18275
18276 if (N->getOpcode() == ISD::BRCOND) {
18277 CC = ISD::SETEQ;
18278 Cond = N->getOperand(1);
18279 Dest = N->getOperand(2);
18280 } else {
18281 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18282 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18283 Cond = N->getOperand(2);
18284 Dest = N->getOperand(4);
18285 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18286 if (!Const->isOne() && !Const->isZero())
18287 return SDValue();
18288 Imm = Const->getZExtValue();
18289 } else
18290 return SDValue();
18291 }
18292
18293 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18294 if (!Int)
18295 return SDValue();
18296
18297 if (Negate)
18298 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18299
18300 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18301 return (CC == ISD::SETEQ && Imm == 0) ||
18302 (CC == ISD::SETNE && Imm == 1) ||
18303 (CC == ISD::SETLT && Imm == 1) ||
18304 (CC == ISD::SETULT && Imm == 1);
18305 };
18306
18307 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18308 return (CC == ISD::SETEQ && Imm == 1) ||
18309 (CC == ISD::SETNE && Imm == 0) ||
18310 (CC == ISD::SETGT && Imm == 0) ||
18311 (CC == ISD::SETUGT && Imm == 0) ||
18312 (CC == ISD::SETGE && Imm == 1) ||
18313 (CC == ISD::SETUGE && Imm == 1);
18314 };
18315
18316 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18317 "unsupported condition");
18318
18319 SDLoc dl(Int);
18320 SelectionDAG &DAG = DCI.DAG;
18321 SDValue Elements = Int.getOperand(2);
18322 unsigned IntOp = Int->getConstantOperandVal(1);
18323 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18324 "expected single br user");
18325 SDNode *Br = *N->user_begin();
18326 SDValue OtherTarget = Br->getOperand(1);
18327
18328 // Update the unconditional branch to branch to the given Dest.
18329 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18330 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18331 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18332 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18333 };
18334
18335 if (IntOp == Intrinsic::test_start_loop_iterations) {
18336 SDValue Res;
18337 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18338 // We expect this 'instruction' to branch when the counter is zero.
18339 if (IsTrueIfZero(CC, Imm)) {
18340 SDValue Ops[] = {Chain, Setup, Dest};
18341 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18342 } else {
18343 // The logic is the reverse of what we need for WLS, so find the other
18344 // basic block target: the target of the unconditional br that follows.
18345 UpdateUncondBr(Br, Dest, DAG);
18346
18347 SDValue Ops[] = {Chain, Setup, OtherTarget};
18348 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18349 }
18350 // Update LR count to the new value
18351 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18352 // Update chain
18353 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18354 return Res;
18355 } else {
18356 SDValue Size =
18357 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18358 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18359 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18360 DAG.getVTList(MVT::i32, MVT::Other), Args);
18361 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18362
18363 // We expect this instruction to branch when the count is not zero.
18364 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18365
18366 // Update the unconditional branch to target the loop preheader if we've
18367 // found the condition has been reversed.
18368 if (Target == OtherTarget)
18369 UpdateUncondBr(Br, Dest, DAG);
18370
18371 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18372 SDValue(LoopDec.getNode(), 1), Chain);
18373
18374 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18375 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18376 }
18377 return SDValue();
18378}
18379
18380/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18381SDValue
18382ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
18383 SDValue Cmp = N->getOperand(3);
18384 if (Cmp.getOpcode() != ARMISD::CMPZ)
18385 // Only looking at NE cases.
18386 return SDValue();
18387
18388 SDLoc dl(N);
18389 SDValue LHS = Cmp.getOperand(0);
18390 SDValue RHS = Cmp.getOperand(1);
18391 SDValue Chain = N->getOperand(0);
18392 SDValue BB = N->getOperand(1);
18393 SDValue ARMcc = N->getOperand(2);
18394 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18395
18396 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18397 // -> (brcond Chain BB CC Flags)
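// The CMOV materializes the flags as a 0/1 value and the AND/CMPZ immediately
// re-tests that value, so we can branch on the original flags directly and
// drop the materialization.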
18398 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18399 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18400 LHS->getOperand(0)->hasOneUse() &&
18401 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18402 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18403 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18404 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18405 LHS->getOperand(0)->getOperand(2),
18406 LHS->getOperand(0)->getOperand(3));
18407 }
18408
18409 return SDValue();
18410}
18411
18412/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18413SDValue
18414ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
18415 SDValue Cmp = N->getOperand(3);
18416 if (Cmp.getOpcode() != ARMISD::CMPZ)
18417 // Only looking at EQ and NE cases.
18418 return SDValue();
18419
18420 EVT VT = N->getValueType(0);
18421 SDLoc dl(N);
18422 SDValue LHS = Cmp.getOperand(0);
18423 SDValue RHS = Cmp.getOperand(1);
18424 SDValue FalseVal = N->getOperand(0);
18425 SDValue TrueVal = N->getOperand(1);
18426 SDValue ARMcc = N->getOperand(2);
18427 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18428
18429 // BFI is only available on V6T2+.
18430 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18431 SDValue R = PerformCMOVToBFICombine(N, DAG);
18432 if (R)
18433 return R;
18434 }
18435
18436 // Simplify
18437 // mov r1, r0
18438 // cmp r1, x
18439 // mov r0, y
18440 // moveq r0, x
18441 // to
18442 // cmp r0, x
18443 // movne r0, y
18444 //
18445 // mov r1, r0
18446 // cmp r1, x
18447 // mov r0, x
18448 // movne r0, y
18449 // to
18450 // cmp r0, x
18451 // movne r0, y
18452 /// FIXME: Turn this into a target neutral optimization?
18453 SDValue Res;
18454 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18455 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18456 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18457 SDValue ARMcc;
18458 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18459 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18460 }
18461
18462 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18463 // -> (cmov F T CC Flags)
18464 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18465 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18466 isNullConstant(RHS)) {
18467 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18468 LHS->getOperand(2), LHS->getOperand(3));
18469 }
18470
18471 if (!VT.isInteger())
18472 return SDValue();
18473
18474 // Fold away an unnecessary CMPZ/CMOV
18475 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18476 // if C1==EQ -> CMOV A, B, C2, D
18477 // if C1==NE -> CMOV A, B, NOT(C2), D
18478 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18479 N->getConstantOperandVal(2) == ARMCC::NE) {
18480 ARMCC::CondCodes Cond;
18481 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18482 if (N->getConstantOperandVal(2) == ARMCC::NE)
18483 Cond = ARMCC::getOppositeCondition(Cond);
18484 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18485 N->getOperand(1),
18486 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18487 }
18488 }
18489
18490 // Materialize a boolean comparison for integers so we can avoid branching.
18491 if (isNullConstant(FalseVal)) {
18492 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18493 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18494 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18495 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18496 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18497 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18498 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18499 DAG.getConstant(5, dl, MVT::i32));
18500 } else {
18501 // CMOV 0, 1, ==, (CMPZ x, y) ->
18502 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18503 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18504 //
18505 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18506 // x != y. In other words, a carry C == 1 when x == y, C == 0
18507 // otherwise.
18508 // The final UADDO_CARRY computes
18509 // x - y + (0 - (x - y)) + C == C
18510 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18511 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18512 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18513 // ISD::USUBO_CARRY returns a borrow, but what we actually want here is
18514 // the carry.
18515 SDValue Carry =
18516 DAG.getNode(ISD::SUB, dl, MVT::i32,
18517 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18518 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18519 }
18520 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18521 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18522 // This seems pointless but will allow us to combine it further below.
18523 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18524 SDValue Sub =
18525 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18526 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18527 Sub.getValue(1));
18528 FalseVal = Sub;
18529 }
18530 } else if (isNullConstant(TrueVal)) {
18531 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18532 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18533 // This seems pointless but will allow us to combine it further below
18534 // Note that we change == for != as this is the dual for the case above.
18535 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18536 SDValue Sub =
18537 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18538 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18539 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18540 Sub.getValue(1));
18541 FalseVal = Sub;
18542 }
18543 }
18544
18545 // On Thumb1, the DAG above may be further combined if z is a power of 2
18546 // (z == 2 ^ K).
18547 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18548 // t1 = (USUBO (SUB x, y), 1)
18549 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18550 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18551 //
18552 // This also handles the special case of comparing against zero; it's
18553 // essentially, the same pattern, except there's no SUBC:
18554 // CMOV x, z, !=, (CMPZ x, 0) ->
18555 // t1 = (USUBO x, 1)
18556 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18557 // Result = if K != 0 then (SHL t2:0, K) else t2:0
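// For example with z == 4 (K == 2): if x != y the USUBO/USUBO_CARRY pair
// produces 1, which the final SHL turns into 4; if x == y it produces 0. No
// conditional execution is needed either way.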
18558 const APInt *TrueConst;
18559 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18560 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18561 FalseVal.getOperand(1) == RHS) ||
18562 (FalseVal == LHS && isNullConstant(RHS))) &&
18563 (TrueConst = isPowerOf2Constant(TrueVal))) {
18564 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18565 unsigned ShiftAmount = TrueConst->logBase2();
18566 if (ShiftAmount)
18567 TrueVal = DAG.getConstant(1, dl, VT);
18568 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18569 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18570 Subc.getValue(1));
18571
18572 if (ShiftAmount)
18573 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18574 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18575 }
18576
18577 if (Res.getNode()) {
18578 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18579 // Capture demanded bits information that would be otherwise lost.
18580 if (Known.Zero == 0xfffffffe)
18581 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18582 DAG.getValueType(MVT::i1));
18583 else if (Known.Zero == 0xffffff00)
18584 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18585 DAG.getValueType(MVT::i8));
18586 else if (Known.Zero == 0xffff0000)
18587 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18588 DAG.getValueType(MVT::i16));
18589 }
18590
18591 return Res;
18592}
18593
18594static SDValue PerformBITCASTCombine(SDNode *N,
18595 TargetLowering::DAGCombinerInfo &DCI,
18596 const ARMSubtarget *ST) {
18597 SelectionDAG &DAG = DCI.DAG;
18598 SDValue Src = N->getOperand(0);
18599 EVT DstVT = N->getValueType(0);
18600
18601 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18602 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18603 EVT SrcVT = Src.getValueType();
18604 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18605 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18606 }
18607
18608 // We may have a bitcast of something that has already had this bitcast
18609 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18610 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18611 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18612 Src.getValueType().getScalarSizeInBits())
18613 Src = Src.getOperand(0);
18614
18615 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18616 // would be generated is at least the width of the element type.
18617 EVT SrcVT = Src.getValueType();
18618 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18619 Src.getOpcode() == ARMISD::VMVNIMM ||
18620 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18621 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18622 DAG.getDataLayout().isBigEndian())
18623 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18624
18625 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18626 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18627 return R;
18628
18629 return SDValue();
18630}
18631
18632// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18633// node into stack operations after legalizeOps.
18634static SDValue PerformMVETruncCombine(SDNode *N,
18635 TargetLowering::DAGCombinerInfo &DCI) {
18636 SelectionDAG &DAG = DCI.DAG;
18637 EVT VT = N->getValueType(0);
18638 SDLoc DL(N);
18639
18640 // MVETrunc(Undef, Undef) -> Undef
18641 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18642 return DAG.getUNDEF(VT);
18643
18644 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18645 if (N->getNumOperands() == 2 &&
18646 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18647 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18648 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18649 N->getOperand(0).getOperand(1),
18650 N->getOperand(1).getOperand(0),
18651 N->getOperand(1).getOperand(1));
18652
18653 // MVETrunc(shuffle, shuffle) -> VMOVN
18654 if (N->getNumOperands() == 2 &&
18655 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18656 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18657 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18658 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18659
18660 if (S0->getOperand(0) == S1->getOperand(0) &&
18661 S0->getOperand(1) == S1->getOperand(1)) {
18662 // Construct complete shuffle mask
18663 SmallVector<int, 8> Mask(S0->getMask());
18664 Mask.append(S1->getMask().begin(), S1->getMask().end());
18665
18666 if (isVMOVNTruncMask(Mask, VT, false))
18667 return DAG.getNode(
18668 ARMISD::VMOVN, DL, VT,
18669 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18670 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18671 DAG.getConstant(1, DL, MVT::i32));
18672 if (isVMOVNTruncMask(Mask, VT, true))
18673 return DAG.getNode(
18674 ARMISD::VMOVN, DL, VT,
18675 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18676 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18677 DAG.getConstant(1, DL, MVT::i32));
18678 }
18679 }
18680
18681 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18682 // truncate to a buildvector to allow the generic optimisations to kick in.
18683 if (all_of(N->ops(), [](SDValue Op) {
18684 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18685 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18686 (Op.getOpcode() == ISD::BITCAST &&
18687 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18688 })) {
18689 SmallVector<SDValue, 8> Extracts;
18690 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18691 SDValue O = N->getOperand(Op);
18692 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18693 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18694 DAG.getConstant(i, DL, MVT::i32));
18695 Extracts.push_back(Ext);
18696 }
18697 }
18698 return DAG.getBuildVector(VT, DL, Extracts);
18699 }
18700
18701 // If we are late in the legalization process and nothing has optimised
18702 // the trunc to anything better, lower it to a stack store and reload,
18703 // performing the truncation whilst keeping the lanes in the correct order:
18704 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
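// Each input is written with a truncating store at consecutive offsets, so
// the narrowed lanes end up contiguous in memory and a single full-width
// reload produces the truncated vector with its lanes in the original order.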
18705 if (!DCI.isAfterLegalizeDAG())
18706 return SDValue();
18707
18708 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18709 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18710 int NumIns = N->getNumOperands();
18711 assert((NumIns == 2 || NumIns == 4) &&
18712 "Expected 2 or 4 inputs to an MVETrunc");
18713 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18714 if (N->getNumOperands() == 4)
18715 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18716
18717 SmallVector<SDValue> Chains;
18718 for (int I = 0; I < NumIns; I++) {
18719 SDValue Ptr = DAG.getNode(
18720 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18721 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18722 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18723 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18724 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18725 Ptr, MPI, StoreVT, Align(4));
18726 Chains.push_back(Ch);
18727 }
18728
18729 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18730 MachinePointerInfo MPI =
18731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18732 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18733}
18734
18735// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
18736static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
18737 SelectionDAG &DAG) {
18738 SDValue N0 = N->getOperand(0);
18739 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
18740 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18741 return SDValue();
18742
18743 EVT FromVT = LD->getMemoryVT();
18744 EVT ToVT = N->getValueType(0);
18745 if (!ToVT.isVector())
18746 return SDValue();
18747 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18748 EVT ToEltVT = ToVT.getVectorElementType();
18749 EVT FromEltVT = FromVT.getVectorElementType();
18750
18751 unsigned NumElements = 0;
18752 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18753 NumElements = 4;
18754 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18755 NumElements = 8;
18756 assert(NumElements != 0);
18757
18758 ISD::LoadExtType NewExtType =
18759 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18760 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18761 LD->getExtensionType() != ISD::EXTLOAD &&
18762 LD->getExtensionType() != NewExtType)
18763 return SDValue();
18764
18765 LLVMContext &C = *DAG.getContext();
18766 SDLoc DL(LD);
18767 // Details about the old load
18768 SDValue Ch = LD->getChain();
18769 SDValue BasePtr = LD->getBasePtr();
18770 Align Alignment = LD->getBaseAlign();
18771 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18772 AAMDNodes AAInfo = LD->getAAInfo();
18773
18774 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18775 EVT NewFromVT = EVT::getVectorVT(
18776 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18777 EVT NewToVT = EVT::getVectorVT(
18778 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18779
18780 SmallVector<SDValue, 4> Loads;
18781 SmallVector<SDValue, 4> Chains;
18782 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18783 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18784 SDValue NewPtr =
18785 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18786
18787 SDValue NewLoad =
18788 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18789 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18790 Alignment, MMOFlags, AAInfo);
18791 Loads.push_back(NewLoad);
18792 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18793 }
18794
18795 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18796 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18797 return DAG.getMergeValues(Loads, DL);
18798}
18799
18800// Perform combines for MVEEXT. If it has not been optimized to anything better
18801// before lowering, it gets converted to a stack store and extloads performing the
18802// extend whilst still keeping the same lane ordering.
18803static SDValue PerformMVEExtCombine(SDNode *N,
18804 TargetLowering::DAGCombinerInfo &DCI) {
18805 SelectionDAG &DAG = DCI.DAG;
18806 EVT VT = N->getValueType(0);
18807 SDLoc DL(N);
18808 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18809 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18810
18811 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18812 *DAG.getContext());
18813 auto Extend = [&](SDValue V) {
18814 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18815 return N->getOpcode() == ARMISD::MVESEXT
18816 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18817 DAG.getValueType(ExtVT))
18818 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18819 };
18820
18821 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18822 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18823 SDValue Ext = Extend(N->getOperand(0));
18824 return DAG.getMergeValues({Ext, Ext}, DL);
18825 }
18826
18827 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18828 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18829 ArrayRef<int> Mask = SVN->getMask();
18830 assert(Mask.size() == 2 * VT.getVectorNumElements());
18831 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18832 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18833 SDValue Op0 = SVN->getOperand(0);
18834 SDValue Op1 = SVN->getOperand(1);
18835
18836 auto CheckInregMask = [&](int Start, int Offset) {
18837 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18838 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18839 return false;
18840 return true;
18841 };
18842 SDValue V0 = SDValue(N, 0);
18843 SDValue V1 = SDValue(N, 1);
18844 if (CheckInregMask(0, 0))
18845 V0 = Extend(Op0);
18846 else if (CheckInregMask(0, 1))
18847 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18848 else if (CheckInregMask(0, Mask.size()))
18849 V0 = Extend(Op1);
18850 else if (CheckInregMask(0, Mask.size() + 1))
18851 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18852
18853 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18854 V1 = Extend(Op1);
18855 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18856 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18857 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18858 V1 = Extend(Op0);
18859 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18860 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18861
18862 if (V0.getNode() != N || V1.getNode() != N)
18863 return DAG.getMergeValues({V0, V1}, DL);
18864 }
18865
18866 // MVEEXT(load) -> extload, extload
18867 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18868 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
18869 return L;
18870
18871 if (!DCI.isAfterLegalizeDAG())
18872 return SDValue();
18873
18874 // Lower to a stack store and reload:
18875 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18876 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18877 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18878 int NumOuts = N->getNumValues();
18879 assert((NumOuts == 2 || NumOuts == 4) &&
18880 "Expected 2 or 4 outputs to an MVEEXT");
18881 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18882 *DAG.getContext());
18883 if (N->getNumOperands() == 4)
18884 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18885
18886 MachinePointerInfo MPI =
18887 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18888 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18889 StackPtr, MPI, Align(4));
18890
18891 SmallVector<SDValue> Loads;
18892 for (int I = 0; I < NumOuts; I++) {
18893 SDValue Ptr = DAG.getNode(
18894 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18895 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18896 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18897 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18898 SDValue Load = DAG.getExtLoad(
18899 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18900 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18901 Loads.push_back(Load);
18902 }
18903
18904 return DAG.getMergeValues(Loads, DL);
18905}
18906
18907SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
18908 DAGCombinerInfo &DCI) const {
18909 switch (N->getOpcode()) {
18910 default: break;
18911 case ISD::SELECT_CC:
18912 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18913 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18914 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18915 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18916 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18917 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18918 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18919 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18920 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18921 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18922 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18923 case ISD::BRCOND:
18924 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18925 case ARMISD::ADDC:
18926 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18927 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18928 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18929 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18930 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18931 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18932 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18933 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18934 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18937 return PerformExtractEltCombine(N, DCI, Subtarget);
18941 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18942 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18943 case ISD::FP_TO_SINT:
18944 case ISD::FP_TO_UINT:
18945 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18946 case ISD::FADD:
18947 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18948 case ISD::FMUL:
18949 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18950 case ISD::INTRINSIC_WO_CHAIN:
18951 return PerformIntrinsicCombine(N, DCI);
18952 case ISD::SHL:
18953 case ISD::SRA:
18954 case ISD::SRL:
18955 return PerformShiftCombine(N, DCI, Subtarget);
18956 case ISD::SIGN_EXTEND:
18957 case ISD::ZERO_EXTEND:
18958 case ISD::ANY_EXTEND:
18959 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18960 case ISD::FP_EXTEND:
18961 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18962 case ISD::SMIN:
18963 case ISD::UMIN:
18964 case ISD::SMAX:
18965 case ISD::UMAX:
18966 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18967 case ARMISD::CMOV:
18968 return PerformCMOVCombine(N, DCI.DAG);
18969 case ARMISD::BRCOND:
18970 return PerformBRCONDCombine(N, DCI.DAG);
18971 case ARMISD::CMPZ:
18972 return PerformCMPZCombine(N, DCI.DAG);
18973 case ARMISD::CSINC:
18974 case ARMISD::CSINV:
18975 case ARMISD::CSNEG:
18976 return PerformCSETCombine(N, DCI.DAG);
18977 case ISD::LOAD:
18978 return PerformLOADCombine(N, DCI, Subtarget);
18979 case ARMISD::VLD1DUP:
18980 case ARMISD::VLD2DUP:
18981 case ARMISD::VLD3DUP:
18982 case ARMISD::VLD4DUP:
18983 return PerformVLDCombine(N, DCI);
18984 case ARMISD::BUILD_VECTOR:
18985 return PerformARMBUILD_VECTORCombine(N, DCI);
18986 case ISD::BITCAST:
18987 return PerformBITCASTCombine(N, DCI, Subtarget);
18988 case ARMISD::PREDICATE_CAST:
18989 return PerformPREDICATE_CASTCombine(N, DCI);
18990 case ARMISD::VECTOR_REG_CAST:
18991 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18992 case ARMISD::MVETRUNC:
18993 return PerformMVETruncCombine(N, DCI);
18994 case ARMISD::MVESEXT:
18995 case ARMISD::MVEZEXT:
18996 return PerformMVEExtCombine(N, DCI);
18997 case ARMISD::VCMP:
18998 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18999 case ISD::VECREDUCE_ADD:
19000 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
19001 case ARMISD::VADDVs:
19002 case ARMISD::VADDVu:
19003 case ARMISD::VADDLVs:
19004 case ARMISD::VADDLVu:
19005 case ARMISD::VADDLVAs:
19006 case ARMISD::VADDLVAu:
19007 case ARMISD::VMLAVs:
19008 case ARMISD::VMLAVu:
19009 case ARMISD::VMLALVs:
19010 case ARMISD::VMLALVu:
19011 case ARMISD::VMLALVAs:
19012 case ARMISD::VMLALVAu:
19013 return PerformReduceShuffleCombine(N, DCI.DAG);
19014 case ARMISD::VMOVN:
19015 return PerformVMOVNCombine(N, DCI);
19016 case ARMISD::VQMOVNs:
19017 case ARMISD::VQMOVNu:
19018 return PerformVQMOVNCombine(N, DCI);
19019 case ARMISD::VQDMULH:
19020 return PerformVQDMULHCombine(N, DCI);
19021 case ARMISD::ASRL:
19022 case ARMISD::LSRL:
19023 case ARMISD::LSLL:
19024 return PerformLongShiftCombine(N, DCI.DAG);
19025 case ARMISD::SMULWB: {
19026 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19027 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19028 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19029 return SDValue();
19030 break;
19031 }
19032 case ARMISD::SMULWT: {
19033 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19034 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19035 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19036 return SDValue();
19037 break;
19038 }
19039 case ARMISD::SMLALBB:
19040 case ARMISD::QADD16b:
19041 case ARMISD::QSUB16b:
19042 case ARMISD::UQADD16b:
19043 case ARMISD::UQSUB16b: {
19044 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19045 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19046 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19047 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19048 return SDValue();
19049 break;
19050 }
19051 case ARMISD::SMLALBT: {
19052 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19053 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19054 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19055 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19056 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19057 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19058 return SDValue();
19059 break;
19060 }
19061 case ARMISD::SMLALTB: {
19062 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19063 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19064 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19065 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19066 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19067 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19068 return SDValue();
19069 break;
19070 }
19071 case ARMISD::SMLALTT: {
19072 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19073 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19074 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19075 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19076 return SDValue();
19077 break;
19078 }
19079 case ARMISD::QADD8b:
19080 case ARMISD::QSUB8b:
19081 case ARMISD::UQADD8b:
19082 case ARMISD::UQSUB8b: {
19083 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19084 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19085 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19086 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19087 return SDValue();
19088 break;
19089 }
19090 case ARMISD::VBSP:
19091 if (N->getOperand(1) == N->getOperand(2))
19092 return N->getOperand(1);
19093 return SDValue();
19094 case ISD::INTRINSIC_VOID:
19095 case ISD::INTRINSIC_W_CHAIN:
19096 switch (N->getConstantOperandVal(1)) {
19097 case Intrinsic::arm_neon_vld1:
19098 case Intrinsic::arm_neon_vld1x2:
19099 case Intrinsic::arm_neon_vld1x3:
19100 case Intrinsic::arm_neon_vld1x4:
19101 case Intrinsic::arm_neon_vld2:
19102 case Intrinsic::arm_neon_vld3:
19103 case Intrinsic::arm_neon_vld4:
19104 case Intrinsic::arm_neon_vld2lane:
19105 case Intrinsic::arm_neon_vld3lane:
19106 case Intrinsic::arm_neon_vld4lane:
19107 case Intrinsic::arm_neon_vld2dup:
19108 case Intrinsic::arm_neon_vld3dup:
19109 case Intrinsic::arm_neon_vld4dup:
19110 case Intrinsic::arm_neon_vst1:
19111 case Intrinsic::arm_neon_vst1x2:
19112 case Intrinsic::arm_neon_vst1x3:
19113 case Intrinsic::arm_neon_vst1x4:
19114 case Intrinsic::arm_neon_vst2:
19115 case Intrinsic::arm_neon_vst3:
19116 case Intrinsic::arm_neon_vst4:
19117 case Intrinsic::arm_neon_vst2lane:
19118 case Intrinsic::arm_neon_vst3lane:
19119 case Intrinsic::arm_neon_vst4lane:
19120 return PerformVLDCombine(N, DCI);
19121 case Intrinsic::arm_mve_vld2q:
19122 case Intrinsic::arm_mve_vld4q:
19123 case Intrinsic::arm_mve_vst2q:
19124 case Intrinsic::arm_mve_vst4q:
19125 return PerformMVEVLDCombine(N, DCI);
19126 default: break;
19127 }
19128 break;
19129 }
19130 return SDValue();
19131}
19132
19133 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
19134 EVT VT) const {
19135 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19136}
19137
19138 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
19139 Align Alignment,
19140 MachineMemOperand::Flags,
19141 unsigned *Fast) const {
19142 // Depends what it gets converted into if the type is weird.
19143 if (!VT.isSimple())
19144 return false;
19145
19146 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19147 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19148 auto Ty = VT.getSimpleVT().SimpleTy;
19149
19150 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19151 // Unaligned access can use (for example) LDRB, LDRH, LDR
19152 if (AllowsUnaligned) {
19153 if (Fast)
19154 *Fast = Subtarget->hasV7Ops();
19155 return true;
19156 }
19157 }
19158
19159 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19160 // For any little-endian targets with neon, we can support unaligned ld/st
19161 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19162 // A big-endian target may also explicitly support unaligned accesses
19163 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19164 if (Fast)
19165 *Fast = 1;
19166 return true;
19167 }
19168 }
19169
19170 if (!Subtarget->hasMVEIntegerOps())
19171 return false;
19172
19173 // These are for predicates
19174 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19175 Ty == MVT::v2i1)) {
19176 if (Fast)
19177 *Fast = 1;
19178 return true;
19179 }
19180
19181 // These are for truncated stores/narrowing loads. They are fine so long as
19182 // the alignment is at least the size of the item being loaded
19183 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19184 Alignment >= VT.getScalarSizeInBits() / 8) {
19185 if (Fast)
19186 *Fast = true;
19187 return true;
19188 }
19189
19190 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19191 // VSTRW.U32 all store the vector register in exactly the same format, and
19192 // differ only in the range of their immediate offset field and the required
19193 // alignment. So there is always a store that can be used, regardless of
19194 // actual type.
19195 //
19196 // For big endian, that is not the case, but we can still emit a (VSTRB.U8;
19197 // VREV64.8) pair and get the same effect. This will likely be better than
19198 // aligning the vector through the stack.
19199 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19200 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19201 Ty == MVT::v2f64) {
19202 if (Fast)
19203 *Fast = 1;
19204 return true;
19205 }
19206
19207 return false;
19208}
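// Editor's note -- illustrative queries, not part of the upstream source.
// On a little-endian MVE target this hook answers, for example:
//   unsigned Fast;
//   allowsMisalignedMemoryAccesses(MVT::v16i8, 0, Align(1), Flags, &Fast)
//     -> true  (VLDRB.U8/VSTRB.U8 tolerate byte alignment)
//   allowsMisalignedMemoryAccesses(MVT::v4i16, 0, Align(1), Flags, &Fast)
//     -> false (narrowing/truncating accesses need at least element alignment)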
19209
19210 EVT ARMTargetLowering::getOptimalMemOpType(
19211 LLVMContext &Context, const MemOp &Op,
19212 const AttributeList &FuncAttributes) const {
19213 // See if we can use NEON instructions for this...
19214 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19215 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19216 unsigned Fast;
19217 if (Op.size() >= 16 &&
19218 (Op.isAligned(Align(16)) ||
19219 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19220 MachineMemOperand::MONone, &Fast) &&
19221 Fast))) {
19222 return MVT::v2f64;
19223 } else if (Op.size() >= 8 &&
19224 (Op.isAligned(Align(8)) ||
19225 (allowsMisalignedMemoryAccesses(
19226 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19227 Fast))) {
19228 return MVT::f64;
19229 }
19230 }
19231
19232 // Let the target-independent logic figure it out.
19233 return MVT::Other;
19234}
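// Editor's note -- illustrative behaviour, not part of the upstream source.
// For a 32-byte memcpy whose operands are 16-byte aligned on a NEON target
// (and no noimplicitfloat attribute), this hook returns MVT::v2f64, so the
// expansion uses two 128-bit vector loads/stores; an 8-byte, 8-byte-aligned
// zero-memset would instead be given MVT::f64.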
19235
19236// 64-bit integers are split into their high and low parts and held in two
19237// different registers, so the trunc is free since the low register can just
19238// be used.
19239bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19240 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19241 return false;
19242 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19243 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19244 return (SrcBits == 64 && DestBits == 32);
19245}
19246
19247 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19248 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19249 !DstVT.isInteger())
19250 return false;
19251 unsigned SrcBits = SrcVT.getSizeInBits();
19252 unsigned DestBits = DstVT.getSizeInBits();
19253 return (SrcBits == 64 && DestBits == 32);
19254}
19255
19256 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19257 if (Val.getOpcode() != ISD::LOAD)
19258 return false;
19259
19260 EVT VT1 = Val.getValueType();
19261 if (!VT1.isSimple() || !VT1.isInteger() ||
19262 !VT2.isSimple() || !VT2.isInteger())
19263 return false;
19264
19265 switch (VT1.getSimpleVT().SimpleTy) {
19266 default: break;
19267 case MVT::i1:
19268 case MVT::i8:
19269 case MVT::i16:
19270 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19271 return true;
19272 }
19273
19274 return false;
19275}
19276
19277 bool ARMTargetLowering::isFNegFree(EVT VT) const {
19278 if (!VT.isSimple())
19279 return false;
19280
19281 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19282 // negate values directly (fneg is free). So, we don't want to let the DAG
19283 // combiner rewrite fneg into xors and some other instructions. For f16 and
19284 // FullFP16 argument passing, some bitcast nodes may be introduced,
19285 // triggering this DAG combine rewrite, so we are avoiding that with this.
19286 switch (VT.getSimpleVT().SimpleTy) {
19287 default: break;
19288 case MVT::f16:
19289 return Subtarget->hasFullFP16();
19290 }
19291
19292 return false;
19293}
19294
19295 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19296 if (!Subtarget->hasMVEIntegerOps())
19297 return nullptr;
19298 Type *SVIType = SVI->getType();
19299 Type *ScalarType = SVIType->getScalarType();
19300
19301 if (ScalarType->isFloatTy())
19302 return Type::getInt32Ty(SVIType->getContext());
19303 if (ScalarType->isHalfTy())
19304 return Type::getInt16Ty(SVIType->getContext());
19305 return nullptr;
19306}
19307
19308 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19309 EVT VT = ExtVal.getValueType();
19310
19311 if (!isTypeLegal(VT))
19312 return false;
19313
19314 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19315 if (Ld->isExpandingLoad())
19316 return false;
19317 }
19318
19319 if (Subtarget->hasMVEIntegerOps())
19320 return true;
19321
19322 // Don't create a loadext if we can fold the extension into a wide/long
19323 // instruction.
19324 // If there's more than one user instruction, the loadext is desirable no
19325 // matter what. There can be two uses by the same instruction.
19326 if (ExtVal->use_empty() ||
19327 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19328 return true;
19329
19330 SDNode *U = *ExtVal->user_begin();
19331 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19332 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19333 return false;
19334
19335 return true;
19336}
19337
19338 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19339 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19340 return false;
19341
19342 if (!isTypeLegal(EVT::getEVT(Ty1)))
19343 return false;
19344
19345 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19346
19347 // Assuming the caller doesn't have a zeroext or signext return parameter,
19348 // truncation all the way down to i1 is valid.
19349 return true;
19350}
19351
19352/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19353/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19354/// expanded to FMAs when this method returns true, otherwise fmuladd is
19355/// expanded to fmul + fadd.
19356///
19357/// ARM supports both fused and unfused multiply-add operations; we already
19358/// lower a pair of fmul and fadd to the latter so it's not clear that there
19359/// would be a gain or that the gain would be worthwhile enough to risk
19360/// correctness bugs.
19361///
19362/// For MVE, we set this to true as it helps simplify the need for some
19363/// patterns (and we don't have the non-fused floating point instruction).
19364bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19365 EVT VT) const {
19366 if (Subtarget->useSoftFloat())
19367 return false;
19368
19369 if (!VT.isSimple())
19370 return false;
19371
19372 switch (VT.getSimpleVT().SimpleTy) {
19373 case MVT::v4f32:
19374 case MVT::v8f16:
19375 return Subtarget->hasMVEFloatOps();
19376 case MVT::f16:
19377 return Subtarget->useFPVFMx16();
19378 case MVT::f32:
19379 return Subtarget->useFPVFMx();
19380 case MVT::f64:
19381 return Subtarget->useFPVFMx64();
19382 default:
19383 break;
19384 }
19385
19386 return false;
19387}
19388
19389static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19390 if (V < 0)
19391 return false;
19392
19393 unsigned Scale = 1;
19394 switch (VT.getSimpleVT().SimpleTy) {
19395 case MVT::i1:
19396 case MVT::i8:
19397 // Scale == 1;
19398 break;
19399 case MVT::i16:
19400 // Scale == 2;
19401 Scale = 2;
19402 break;
19403 default:
19404 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19405 // Scale == 4;
19406 Scale = 4;
19407 break;
19408 }
19409
19410 if ((V & (Scale - 1)) != 0)
19411 return false;
19412 return isUInt<5>(V / Scale);
19413}
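// Editor's worked example, not part of the upstream source. For an i16 access
// Scale is 2, so the offset must be a multiple of 2 whose scaled value fits
// in an unsigned 5-bit field:
//   isLegalT1AddressImmediate(62, MVT::i16) -> true   (62 / 2 == 31)
//   isLegalT1AddressImmediate(63, MVT::i16) -> false  (not a multiple of 2)
//   isLegalT1AddressImmediate(64, MVT::i16) -> false  (64 / 2 == 32 > 31)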
19414
19415static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19416 const ARMSubtarget *Subtarget) {
19417 if (!VT.isInteger() && !VT.isFloatingPoint())
19418 return false;
19419 if (VT.isVector() && Subtarget->hasNEON())
19420 return false;
19421 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19422 !Subtarget->hasMVEFloatOps())
19423 return false;
19424
19425 bool IsNeg = false;
19426 if (V < 0) {
19427 IsNeg = true;
19428 V = -V;
19429 }
19430
19431 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19432
19433 // MVE: size * imm7
19434 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19435 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19436 case MVT::i32:
19437 case MVT::f32:
19438 return isShiftedUInt<7,2>(V);
19439 case MVT::i16:
19440 case MVT::f16:
19441 return isShiftedUInt<7,1>(V);
19442 case MVT::i8:
19443 return isUInt<7>(V);
19444 default:
19445 return false;
19446 }
19447 }
19448
19449 // half VLDR: 2 * imm8
19450 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19451 return isShiftedUInt<8, 1>(V);
19452 // VLDR and LDRD: 4 * imm8
19453 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19454 return isShiftedUInt<8, 2>(V);
19455
19456 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19457 // + imm12 or - imm8
19458 if (IsNeg)
19459 return isUInt<8>(V);
19460 return isUInt<12>(V);
19461 }
19462
19463 return false;
19464}
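// Editor's worked example, not part of the upstream source, assuming an
// M-profile subtarget ST with MVE and no NEON. A v4i32 offset is size * imm7,
// i.e. any multiple of 4 with magnitude at most 508:
//   isLegalT2AddressImmediate(508, MVT::v4i32, ST) -> true
//   isLegalT2AddressImmediate(510, MVT::v4i32, ST) -> false (not 4-aligned)
//   isLegalT2AddressImmediate(512, MVT::v4i32, ST) -> false (512/4 > imm7 max)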
19465
19466/// isLegalAddressImmediate - Return true if the integer value can be used
19467/// as the offset of the target addressing mode for load / store of the
19468/// given type.
19469static bool isLegalAddressImmediate(int64_t V, EVT VT,
19470 const ARMSubtarget *Subtarget) {
19471 if (V == 0)
19472 return true;
19473
19474 if (!VT.isSimple())
19475 return false;
19476
19477 if (Subtarget->isThumb1Only())
19478 return isLegalT1AddressImmediate(V, VT);
19479 else if (Subtarget->isThumb2())
19480 return isLegalT2AddressImmediate(V, VT, Subtarget);
19481
19482 // ARM mode.
19483 if (V < 0)
19484 V = - V;
19485 switch (VT.getSimpleVT().SimpleTy) {
19486 default: return false;
19487 case MVT::i1:
19488 case MVT::i8:
19489 case MVT::i32:
19490 // +- imm12
19491 return isUInt<12>(V);
19492 case MVT::i16:
19493 // +- imm8
19494 return isUInt<8>(V);
19495 case MVT::f32:
19496 case MVT::f64:
19497 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19498 return false;
19499 return isShiftedUInt<8, 2>(V);
19500 }
19501}
19502
19503 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19504 EVT VT) const {
19505 int Scale = AM.Scale;
19506 if (Scale < 0)
19507 return false;
19508
19509 switch (VT.getSimpleVT().SimpleTy) {
19510 default: return false;
19511 case MVT::i1:
19512 case MVT::i8:
19513 case MVT::i16:
19514 case MVT::i32:
19515 if (Scale == 1)
19516 return true;
19517 // r + r << imm
19518 Scale = Scale & ~1;
19519 return Scale == 2 || Scale == 4 || Scale == 8;
19520 case MVT::i64:
19521 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19522 // version in Thumb mode.
19523 // r + r
19524 if (Scale == 1)
19525 return true;
19526 // r * 2 (this can be lowered to r + r).
19527 if (!AM.HasBaseReg && Scale == 2)
19528 return true;
19529 return false;
19530 case MVT::isVoid:
19531 // Note, we allow "void" uses (basically, uses that aren't loads or
19532 // stores), because arm allows folding a scale into many arithmetic
19533 // operations. This should be made more precise and revisited later.
19534
19535 // Allow r << imm, but the imm has to be a multiple of two.
19536 if (Scale & 1) return false;
19537 return isPowerOf2_32(Scale);
19538 }
19539}
19540
19541 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19542 EVT VT) const {
19543 const int Scale = AM.Scale;
19544
19545 // Negative scales are not supported in Thumb1.
19546 if (Scale < 0)
19547 return false;
19548
19549 // Thumb1 addressing modes do not support register scaling excepting the
19550 // following cases:
19551 // 1. Scale == 1 means no scaling.
19552 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19553 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19554}
19555
19556/// isLegalAddressingMode - Return true if the addressing mode represented
19557 /// by AM is legal for this target, for a load/store of the specified type.
19558 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19559 const AddrMode &AM, Type *Ty,
19560 unsigned AS, Instruction *I) const {
19561 EVT VT = getValueType(DL, Ty, true);
19562 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19563 return false;
19564
19565 // Can never fold addr of global into load/store.
19566 if (AM.BaseGV)
19567 return false;
19568
19569 switch (AM.Scale) {
19570 case 0: // no scale reg, must be "r+i" or "r", or "i".
19571 break;
19572 default:
19573 // ARM doesn't support any R+R*scale+imm addr modes.
19574 if (AM.BaseOffs)
19575 return false;
19576
19577 if (!VT.isSimple())
19578 return false;
19579
19580 if (Subtarget->isThumb1Only())
19581 return isLegalT1ScaledAddressingMode(AM, VT);
19582
19583 if (Subtarget->isThumb2())
19584 return isLegalT2ScaledAddressingMode(AM, VT);
19585
19586 int Scale = AM.Scale;
19587 switch (VT.getSimpleVT().SimpleTy) {
19588 default: return false;
19589 case MVT::i1:
19590 case MVT::i8:
19591 case MVT::i32:
19592 if (Scale < 0) Scale = -Scale;
19593 if (Scale == 1)
19594 return true;
19595 // r + r << imm
19596 return isPowerOf2_32(Scale & ~1);
19597 case MVT::i16:
19598 case MVT::i64:
19599 // r +/- r
19600 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19601 return true;
19602 // r * 2 (this can be lowered to r + r).
19603 if (!AM.HasBaseReg && Scale == 2)
19604 return true;
19605 return false;
19606
19607 case MVT::isVoid:
19608 // Note, we allow "void" uses (basically, uses that aren't loads or
19609 // stores), because arm allows folding a scale into many arithmetic
19610 // operations. This should be made more precise and revisited later.
19611
19612 // Allow r << imm, but the imm has to be a multiple of two.
19613 if (Scale & 1) return false;
19614 return isPowerOf2_32(Scale);
19615 }
19616 }
19617 return true;
19618}
19619
19620/// isLegalICmpImmediate - Return true if the specified immediate is legal
19621/// icmp immediate, that is the target has icmp instructions which can compare
19622/// a register against the immediate without having to materialize the
19623 /// immediate into a register.
19624 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19625 // Thumb2 and ARM modes can use cmn for negative immediates.
19626 if (!Subtarget->isThumb())
19627 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19628 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19629 if (Subtarget->isThumb2())
19630 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19631 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19632 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19633 return Imm >= 0 && Imm <= 255;
19634}
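// Editor's worked example, not part of the upstream source. In ARM and Thumb2
// modes an immediate is legal if either it or its negation is encodable,
// because the comparison can be flipped between CMP and CMN:
//   isLegalICmpImmediate(255)  -> true  (cmp rN, #255)
//   isLegalICmpImmediate(-255) -> true  (cmn rN, #255)
// Thumb1 has no CMN, so only 0..255 are accepted there.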
19635
19636/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19637/// *or sub* immediate, that is the target has add or sub instructions which can
19638/// add a register with the immediate without having to materialize the
19639 /// immediate into a register.
19640 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19641 // Same encoding for add/sub, just flip the sign.
19642 uint64_t AbsImm = AbsoluteValue(Imm);
19643 if (!Subtarget->isThumb())
19644 return ARM_AM::getSOImmVal(AbsImm) != -1;
19645 if (Subtarget->isThumb2())
19646 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19647 // Thumb1 only has 8-bit unsigned immediate.
19648 return AbsImm <= 255;
19649}
19650
19651// Return false to prevent folding
19652// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19653 // if the folding leads to worse code.
19654 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19655 SDValue ConstNode) const {
19656 // Let the DAGCombiner decide for vector types and large types.
19657 const EVT VT = AddNode.getValueType();
19658 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19659 return true;
19660
19661 // It is worse if c0 is legal add immediate, while c1*c0 is not
19662 // and has to be composed by at least two instructions.
19663 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19664 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19665 const int64_t C0 = C0Node->getSExtValue();
19666 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19667 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19668 return true;
19669 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19670 return false;
19671
19672 // Default to true and let the DAGCombiner decide.
19673 return true;
19674}
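// Editor's worked example, not part of the upstream source. On a Thumb2
// target, rewriting (mul (add r, 1), 0x12345678) into
// (add (mul r, 0x12345678), 0x12345678) is reported as unprofitable: c0 == 1
// is a legal add immediate, but c0*c1 == 0x12345678 is not and needs
// movw+movt to materialize, so the original add-then-mul form is kept.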
19675
19676 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19677 bool isSEXTLoad, SDValue &Base,
19678 SDValue &Offset, bool &isInc,
19679 SelectionDAG &DAG) {
19680 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19681 return false;
19682
19683 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19684 // AddressingMode 3
19685 Base = Ptr->getOperand(0);
19686 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19687 int RHSC = (int)RHS->getZExtValue();
19688 if (RHSC < 0 && RHSC > -256) {
19689 assert(Ptr->getOpcode() == ISD::ADD);
19690 isInc = false;
19691 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19692 return true;
19693 }
19694 }
19695 isInc = (Ptr->getOpcode() == ISD::ADD);
19696 Offset = Ptr->getOperand(1);
19697 return true;
19698 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19699 // AddressingMode 2
19700 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19701 int RHSC = (int)RHS->getZExtValue();
19702 if (RHSC < 0 && RHSC > -0x1000) {
19703 assert(Ptr->getOpcode() == ISD::ADD);
19704 isInc = false;
19705 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19706 Base = Ptr->getOperand(0);
19707 return true;
19708 }
19709 }
19710
19711 if (Ptr->getOpcode() == ISD::ADD) {
19712 isInc = true;
19713 ARM_AM::ShiftOpc ShOpcVal=
19714 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19715 if (ShOpcVal != ARM_AM::no_shift) {
19716 Base = Ptr->getOperand(1);
19717 Offset = Ptr->getOperand(0);
19718 } else {
19719 Base = Ptr->getOperand(0);
19720 Offset = Ptr->getOperand(1);
19721 }
19722 return true;
19723 }
19724
19725 isInc = (Ptr->getOpcode() == ISD::ADD);
19726 Base = Ptr->getOperand(0);
19727 Offset = Ptr->getOperand(1);
19728 return true;
19729 }
19730
19731 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19732 return false;
19733}
19734
19735 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19736 bool isSEXTLoad, SDValue &Base,
19737 SDValue &Offset, bool &isInc,
19738 SelectionDAG &DAG) {
19739 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19740 return false;
19741
19742 Base = Ptr->getOperand(0);
19743 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19744 int RHSC = (int)RHS->getZExtValue();
19745 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19746 assert(Ptr->getOpcode() == ISD::ADD);
19747 isInc = false;
19748 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19749 return true;
19750 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19751 isInc = Ptr->getOpcode() == ISD::ADD;
19752 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19753 return true;
19754 }
19755 }
19756
19757 return false;
19758}
19759
19760static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19761 bool isSEXTLoad, bool IsMasked, bool isLE,
19762 SDValue &Base, SDValue &Offset,
19763 bool &isInc, SelectionDAG &DAG) {
19764 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19765 return false;
19766 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19767 return false;
19768
19769 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19770 // as opposed to a vldrw.32). This can allow extra addressing modes or
19771 // alignments for what is otherwise an equivalent instruction.
19772 bool CanChangeType = isLE && !IsMasked;
19773
19774 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19775 int RHSC = (int)RHS->getZExtValue();
19776
19777 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19778 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19779 assert(Ptr->getOpcode() == ISD::ADD);
19780 isInc = false;
19781 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19782 return true;
19783 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19784 isInc = Ptr->getOpcode() == ISD::ADD;
19785 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19786 return true;
19787 }
19788 return false;
19789 };
19790
19791 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19792 // (in BE/masked) type.
19793 Base = Ptr->getOperand(0);
19794 if (VT == MVT::v4i16) {
19795 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19796 return true;
19797 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19798 if (IsInRange(RHSC, 0x80, 1))
19799 return true;
19800 } else if (Alignment >= 4 &&
19801 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19802 IsInRange(RHSC, 0x80, 4))
19803 return true;
19804 else if (Alignment >= 2 &&
19805 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19806 IsInRange(RHSC, 0x80, 2))
19807 return true;
19808 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19809 return true;
19810 return false;
19811}
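// Editor's worked example, not part of the upstream source. For a
// little-endian, unmasked v4i32 load from (add p, 196): 196 is a multiple of
// 4 and 196/4 < 128, so IsInRange(196, 0x80, 4) succeeds and the access can
// become a pre/post-indexed VLDRW with offset #196. An offset of 514 fails
// every bucket (it is not 4-aligned and 514/2 exceeds the imm7 range), so no
// indexed form is produced for it.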
19812
19813/// getPreIndexedAddressParts - returns true by value, base pointer and
19814/// offset pointer and addressing mode by reference if the node's address
19815/// can be legally represented as pre-indexed load / store address.
19816 bool
19817 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19818 SDValue &Offset,
19819 ISD::MemIndexedMode &AM,
19820 SelectionDAG &DAG) const {
19821 if (Subtarget->isThumb1Only())
19822 return false;
19823
19824 EVT VT;
19825 SDValue Ptr;
19826 Align Alignment;
19827 bool isSEXTLoad = false;
19828 bool IsMasked = false;
19829 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19830 Ptr = LD->getBasePtr();
19831 VT = LD->getMemoryVT();
19832 Alignment = LD->getAlign();
19833 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19834 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19835 Ptr = ST->getBasePtr();
19836 VT = ST->getMemoryVT();
19837 Alignment = ST->getAlign();
19838 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19839 Ptr = LD->getBasePtr();
19840 VT = LD->getMemoryVT();
19841 Alignment = LD->getAlign();
19842 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19843 IsMasked = true;
19844 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19845 Ptr = ST->getBasePtr();
19846 VT = ST->getMemoryVT();
19847 Alignment = ST->getAlign();
19848 IsMasked = true;
19849 } else
19850 return false;
19851
19852 bool isInc;
19853 bool isLegal = false;
19854 if (VT.isVector())
19855 isLegal = Subtarget->hasMVEIntegerOps() &&
19856 getMVEIndexedAddressParts(
19857 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19858 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19859 else {
19860 if (Subtarget->isThumb2())
19861 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19862 Offset, isInc, DAG);
19863 else
19864 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19865 Offset, isInc, DAG);
19866 }
19867 if (!isLegal)
19868 return false;
19869
19870 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19871 return true;
19872}
19873
19874/// getPostIndexedAddressParts - returns true by value, base pointer and
19875/// offset pointer and addressing mode by reference if this node can be
19876 /// combined with a load / store to form a post-indexed load / store.
19877 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19878 SDValue &Base,
19879 SDValue &Offset,
19880 ISD::MemIndexedMode &AM,
19881 SelectionDAG &DAG) const {
19882 EVT VT;
19883 SDValue Ptr;
19884 Align Alignment;
19885 bool isSEXTLoad = false, isNonExt;
19886 bool IsMasked = false;
19887 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19888 VT = LD->getMemoryVT();
19889 Ptr = LD->getBasePtr();
19890 Alignment = LD->getAlign();
19891 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19892 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19893 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19894 VT = ST->getMemoryVT();
19895 Ptr = ST->getBasePtr();
19896 Alignment = ST->getAlign();
19897 isNonExt = !ST->isTruncatingStore();
19898 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19899 VT = LD->getMemoryVT();
19900 Ptr = LD->getBasePtr();
19901 Alignment = LD->getAlign();
19902 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19903 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19904 IsMasked = true;
19905 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19906 VT = ST->getMemoryVT();
19907 Ptr = ST->getBasePtr();
19908 Alignment = ST->getAlign();
19909 isNonExt = !ST->isTruncatingStore();
19910 IsMasked = true;
19911 } else
19912 return false;
19913
19914 if (Subtarget->isThumb1Only()) {
19915 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19916 // must be non-extending/truncating, i32, with an offset of 4.
19917 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19918 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19919 return false;
19920 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19921 if (!RHS || RHS->getZExtValue() != 4)
19922 return false;
19923 if (Alignment < Align(4))
19924 return false;
19925
19926 Offset = Op->getOperand(1);
19927 Base = Op->getOperand(0);
19928 AM = ISD::POST_INC;
19929 return true;
19930 }
19931
19932 bool isInc;
19933 bool isLegal = false;
19934 if (VT.isVector())
19935 isLegal = Subtarget->hasMVEIntegerOps() &&
19936 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19937 Subtarget->isLittle(), Base, Offset,
19938 isInc, DAG);
19939 else {
19940 if (Subtarget->isThumb2())
19941 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19942 isInc, DAG);
19943 else
19944 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19945 isInc, DAG);
19946 }
19947 if (!isLegal)
19948 return false;
19949
19950 if (Ptr != Base) {
19951 // Swap base ptr and offset to catch more post-index load / store when
19952 // it's legal. In Thumb2 mode, offset must be an immediate.
19953 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19954 !Subtarget->isThumb2())
19955 std::swap(Base, Offset);
19956
19957 // Post-indexed load / store update the base pointer.
19958 if (Ptr != Base)
19959 return false;
19960 }
19961
19962 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19963 return true;
19964}
19965
19966 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19967 KnownBits &Known,
19968 const APInt &DemandedElts,
19969 const SelectionDAG &DAG,
19970 unsigned Depth) const {
19971 unsigned BitWidth = Known.getBitWidth();
19972 Known.resetAll();
19973 switch (Op.getOpcode()) {
19974 default: break;
19975 case ARMISD::ADDC:
19976 case ARMISD::ADDE:
19977 case ARMISD::SUBC:
19978 case ARMISD::SUBE:
19979 // Special cases when we convert a carry to a boolean.
19980 if (Op.getResNo() == 0) {
19981 SDValue LHS = Op.getOperand(0);
19982 SDValue RHS = Op.getOperand(1);
19983 // (ADDE 0, 0, C) will give us a single bit.
19984 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19985 isNullConstant(RHS)) {
19986 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19987 return;
19988 }
19989 }
19990 break;
19991 case ARMISD::CMOV: {
19992 // Bits are known zero/one if known on the LHS and RHS.
19993 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19994 if (Known.isUnknown())
19995 return;
19996
19997 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19998 Known = Known.intersectWith(KnownRHS);
19999 return;
20000 }
20001 case ISD::INTRINSIC_W_CHAIN: {
20002 Intrinsic::ID IntID =
20003 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
20004 switch (IntID) {
20005 default: return;
20006 case Intrinsic::arm_ldaex:
20007 case Intrinsic::arm_ldrex: {
20008 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
20009 unsigned MemBits = VT.getScalarSizeInBits();
20010 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
20011 return;
20012 }
20013 }
20014 }
20015 case ARMISD::BFI: {
20016 // Conservatively, we can recurse down the first operand
20017 // and just mask out all affected bits.
20018 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20019
20020 // The operand to BFI is already a mask suitable for removing the bits it
20021 // sets.
20022 const APInt &Mask = Op.getConstantOperandAPInt(2);
20023 Known.Zero &= Mask;
20024 Known.One &= Mask;
20025 return;
20026 }
20027 case ARMISD::VGETLANEs:
20028 case ARMISD::VGETLANEu: {
20029 const SDValue &SrcSV = Op.getOperand(0);
20030 EVT VecVT = SrcSV.getValueType();
20031 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20032 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20033 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20034 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20035 "VGETLANE index out of bounds");
20036 unsigned Idx = Pos->getZExtValue();
20037 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20038 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20039
20040 EVT VT = Op.getValueType();
20041 const unsigned DstSz = VT.getScalarSizeInBits();
20042 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20043 (void)SrcSz;
20044 assert(SrcSz == Known.getBitWidth());
20045 assert(DstSz > SrcSz);
20046 if (Op.getOpcode() == ARMISD::VGETLANEs)
20047 Known = Known.sext(DstSz);
20048 else {
20049 Known = Known.zext(DstSz);
20050 }
20051 assert(DstSz == Known.getBitWidth());
20052 break;
20053 }
20054 case ARMISD::VMOVrh: {
20055 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20056 assert(KnownOp.getBitWidth() == 16);
20057 Known = KnownOp.zext(32);
20058 break;
20059 }
20060 case ARMISD::CSINC:
20061 case ARMISD::CSINV:
20062 case ARMISD::CSNEG: {
20063 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20064 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20065
20066 // The result is either:
20067 // CSINC: KnownOp0 or KnownOp1 + 1
20068 // CSINV: KnownOp0 or ~KnownOp1
20069 // CSNEG: KnownOp0 or KnownOp1 * -1
20070 if (Op.getOpcode() == ARMISD::CSINC)
20071 KnownOp1 =
20072 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20073 else if (Op.getOpcode() == ARMISD::CSINV)
20074 std::swap(KnownOp1.Zero, KnownOp1.One);
20075 else if (Op.getOpcode() == ARMISD::CSNEG)
20076 KnownOp1 = KnownBits::mul(KnownOp1,
20077 KnownBits::makeConstant(APInt(32, -1)));
20078
20079 Known = KnownOp0.intersectWith(KnownOp1);
20080 break;
20081 }
20082 case ARMISD::VORRIMM:
20083 case ARMISD::VBICIMM: {
20084 unsigned Encoded = Op.getConstantOperandVal(1);
20085 unsigned DecEltBits = 0;
20086 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
20087
20088 unsigned EltBits = Op.getScalarValueSizeInBits();
20089 if (EltBits != DecEltBits) {
20090 // Be conservative: only update Known when EltBits == DecEltBits.
20091 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20092 // that changes in the future, doing nothing here is safer than risking
20093 // subtle bugs.
20094 break;
20095 }
20096
20097 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20098 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20099 APInt Imm(DecEltBits, DecodedVal);
20100
20101 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20102 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20103 break;
20104 }
20105 }
20106}
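// Editor's note -- an illustration, not part of the upstream source. The
// VMOVrh case above, for instance, lets the generic combiner prove that
// (and (ARMISD::VMOVrh x), 0xFFFF0000) is zero, since the half-to-GPR move
// zero-extends into bits [31:16]; the ldrex/ldaex intrinsic case provides the
// analogous fact for sub-word exclusive loads.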
20107
20108 bool ARMTargetLowering::targetShrinkDemandedConstant(
20109 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20110 TargetLoweringOpt &TLO) const {
20111 // Delay optimization, so we don't have to deal with illegal types, or block
20112 // optimizations.
20113 if (!TLO.LegalOps)
20114 return false;
20115
20116 // Only optimize AND for now.
20117 if (Op.getOpcode() != ISD::AND)
20118 return false;
20119
20120 EVT VT = Op.getValueType();
20121
20122 // Ignore vectors.
20123 if (VT.isVector())
20124 return false;
20125
20126 assert(VT == MVT::i32 && "Unexpected integer type");
20127
20128 // Make sure the RHS really is a constant.
20129 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20130 if (!C)
20131 return false;
20132
20133 unsigned Mask = C->getZExtValue();
20134
20135 unsigned Demanded = DemandedBits.getZExtValue();
20136 unsigned ShrunkMask = Mask & Demanded;
20137 unsigned ExpandedMask = Mask | ~Demanded;
20138
20139 // If the mask is all zeros, let the target-independent code replace the
20140 // result with zero.
20141 if (ShrunkMask == 0)
20142 return false;
20143
20144 // If the mask is all ones, erase the AND. (Currently, the target-independent
20145 // code won't do this, so we have to do it explicitly to avoid an infinite
20146 // loop in obscure cases.)
20147 if (ExpandedMask == ~0U)
20148 return TLO.CombineTo(Op, Op.getOperand(0));
20149
20150 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20151 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20152 };
20153 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20154 if (NewMask == Mask)
20155 return true;
20156 SDLoc DL(Op);
20157 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
20158 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
20159 return TLO.CombineTo(Op, NewOp);
20160 };
20161
20162 // Prefer uxtb mask.
20163 if (IsLegalMask(0xFF))
20164 return UseMask(0xFF);
20165
20166 // Prefer uxth mask.
20167 if (IsLegalMask(0xFFFF))
20168 return UseMask(0xFFFF);
20169
20170 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20171 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20172 if (ShrunkMask < 256)
20173 return UseMask(ShrunkMask);
20174
20175 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20176 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20177 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20178 return UseMask(ExpandedMask);
20179
20180 // Potential improvements:
20181 //
20182 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20183 // We could try to prefer Thumb1 immediates which can be lowered to a
20184 // two-instruction sequence.
20185 // We could try to recognize more legal ARM/Thumb2 immediates here.
20186
20187 return false;
20188}
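// Editor's worked example, not part of the upstream source. For
// (and t0, 0xF0F0) where only bits 0x00FF00F0 are demanded: the demanded bits
// the AND keeps (0xF0) fit inside 0xFF, and every demanded bit it clears lies
// above bit 7, so IsLegalMask(0xFF) holds and the constant is shrunk to 0xFF,
// letting the AND select to a single UXTB instead of materializing 0xF0F0.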
20189
20190 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20191 SDValue Op, const APInt &OriginalDemandedBits,
20192 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20193 unsigned Depth) const {
20194 unsigned Opc = Op.getOpcode();
20195
20196 switch (Opc) {
20197 case ARMISD::ASRL:
20198 case ARMISD::LSRL: {
20199 // If this is result 0 and the other result is unused, see if the demand
20200 // bits allow us to shrink this long shift into a standard small shift in
20201 // the opposite direction.
20202 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20203 isa<ConstantSDNode>(Op->getOperand(2))) {
20204 unsigned ShAmt = Op->getConstantOperandVal(2);
20205 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20206 << (32 - ShAmt)))
20207 return TLO.CombineTo(
20208 Op, TLO.DAG.getNode(
20209 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20210 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20211 }
20212 break;
20213 }
20214 case ARMISD::VBICIMM: {
20215 SDValue Op0 = Op.getOperand(0);
20216 unsigned ModImm = Op.getConstantOperandVal(1);
20217 unsigned EltBits = 0;
20218 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20219 if ((OriginalDemandedBits & Mask) == 0)
20220 return TLO.CombineTo(Op, Op0);
20221 }
20222 }
20223
20224 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20225 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20226}
20227
20228//===----------------------------------------------------------------------===//
20229// ARM Inline Assembly Support
20230//===----------------------------------------------------------------------===//
20231
20232const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20233 // At this point, we have to lower this constraint to something else, so we
20234 // lower it to an "r" or "w". However, by doing this we will force the result
20235 // to be in register, while the X constraint is much more permissive.
20236 //
20237 // Although we are correct (we are free to emit anything, without
20238 // constraints), we might break use cases that would expect us to be more
20239 // efficient and emit something else.
20240 if (!Subtarget->hasVFP2Base())
20241 return "r";
20242 if (ConstraintVT.isFloatingPoint())
20243 return "w";
20244 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20245 (ConstraintVT.getSizeInBits() == 64 ||
20246 ConstraintVT.getSizeInBits() == 128))
20247 return "w";
20248
20249 return "r";
20250}
20251
20252/// getConstraintType - Given a constraint letter, return the type of
20253 /// constraint it is for this target.
20254 ARMTargetLowering::ConstraintType
20255 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20256 unsigned S = Constraint.size();
20257 if (S == 1) {
20258 switch (Constraint[0]) {
20259 default: break;
20260 case 'l': return C_RegisterClass;
20261 case 'w': return C_RegisterClass;
20262 case 'h': return C_RegisterClass;
20263 case 'x': return C_RegisterClass;
20264 case 't': return C_RegisterClass;
20265 case 'j': return C_Immediate; // Constant for movw.
20266 // An address with a single base register. Due to the way we
20267 // currently handle addresses it is the same as an 'r' memory constraint.
20268 case 'Q': return C_Memory;
20269 }
20270 } else if (S == 2) {
20271 switch (Constraint[0]) {
20272 default: break;
20273 case 'T': return C_RegisterClass;
20274 // All 'U+' constraints are addresses.
20275 case 'U': return C_Memory;
20276 }
20277 }
20278 return TargetLowering::getConstraintType(Constraint);
20279}
20280
20281/// Examine constraint type and operand type and determine a weight value.
20282/// This object must already have been set up with the operand type
20283 /// and the current alternative constraint selected.
20284 TargetLowering::ConstraintWeight
20285 ARMTargetLowering::getSingleConstraintMatchWeight(
20286 AsmOperandInfo &info, const char *constraint) const {
20287 ConstraintWeight weight = CW_Invalid;
20288 Value *CallOperandVal = info.CallOperandVal;
20289 // If we don't have a value, we can't do a match,
20290 // but allow it at the lowest weight.
20291 if (!CallOperandVal)
20292 return CW_Default;
20293 Type *type = CallOperandVal->getType();
20294 // Look at the constraint type.
20295 switch (*constraint) {
20296 default:
20297 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20298 break;
20299 case 'l':
20300 if (type->isIntegerTy()) {
20301 if (Subtarget->isThumb())
20302 weight = CW_SpecificReg;
20303 else
20304 weight = CW_Register;
20305 }
20306 break;
20307 case 'w':
20308 if (type->isFloatingPointTy())
20309 weight = CW_Register;
20310 break;
20311 }
20312 return weight;
20313}
20314
20315static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20316 if (PR == 0 || VT == MVT::Other)
20317 return false;
20318 if (ARM::SPRRegClass.contains(PR))
20319 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20320 if (ARM::DPRRegClass.contains(PR))
20321 return VT != MVT::f64 && !VT.is64BitVector();
20322 return false;
20323}
20324
20325using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20326
20327 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20328 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20329 switch (Constraint.size()) {
20330 case 1:
20331 // GCC ARM Constraint Letters
20332 switch (Constraint[0]) {
20333 case 'l': // Low regs or general regs.
20334 if (Subtarget->isThumb())
20335 return RCPair(0U, &ARM::tGPRRegClass);
20336 return RCPair(0U, &ARM::GPRRegClass);
20337 case 'h': // High regs or no regs.
20338 if (Subtarget->isThumb())
20339 return RCPair(0U, &ARM::hGPRRegClass);
20340 break;
20341 case 'r':
20342 if (Subtarget->isThumb1Only())
20343 return RCPair(0U, &ARM::tGPRRegClass);
20344 return RCPair(0U, &ARM::GPRRegClass);
20345 case 'w':
20346 if (VT == MVT::Other)
20347 break;
20348 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20349 return RCPair(0U, &ARM::SPRRegClass);
20350 if (VT.getSizeInBits() == 64)
20351 return RCPair(0U, &ARM::DPRRegClass);
20352 if (VT.getSizeInBits() == 128)
20353 return RCPair(0U, &ARM::QPRRegClass);
20354 break;
20355 case 'x':
20356 if (VT == MVT::Other)
20357 break;
20358 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20359 return RCPair(0U, &ARM::SPR_8RegClass);
20360 if (VT.getSizeInBits() == 64)
20361 return RCPair(0U, &ARM::DPR_8RegClass);
20362 if (VT.getSizeInBits() == 128)
20363 return RCPair(0U, &ARM::QPR_8RegClass);
20364 break;
20365 case 't':
20366 if (VT == MVT::Other)
20367 break;
20368 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20369 return RCPair(0U, &ARM::SPRRegClass);
20370 if (VT.getSizeInBits() == 64)
20371 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20372 if (VT.getSizeInBits() == 128)
20373 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20374 break;
20375 }
20376 break;
20377
20378 case 2:
20379 if (Constraint[0] == 'T') {
20380 switch (Constraint[1]) {
20381 default:
20382 break;
20383 case 'e':
20384 return RCPair(0U, &ARM::tGPREvenRegClass);
20385 case 'o':
20386 return RCPair(0U, &ARM::tGPROddRegClass);
20387 }
20388 }
20389 break;
20390
20391 default:
20392 break;
20393 }
20394
20395 if (StringRef("{cc}").equals_insensitive(Constraint))
20396 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20397
20398 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20399 if (isIncompatibleReg(RCP.first, VT))
20400 return {0, nullptr};
20401 return RCP;
20402}
20403
20404/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20405 /// vector. If it is invalid, don't add anything to Ops.
20406 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20407 StringRef Constraint,
20408 std::vector<SDValue> &Ops,
20409 SelectionDAG &DAG) const {
20410 SDValue Result;
20411
20412 // Currently only support length 1 constraints.
20413 if (Constraint.size() != 1)
20414 return;
20415
20416 char ConstraintLetter = Constraint[0];
20417 switch (ConstraintLetter) {
20418 default: break;
20419 case 'j':
20420 case 'I': case 'J': case 'K': case 'L':
20421 case 'M': case 'N': case 'O':
20422 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20423 if (!C)
20424 return;
20425
20426 int64_t CVal64 = C->getSExtValue();
20427 int CVal = (int) CVal64;
20428 // None of these constraints allow values larger than 32 bits. Check
20429 // that the value fits in an int.
20430 if (CVal != CVal64)
20431 return;
20432
20433 switch (ConstraintLetter) {
20434 case 'j':
20435 // Constant suitable for movw, must be between 0 and
20436 // 65535.
20437 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20438 if (CVal >= 0 && CVal <= 65535)
20439 break;
20440 return;
20441 case 'I':
20442 if (Subtarget->isThumb1Only()) {
20443 // This must be a constant between 0 and 255, for ADD
20444 // immediates.
20445 if (CVal >= 0 && CVal <= 255)
20446 break;
20447 } else if (Subtarget->isThumb2()) {
20448 // A constant that can be used as an immediate value in a
20449 // data-processing instruction.
20450 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20451 break;
20452 } else {
20453 // A constant that can be used as an immediate value in a
20454 // data-processing instruction.
20455 if (ARM_AM::getSOImmVal(CVal) != -1)
20456 break;
20457 }
20458 return;
20459
20460 case 'J':
20461 if (Subtarget->isThumb1Only()) {
20462 // This must be a constant between -255 and -1, for negated ADD
20463 // immediates. This can be used in GCC with an "n" modifier that
20464 // prints the negated value, for use with SUB instructions. It is
20465 // not useful otherwise but is implemented for compatibility.
20466 if (CVal >= -255 && CVal <= -1)
20467 break;
20468 } else {
20469 // This must be a constant between -4095 and 4095. This is suitable
20470 // for use as the immediate offset field in LDR and STR instructions
20471 // such as LDR r0,[r1,#offset].
20472 if (CVal >= -4095 && CVal <= 4095)
20473 break;
20474 }
20475 return;
20476
20477 case 'K':
20478 if (Subtarget->isThumb1Only()) {
20479 // A 32-bit value where only one byte has a nonzero value. Exclude
20480 // zero to match GCC. This constraint is used by GCC internally for
20481 // constants that can be loaded with a move/shift combination.
20482 // It is not useful otherwise but is implemented for compatibility.
20483 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20484 break;
20485 } else if (Subtarget->isThumb2()) {
20486 // A constant whose bitwise inverse can be used as an immediate
20487 // value in a data-processing instruction. This can be used in GCC
20488 // with a "B" modifier that prints the inverted value, for use with
20489 // BIC and MVN instructions. It is not useful otherwise but is
20490 // implemented for compatibility.
20491 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20492 break;
20493 } else {
20494 // A constant whose bitwise inverse can be used as an immediate
20495 // value in a data-processing instruction. This can be used in GCC
20496 // with a "B" modifier that prints the inverted value, for use with
20497 // BIC and MVN instructions. It is not useful otherwise but is
20498 // implemented for compatibility.
20499 if (ARM_AM::getSOImmVal(~CVal) != -1)
20500 break;
20501 }
20502 return;
20503
20504 case 'L':
20505 if (Subtarget->isThumb1Only()) {
20506 // This must be a constant between -7 and 7,
20507 // for 3-operand ADD/SUB immediate instructions.
20508 if (CVal >= -7 && CVal < 7)
20509 break;
20510 } else if (Subtarget->isThumb2()) {
20511 // A constant whose negation can be used as an immediate value in a
20512 // data-processing instruction. This can be used in GCC with an "n"
20513 // modifier that prints the negated value, for use with SUB
20514 // instructions. It is not useful otherwise but is implemented for
20515 // compatibility.
20516 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20517 break;
20518 } else {
20519 // A constant whose negation can be used as an immediate value in a
20520 // data-processing instruction. This can be used in GCC with an "n"
20521 // modifier that prints the negated value, for use with SUB
20522 // instructions. It is not useful otherwise but is implemented for
20523 // compatibility.
20524 if (ARM_AM::getSOImmVal(-CVal) != -1)
20525 break;
20526 }
20527 return;
20528
20529 case 'M':
20530 if (Subtarget->isThumb1Only()) {
20531 // This must be a multiple of 4 between 0 and 1020, for
20532 // ADD sp + immediate.
20533 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20534 break;
20535 } else {
20536 // A power of two or a constant between 0 and 32. This is used in
20537 // GCC for the shift amount on shifted register operands, but it is
20538 // useful in general for any shift amounts.
20539 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20540 break;
20541 }
20542 return;
20543
20544 case 'N':
20545 if (Subtarget->isThumb1Only()) {
20546 // This must be a constant between 0 and 31, for shift amounts.
20547 if (CVal >= 0 && CVal <= 31)
20548 break;
20549 }
20550 return;
20551
20552 case 'O':
20553 if (Subtarget->isThumb1Only()) {
20554 // This must be a multiple of 4 between -508 and 508, for
20555 // ADD/SUB sp = sp + immediate.
20556 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20557 break;
20558 }
20559 return;
20560 }
20561 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20562 break;
20563 }
20564
20565 if (Result.getNode()) {
20566 Ops.push_back(Result);
20567 return;
20568 }
20569 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20570}
20571
20572static RTLIB::Libcall getDivRemLibcall(
20573 const SDNode *N, MVT::SimpleValueType SVT) {
20574 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20575 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20576 "Unhandled Opcode in getDivRemLibcall");
20577 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20578 N->getOpcode() == ISD::SREM;
20579 RTLIB::Libcall LC;
20580 switch (SVT) {
20581 default: llvm_unreachable("Unexpected request for libcall!");
20582 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20583 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20584 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20585 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20586 }
20587 return LC;
20588}
20589
20590 static TargetLowering::ArgListTy getDivRemArgList(
20591 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20592 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20593 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20594 "Unhandled Opcode in getDivRemArgList");
20595 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20596 N->getOpcode() == ISD::SREM;
20597 TargetLowering::ArgListTy Args;
20598 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20599 EVT ArgVT = N->getOperand(i).getValueType();
20600 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20601 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20602 Entry.IsSExt = isSigned;
20603 Entry.IsZExt = !isSigned;
20604 Args.push_back(Entry);
20605 }
20606 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20607 std::swap(Args[0], Args[1]);
20608 return Args;
20609}
20610
20611SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20612 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20613 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20614 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20615 "Register-based DivRem lowering only");
20616 unsigned Opcode = Op->getOpcode();
20617 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20618 "Invalid opcode for Div/Rem lowering");
20619 bool isSigned = (Opcode == ISD::SDIVREM);
20620 EVT VT = Op->getValueType(0);
20621 SDLoc dl(Op);
20622
20623 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20624 SmallVector<SDValue> Result;
20625 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20626 SDValue Res0 =
20627 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20628 SDValue Res1 =
20629 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20630 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20631 {Res0, Res1});
20632 }
20633 }
20634
20635 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20636
20637 // If the target has hardware divide, use divide + multiply + subtract:
20638 // div = a / b
20639 // rem = a - b * div
20640 // return {div, rem}
20641 // This should be lowered into UDIV/SDIV + MLS later on.
20642 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20643 : Subtarget->hasDivideInARMMode();
20644 if (hasDivide && Op->getValueType(0).isSimple() &&
20645 Op->getSimpleValueType(0) == MVT::i32) {
20646 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20647 const SDValue Dividend = Op->getOperand(0);
20648 const SDValue Divisor = Op->getOperand(1);
20649 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20650 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20651 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20652
20653 SDValue Values[2] = {Div, Rem};
20654 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20655 }
20656
20657 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20658 VT.getSimpleVT().SimpleTy);
20659 SDValue InChain = DAG.getEntryNode();
20660
20662 DAG.getContext(),
20663 Subtarget);
20664
20667
20668 Type *RetTy = StructType::get(Ty, Ty);
20669
20670 if (Subtarget->isTargetWindows())
20671 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20672
20673 TargetLowering::CallLoweringInfo CLI(DAG);
20674 CLI.setDebugLoc(dl).setChain(InChain)
20675 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20677
20678 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20679 return CallInfo.first;
20680}
20681
20682// Lowers REM using divmod helpers
20683// see RTABI section 4.2/4.3
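// For example (a sketch assuming the AEABI runtime helpers are used):
//   %r = srem i32 %a, %b
// becomes a call to __aeabi_idivmod, which returns the quotient and the
// remainder in {r0, r1}; only the second (remainder) result is used here.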
20684SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20685 EVT VT = N->getValueType(0);
20686
20687 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20689 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20690 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20691 Result[0], Result[1]);
20692 }
20693
20694 // Build return types (div and rem)
20695 std::vector<Type*> RetTyParams;
20696 Type *RetTyElement;
20697
20698 switch (VT.getSimpleVT().SimpleTy) {
20699 default: llvm_unreachable("Unexpected request for libcall!");
20700 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20701 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20702 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20703 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20704 }
20705
20706 RetTyParams.push_back(RetTyElement);
20707 RetTyParams.push_back(RetTyElement);
20708 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20709 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20710
20711 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20712 SimpleTy);
20713 SDValue InChain = DAG.getEntryNode();
20715 Subtarget);
20716 bool isSigned = N->getOpcode() == ISD::SREM;
20719
20720 if (Subtarget->isTargetWindows())
20721 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20722
20723 // Lower call
20724 CallLoweringInfo CLI(DAG);
20725 CLI.setChain(InChain)
20726 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20728 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20729
20730 // Return second (rem) result operand (first contains div)
20731 SDNode *ResNode = CallResult.first.getNode();
20732 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20733 return ResNode->getOperand(1);
20734}
20735
20736SDValue
20737ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20738 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20739 SDLoc DL(Op);
20740
20741 // Get the inputs.
20742 SDValue Chain = Op.getOperand(0);
20743 SDValue Size = Op.getOperand(1);
20744
20746 "no-stack-arg-probe")) {
20747 MaybeAlign Align =
20748 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20749 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20750 Chain = SP.getValue(1);
20751 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20752 if (Align)
20753 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20754 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20755 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20756 SDValue Ops[2] = { SP, Chain };
20757 return DAG.getMergeValues(Ops, DL);
20758 }
20759
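// On Windows on ARM the stack probe (__chkstk) takes the allocation size in
// 4-byte words in R4 (a note on the convention assumed here), hence the
// shift by 2 before emitting the WIN__CHKSTK node; the adjusted SP is read
// back afterwards.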
20760 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20761 DAG.getConstant(2, DL, MVT::i32));
20762
20763 SDValue Glue;
20764 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20765 Glue = Chain.getValue(1);
20766
20767 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20768 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20769
20770 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20771 Chain = NewSP.getValue(1);
20772
20773 SDValue Ops[2] = { NewSP, Chain };
20774 return DAG.getMergeValues(Ops, DL);
20775}
20776
20777SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20778 bool IsStrict = Op->isStrictFPOpcode();
20779 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20780 const unsigned DstSz = Op.getValueType().getSizeInBits();
20781 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20782 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20783 "Unexpected type for custom-lowering FP_EXTEND");
20784
20785 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20786 "With both FP DP and 16, any FP conversion is legal!");
20787
20788 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20789 "With FP16, 16 to 32 conversion is legal!");
20790
20791 // Converting from 32 -> 64 is valid if we have FP64.
20792 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20793 // FIXME: Remove this when we have strict fp instruction selection patterns
20794 if (IsStrict) {
20795 SDLoc Loc(Op);
20796 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20797 Loc, Op.getValueType(), SrcVal);
20798 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20799 }
20800 return Op;
20801 }
20802
20803 // Otherwise we are either converting from 16 -> 64 without FP16 and/or
20804 // without double-precision FP (or without Armv8 FP), so we must do it in
20805 // two steps,
20806 // or we are converting from 32 -> 64 without double-precision FP, or from
20807 // 16 -> 32 without FP16, so we must make a libcall.
20808 SDLoc Loc(Op);
20809 RTLIB::Libcall LC;
20810 MakeLibCallOptions CallOptions;
20811 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20812 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20813 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20814 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20815 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20816 if (Supported) {
20817 if (IsStrict) {
20818 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20819 {DstVT, MVT::Other}, {Chain, SrcVal});
20820 Chain = SrcVal.getValue(1);
20821 } else {
20822 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20823 }
20824 } else {
20825 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20826 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20827 "Unexpected type for custom-lowering FP_EXTEND");
20828 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20829 Loc, Chain);
20830 }
20831 }
20832
20833 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20834}
20835
20836SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20837 bool IsStrict = Op->isStrictFPOpcode();
20838
20839 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20840 EVT SrcVT = SrcVal.getValueType();
20841 EVT DstVT = Op.getValueType();
20842 const unsigned DstSz = Op.getValueType().getSizeInBits();
20843 const unsigned SrcSz = SrcVT.getSizeInBits();
20844 (void)DstSz;
20845 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20846 "Unexpected type for custom-lowering FP_ROUND");
20847
20848 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20849 "With both FP DP and 16, any FP conversion is legal!");
20850
20851 SDLoc Loc(Op);
20852
20853 // A single-instruction 32 -> 16 conversion is valid if we have FP16.
20854 if (SrcSz == 32 && Subtarget->hasFP16())
20855 return Op;
20856
20857 // Otherwise use a libcall: 32 -> 16, or 64 -> [32, 16]
20858 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20859 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20860 "Unexpected type for custom-lowering FP_ROUND");
20861 MakeLibCallOptions CallOptions;
20862 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20864 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20865 Loc, Chain);
20866 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20867}
20868
20869bool
20871 // The ARM target isn't yet aware of offsets.
20872 return false;
20873}
20874
20876 if (v == 0xffffffff)
20877 return false;
20878
20879 // There can be 1's on either or both "outsides"; all the "inside"
20880 // bits must be 0's.
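// For example, 0xF000000F is accepted (its complement 0x0FFFFFF0 is a
// single shifted mask), while 0xFF00FF00 is rejected.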
20881 return isShiftedMask_32(~v);
20882}
20883
20884/// isFPImmLegal - Returns true if the target can instruction select the
20885/// specified FP immediate natively. If false, the legalizer will
20886/// materialize the FP immediate as a load from a constant pool.
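/// For example, with VFPv3 a constant such as 1.0 or -0.5 can be materialized
/// directly as a VMOV immediate, whereas 0.1 has no encoding and would be
/// loaded from a constant pool instead.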
20888 bool ForCodeSize) const {
20889 if (!Subtarget->hasVFP3Base())
20890 return false;
20891 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20892 return ARM_AM::getFP16Imm(Imm) != -1;
20893 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20894 ARM_AM::getFP32FP16Imm(Imm) != -1)
20895 return true;
20896 if (VT == MVT::f32)
20897 return ARM_AM::getFP32Imm(Imm) != -1;
20898 if (VT == MVT::f64 && Subtarget->hasFP64())
20899 return ARM_AM::getFP64Imm(Imm) != -1;
20900 return false;
20901}
20902
20903/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20904/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20905/// specified in the intrinsic calls.
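/// For example, a call to llvm.arm.neon.vld2.v4i32 returns two <4 x i32>
/// values (256 bits in total), so memVT below is conservatively recorded as
/// a <4 x i64> vector covering the whole transfer.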
20907 const CallInst &I,
20908 MachineFunction &MF,
20909 unsigned Intrinsic) const {
20910 switch (Intrinsic) {
20911 case Intrinsic::arm_neon_vld1:
20912 case Intrinsic::arm_neon_vld2:
20913 case Intrinsic::arm_neon_vld3:
20914 case Intrinsic::arm_neon_vld4:
20915 case Intrinsic::arm_neon_vld2lane:
20916 case Intrinsic::arm_neon_vld3lane:
20917 case Intrinsic::arm_neon_vld4lane:
20918 case Intrinsic::arm_neon_vld2dup:
20919 case Intrinsic::arm_neon_vld3dup:
20920 case Intrinsic::arm_neon_vld4dup: {
20921 Info.opc = ISD::INTRINSIC_W_CHAIN;
20922 // Conservatively set memVT to the entire set of vectors loaded.
20923 auto &DL = I.getDataLayout();
20924 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20925 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20926 Info.ptrVal = I.getArgOperand(0);
20927 Info.offset = 0;
20928 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20929 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20930 // volatile loads with NEON intrinsics not supported
20931 Info.flags = MachineMemOperand::MOLoad;
20932 return true;
20933 }
20934 case Intrinsic::arm_neon_vld1x2:
20935 case Intrinsic::arm_neon_vld1x3:
20936 case Intrinsic::arm_neon_vld1x4: {
20937 Info.opc = ISD::INTRINSIC_W_CHAIN;
20938 // Conservatively set memVT to the entire set of vectors loaded.
20939 auto &DL = I.getDataLayout();
20940 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20941 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20942 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20943 Info.offset = 0;
20944 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20945 // volatile loads with NEON intrinsics not supported
20946 Info.flags = MachineMemOperand::MOLoad;
20947 return true;
20948 }
20949 case Intrinsic::arm_neon_vst1:
20950 case Intrinsic::arm_neon_vst2:
20951 case Intrinsic::arm_neon_vst3:
20952 case Intrinsic::arm_neon_vst4:
20953 case Intrinsic::arm_neon_vst2lane:
20954 case Intrinsic::arm_neon_vst3lane:
20955 case Intrinsic::arm_neon_vst4lane: {
20956 Info.opc = ISD::INTRINSIC_VOID;
20957 // Conservatively set memVT to the entire set of vectors stored.
20958 auto &DL = I.getDataLayout();
20959 unsigned NumElts = 0;
20960 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20961 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20962 if (!ArgTy->isVectorTy())
20963 break;
20964 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20965 }
20966 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20967 Info.ptrVal = I.getArgOperand(0);
20968 Info.offset = 0;
20969 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20970 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20971 // volatile stores with NEON intrinsics not supported
20972 Info.flags = MachineMemOperand::MOStore;
20973 return true;
20974 }
20975 case Intrinsic::arm_neon_vst1x2:
20976 case Intrinsic::arm_neon_vst1x3:
20977 case Intrinsic::arm_neon_vst1x4: {
20978 Info.opc = ISD::INTRINSIC_VOID;
20979 // Conservatively set memVT to the entire set of vectors stored.
20980 auto &DL = I.getDataLayout();
20981 unsigned NumElts = 0;
20982 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20983 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20984 if (!ArgTy->isVectorTy())
20985 break;
20986 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20987 }
20988 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20989 Info.ptrVal = I.getArgOperand(0);
20990 Info.offset = 0;
20991 Info.align = I.getParamAlign(0).valueOrOne();
20992 // volatile stores with NEON intrinsics not supported
20993 Info.flags = MachineMemOperand::MOStore;
20994 return true;
20995 }
20996 case Intrinsic::arm_mve_vld2q:
20997 case Intrinsic::arm_mve_vld4q: {
20998 Info.opc = ISD::INTRINSIC_W_CHAIN;
20999 // Conservatively set memVT to the entire set of vectors loaded.
21000 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
21001 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
21002 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21003 Info.ptrVal = I.getArgOperand(0);
21004 Info.offset = 0;
21005 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21006 // volatile loads with MVE intrinsics not supported
21007 Info.flags = MachineMemOperand::MOLoad;
21008 return true;
21009 }
21010 case Intrinsic::arm_mve_vst2q:
21011 case Intrinsic::arm_mve_vst4q: {
21012 Info.opc = ISD::INTRINSIC_VOID;
21013 // Conservatively set memVT to the entire set of vectors stored.
21014 Type *VecTy = I.getArgOperand(1)->getType();
21015 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21016 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21017 Info.ptrVal = I.getArgOperand(0);
21018 Info.offset = 0;
21019 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21020 // volatile stores with MVE intrinsics not supported
21021 Info.flags = MachineMemOperand::MOStore;
21022 return true;
21023 }
21024 case Intrinsic::arm_mve_vldr_gather_base:
21025 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21026 Info.opc = ISD::INTRINSIC_W_CHAIN;
21027 Info.ptrVal = nullptr;
21028 Info.memVT = MVT::getVT(I.getType());
21029 Info.align = Align(1);
21030 Info.flags |= MachineMemOperand::MOLoad;
21031 return true;
21032 }
21033 case Intrinsic::arm_mve_vldr_gather_base_wb:
21034 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21035 Info.opc = ISD::INTRINSIC_W_CHAIN;
21036 Info.ptrVal = nullptr;
21037 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21038 Info.align = Align(1);
21039 Info.flags |= MachineMemOperand::MOLoad;
21040 return true;
21041 }
21042 case Intrinsic::arm_mve_vldr_gather_offset:
21043 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21044 Info.opc = ISD::INTRINSIC_W_CHAIN;
21045 Info.ptrVal = nullptr;
21046 MVT DataVT = MVT::getVT(I.getType());
21047 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21048 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21049 DataVT.getVectorNumElements());
21050 Info.align = Align(1);
21051 Info.flags |= MachineMemOperand::MOLoad;
21052 return true;
21053 }
21054 case Intrinsic::arm_mve_vstr_scatter_base:
21055 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21056 Info.opc = ISD::INTRINSIC_VOID;
21057 Info.ptrVal = nullptr;
21058 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21059 Info.align = Align(1);
21060 Info.flags |= MachineMemOperand::MOStore;
21061 return true;
21062 }
21063 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21064 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21065 Info.opc = ISD::INTRINSIC_W_CHAIN;
21066 Info.ptrVal = nullptr;
21067 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21068 Info.align = Align(1);
21069 Info.flags |= MachineMemOperand::MOStore;
21070 return true;
21071 }
21072 case Intrinsic::arm_mve_vstr_scatter_offset:
21073 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21074 Info.opc = ISD::INTRINSIC_VOID;
21075 Info.ptrVal = nullptr;
21076 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21077 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21078 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21079 DataVT.getVectorNumElements());
21080 Info.align = Align(1);
21081 Info.flags |= MachineMemOperand::MOStore;
21082 return true;
21083 }
21084 case Intrinsic::arm_ldaex:
21085 case Intrinsic::arm_ldrex: {
21086 auto &DL = I.getDataLayout();
21087 Type *ValTy = I.getParamElementType(0);
21088 Info.opc = ISD::INTRINSIC_W_CHAIN;
21089 Info.memVT = MVT::getVT(ValTy);
21090 Info.ptrVal = I.getArgOperand(0);
21091 Info.offset = 0;
21092 Info.align = DL.getABITypeAlign(ValTy);
21094 return true;
21095 }
21096 case Intrinsic::arm_stlex:
21097 case Intrinsic::arm_strex: {
21098 auto &DL = I.getDataLayout();
21099 Type *ValTy = I.getParamElementType(1);
21100 Info.opc = ISD::INTRINSIC_W_CHAIN;
21101 Info.memVT = MVT::getVT(ValTy);
21102 Info.ptrVal = I.getArgOperand(1);
21103 Info.offset = 0;
21104 Info.align = DL.getABITypeAlign(ValTy);
21106 return true;
21107 }
21108 case Intrinsic::arm_stlexd:
21109 case Intrinsic::arm_strexd:
21110 Info.opc = ISD::INTRINSIC_W_CHAIN;
21111 Info.memVT = MVT::i64;
21112 Info.ptrVal = I.getArgOperand(2);
21113 Info.offset = 0;
21114 Info.align = Align(8);
21116 return true;
21117
21118 case Intrinsic::arm_ldaexd:
21119 case Intrinsic::arm_ldrexd:
21120 Info.opc = ISD::INTRINSIC_W_CHAIN;
21121 Info.memVT = MVT::i64;
21122 Info.ptrVal = I.getArgOperand(0);
21123 Info.offset = 0;
21124 Info.align = Align(8);
21126 return true;
21127
21128 default:
21129 break;
21130 }
21131
21132 return false;
21133}
21134
21135/// Returns true if it is beneficial to convert a load of a constant
21136/// to just the constant itself.
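/// For example, a 32-bit constant such as 0x12345678 can be rematerialized
/// cheaply (e.g. with a movw/movt pair on targets that have them) instead of
/// being kept in memory, so all integer types up to 32 bits report true.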
21138 Type *Ty) const {
21139 assert(Ty->isIntegerTy());
21140
21141 unsigned Bits = Ty->getPrimitiveSizeInBits();
21142 if (Bits == 0 || Bits > 32)
21143 return false;
21144 return true;
21145}
21146
21148 unsigned Index) const {
21150 return false;
21151
21152 return (Index == 0 || Index == ResVT.getVectorNumElements());
21153}
21154
21156 ARM_MB::MemBOpt Domain) const {
21157 // First, if the target has no DMB, see what fallback we can use.
21158 if (!Subtarget->hasDataBarrier()) {
21159 // Some ARMv6 CPUs can support data barriers with an mcr instruction.
21160 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21161 // here.
21162 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21163 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21164 Builder.getInt32(0), Builder.getInt32(7),
21165 Builder.getInt32(10), Builder.getInt32(5)};
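// This is the CP15 equivalent of a barrier, i.e. roughly
// "mcr p15, #0, <Rt>, c7, c10, #5" (a sketch of what these operands select).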
21166 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
21167 } else {
21168 // Instead of using barriers, atomic accesses on these subtargets use
21169 // libcalls.
21170 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21171 }
21172 } else {
21173 // Only a full system barrier exists in the M-class architectures.
21174 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21175 Constant *CDomain = Builder.getInt32(Domain);
21176 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
21177 }
21178}
21179
21180// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
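// For example, following that mapping a seq_cst store on an ARMv7 target
// ends up bracketed roughly as:
//   dmb ish    ; leading fence
//   str r1, [r0]
//   dmb ish    ; trailing fence
// (a sketch; the leading barrier may be DMB ISHST on some subtargets, see
// below).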
21182 Instruction *Inst,
21183 AtomicOrdering Ord) const {
21184 switch (Ord) {
21187 llvm_unreachable("Invalid fence: unordered/non-atomic");
21190 return nullptr; // Nothing to do
21192 if (!Inst->hasAtomicStore())
21193 return nullptr; // Nothing to do
21194 [[fallthrough]];
21197 if (Subtarget->preferISHSTBarriers())
21198 return makeDMB(Builder, ARM_MB::ISHST);
21199 // FIXME: add a comment with a link to documentation justifying this.
21200 else
21201 return makeDMB(Builder, ARM_MB::ISH);
21202 }
21203 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21204}
21205
21207 Instruction *Inst,
21208 AtomicOrdering Ord) const {
21209 switch (Ord) {
21212 llvm_unreachable("Invalid fence: unordered/not-atomic");
21215 return nullptr; // Nothing to do
21219 return makeDMB(Builder, ARM_MB::ISH);
21220 }
21221 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21222}
21223
21224 // Loads and stores smaller than 64 bits are already atomic; ones above that
21225 // are doomed anyway, so defer to the default libcall and blame the OS when
21226 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21227 // anything for those.
21230 bool has64BitAtomicStore;
21231 if (Subtarget->isMClass())
21232 has64BitAtomicStore = false;
21233 else if (Subtarget->isThumb())
21234 has64BitAtomicStore = Subtarget->hasV7Ops();
21235 else
21236 has64BitAtomicStore = Subtarget->hasV6Ops();
21237
21238 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21239 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21241}
21242
21243 // Loads and stores smaller than 64 bits are already atomic; ones above that
21244 // are doomed anyway, so defer to the default libcall and blame the OS when
21245 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21246// anything for those.
21247// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21248// guarantee, see DDI0406C ARM architecture reference manual,
21249// sections A8.8.72-74 LDRD)
21252 bool has64BitAtomicLoad;
21253 if (Subtarget->isMClass())
21254 has64BitAtomicLoad = false;
21255 else if (Subtarget->isThumb())
21256 has64BitAtomicLoad = Subtarget->hasV7Ops();
21257 else
21258 has64BitAtomicLoad = Subtarget->hasV6Ops();
21259
21260 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21261 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21263}
21264
21265// For the real atomic operations, we have ldrex/strex up to 32 bits,
21266// and up to 64 bits on the non-M profiles
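// For example, when LLSC is returned here an i32 "atomicrmw add" is later
// expanded (by AtomicExpandPass) into a loop along the lines of:
//   1: ldrex  r1, [r0]
//      add    r1, r1, r2
//      strex  r3, r1, [r0]
//      cmp    r3, #0
//      bne    1b
// This is only a sketch of the eventual code.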
21269 if (AI->isFloatingPointOperation())
21271
21272 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21273 bool hasAtomicRMW;
21274 if (Subtarget->isMClass())
21275 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21276 else if (Subtarget->isThumb())
21277 hasAtomicRMW = Subtarget->hasV7Ops();
21278 else
21279 hasAtomicRMW = Subtarget->hasV6Ops();
21280 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21281 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21282 // implement atomicrmw without spilling. If the target address is also on
21283 // the stack and close enough to the spill slot, this can lead to a
21284 // situation where the monitor always gets cleared and the atomic operation
21285 // can never succeed. So at -O0 lower this operation to a CAS loop.
21286 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21289 }
21291}
21292
21293// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21294// bits, and up to 64 bits on the non-M profiles.
21297 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21298 // implement cmpxchg without spilling. If the address being exchanged is also
21299 // on the stack and close enough to the spill slot, this can lead to a
21300 // situation where the monitor always gets cleared and the atomic operation
21301 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21302 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21303 bool HasAtomicCmpXchg;
21304 if (Subtarget->isMClass())
21305 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21306 else if (Subtarget->isThumb())
21307 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21308 else
21309 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21310 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21311 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21314}
21315
21317 const Instruction *I) const {
21318 return InsertFencesForAtomic;
21319}
21320
21322 // ROPI/RWPI are not supported currently.
21323 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21324}
21325
21327 // The MSVC CRT provides functionality for stack protection.
21328 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21329 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21330
21331 RTLIB::LibcallImpl SecurityCookieVar =
21332 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21333 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21334 SecurityCookieVar != RTLIB::Unsupported) {
21335 // MSVC CRT has a global variable holding security cookie.
21336 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21337 PointerType::getUnqual(M.getContext()));
21338
21339 // MSVC CRT has a function to validate security cookie.
21340 FunctionCallee SecurityCheckCookie =
21341 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21342 Type::getVoidTy(M.getContext()),
21343 PointerType::getUnqual(M.getContext()));
21344 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21345 F->addParamAttr(0, Attribute::AttrKind::InReg);
21346 }
21347
21349}
21350
21352 unsigned &Cost) const {
21353 // If we do not have NEON, vector types are not natively supported.
21354 if (!Subtarget->hasNEON())
21355 return false;
21356
21357 // Floating point values and vector values map to the same register file.
21358 // Therefore, although we could do a store + extract of a vector type, it is
21359 // better to leave it as a float, since we have more freedom in the addressing
21360 // modes for those.
21361 if (VectorTy->isFPOrFPVectorTy())
21362 return false;
21363
21364 // If the index is unknown at compile time, this is very expensive to lower
21365 // and it is not possible to combine the store with the extract.
21366 if (!isa<ConstantInt>(Idx))
21367 return false;
21368
21369 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21370 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21371 // We can do a store + vector extract on any vector that fits perfectly in a D
21372 // or Q register.
21373 if (BitWidth == 64 || BitWidth == 128) {
21374 Cost = 0;
21375 return true;
21376 }
21377 return false;
21378}
21379
21381 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21382 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21383 unsigned Opcode = Op.getOpcode();
21384 switch (Opcode) {
21385 case ARMISD::VORRIMM:
21386 case ARMISD::VBICIMM:
21387 return false;
21388 }
21390 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21391}
21392
21394 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21395}
21396
21398 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21399}
21400
21402 const Instruction &AndI) const {
21403 if (!Subtarget->hasV7Ops())
21404 return false;
21405
21406 // Sink the `and` instruction only if the mask would fit into a modified
21407 // immediate operand.
21409 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21410 return false;
21411 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21412 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21413 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21414}
21415
21418 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21419 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21422 ExpansionFactor);
21423}
21424
21426 Value *Addr,
21427 AtomicOrdering Ord) const {
21428 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21429 bool IsAcquire = isAcquireOrStronger(Ord);
21430
21431 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21432 // intrinsic must return {i32, i32} and we have to recombine them into a
21433 // single i64 here.
21434 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21436 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21437
21438 Value *LoHi =
21439 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21440
21441 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21442 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21443 if (!Subtarget->isLittle())
21444 std::swap (Lo, Hi);
21445 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21446 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21447 return Builder.CreateOr(
21448 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21449 }
21450
21451 Type *Tys[] = { Addr->getType() };
21452 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21453 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21454
21455 CI->addParamAttr(
21456 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21457 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21458}
21459
21461 IRBuilderBase &Builder) const {
21462 if (!Subtarget->hasV7Ops())
21463 return;
21464 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21465}
21466
21468 Value *Val, Value *Addr,
21469 AtomicOrdering Ord) const {
21470 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21471 bool IsRelease = isReleaseOrStronger(Ord);
21472
21473 // Since the intrinsics must have legal type, the i64 intrinsics take two
21474 // parameters: "i32, i32". We must marshal Val into the appropriate form
21475 // before the call.
21476 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21478 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21479 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21480
21481 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21482 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21483 if (!Subtarget->isLittle())
21484 std::swap(Lo, Hi);
21485 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21486 }
21487
21488 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21489 Type *Tys[] = { Addr->getType() };
21491
21492 CallInst *CI = Builder.CreateCall(
21493 Strex, {Builder.CreateZExtOrBitCast(
21494 Val, Strex->getFunctionType()->getParamType(0)),
21495 Addr});
21496 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21497 Val->getType()));
21498 return CI;
21499}
21500
21501
21503 return Subtarget->isMClass();
21504}
21505
21506/// A helper function for determining the number of interleaved accesses we
21507/// will generate when lowering accesses of the given type.
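/// For example, a 512-bit <16 x i32> vector is counted as 4 accesses of up to
/// 128 bits each, while a 64-bit <2 x i32> vector needs a single access.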
21508unsigned
21510 const DataLayout &DL) const {
21511 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21512}
21513
21515 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21516 const DataLayout &DL) const {
21517
21518 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21519 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21520
21521 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21522 return false;
21523
21524 // Ensure the vector doesn't have f16 elements. Even though we could do an
21525 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21526 // f32.
21527 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21528 return false;
21529 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21530 return false;
21531
21532 // Ensure the number of vector elements is greater than 1.
21533 if (VecTy->getNumElements() < 2)
21534 return false;
21535
21536 // Ensure the element type is legal.
21537 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21538 return false;
21539 // And check that the alignment is high enough under MVE.
21540 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21541 return false;
21542
21543 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21544 // 128 will be split into multiple interleaved accesses.
21545 if (Subtarget->hasNEON() && VecSize == 64)
21546 return true;
21547 return VecSize % 128 == 0;
21548}
21549
21551 if (Subtarget->hasNEON())
21552 return 4;
21553 if (Subtarget->hasMVEIntegerOps())
21556}
21557
21558/// Lower an interleaved load into a vldN intrinsic.
21559///
21560/// E.g. Lower an interleaved load (Factor = 2):
21561/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21562/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21563/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21564///
21565/// Into:
21566/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21567/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21568/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21570 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21571 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21572 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21573 "Invalid interleave factor");
21574 assert(!Shuffles.empty() && "Empty shufflevector input");
21575 assert(Shuffles.size() == Indices.size() &&
21576 "Unmatched number of shufflevectors and indices");
21577
21578 auto *LI = dyn_cast<LoadInst>(Load);
21579 if (!LI)
21580 return false;
21581 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21582
21583 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21584 Type *EltTy = VecTy->getElementType();
21585
21586 const DataLayout &DL = LI->getDataLayout();
21587 Align Alignment = LI->getAlign();
21588
21589 // Skip if we do not have NEON and skip illegal vector types. We can
21590 // "legalize" wide vector types into multiple interleaved accesses as long as
21591 // the vector types are divisible by 128.
21592 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21593 return false;
21594
21595 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21596
21597 // A pointer vector cannot be the return type of the ldN intrinsics. Need to
21598 // load integer vectors first and then convert to pointer vectors.
21599 if (EltTy->isPointerTy())
21600 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21601
21602 IRBuilder<> Builder(LI);
21603
21604 // The base address of the load.
21605 Value *BaseAddr = LI->getPointerOperand();
21606
21607 if (NumLoads > 1) {
21608 // If we're going to generate more than one load, reset the sub-vector type
21609 // to something legal.
21610 VecTy = FixedVectorType::get(VecTy->getElementType(),
21611 VecTy->getNumElements() / NumLoads);
21612 }
21613
21614 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21615
21616 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21617 if (Subtarget->hasNEON()) {
21618 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21619 Type *Tys[] = {VecTy, PtrTy};
21620 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21621 Intrinsic::arm_neon_vld3,
21622 Intrinsic::arm_neon_vld4};
21623
21625 Ops.push_back(BaseAddr);
21626 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21627
21628 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21629 /*FMFSource=*/nullptr, "vldN");
21630 } else {
21631 assert((Factor == 2 || Factor == 4) &&
21632 "expected interleave factor of 2 or 4 for MVE");
21633 Intrinsic::ID LoadInts =
21634 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21635 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21636 Type *Tys[] = {VecTy, PtrTy};
21637
21639 Ops.push_back(BaseAddr);
21640 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21641 "vldN");
21642 }
21643 };
21644
21645 // Holds sub-vectors extracted from the load intrinsic return values. The
21646 // sub-vectors are associated with the shufflevector instructions they will
21647 // replace.
21649
21650 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21651 // If we're generating more than one load, compute the base address of
21652 // subsequent loads as an offset from the previous.
21653 if (LoadCount > 0)
21654 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21655 VecTy->getNumElements() * Factor);
21656
21657 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21658
21659 // Replace uses of each shufflevector with the corresponding vector loaded
21660 // by ldN.
21661 for (unsigned i = 0; i < Shuffles.size(); i++) {
21662 ShuffleVectorInst *SV = Shuffles[i];
21663 unsigned Index = Indices[i];
21664
21665 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21666
21667 // Convert the integer vector to pointer vector if the element is pointer.
21668 if (EltTy->isPointerTy())
21669 SubVec = Builder.CreateIntToPtr(
21670 SubVec,
21672
21673 SubVecs[SV].push_back(SubVec);
21674 }
21675 }
21676
21677 // Replace uses of the shufflevector instructions with the sub-vectors
21678 // returned by the load intrinsic. If a shufflevector instruction is
21679 // associated with more than one sub-vector, those sub-vectors will be
21680 // concatenated into a single wide vector.
21681 for (ShuffleVectorInst *SVI : Shuffles) {
21682 auto &SubVec = SubVecs[SVI];
21683 auto *WideVec =
21684 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21685 SVI->replaceAllUsesWith(WideVec);
21686 }
21687
21688 return true;
21689}
21690
21691/// Lower an interleaved store into a vstN intrinsic.
21692///
21693/// E.g. Lower an interleaved store (Factor = 3):
21694/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21695/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21696/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21697///
21698/// Into:
21699/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21700/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21701/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21702/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21703///
21704/// Note that the new shufflevectors will be removed and we'll only generate one
21705/// vst3 instruction in CodeGen.
21706///
21707/// Example for a more general valid mask (Factor 3). Lower:
21708/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21709/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21710/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21711///
21712/// Into:
21713/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21714/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21715/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21716/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21718 Value *LaneMask,
21719 ShuffleVectorInst *SVI,
21720 unsigned Factor,
21721 const APInt &GapMask) const {
21722 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21723 "Invalid interleave factor");
21724 auto *SI = dyn_cast<StoreInst>(Store);
21725 if (!SI)
21726 return false;
21727 assert(!LaneMask && GapMask.popcount() == Factor &&
21728 "Unexpected mask on store");
21729
21730 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21731 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21732
21733 unsigned LaneLen = VecTy->getNumElements() / Factor;
21734 Type *EltTy = VecTy->getElementType();
21735 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21736
21737 const DataLayout &DL = SI->getDataLayout();
21738 Align Alignment = SI->getAlign();
21739
21740 // Skip if we do not have NEON and skip illegal vector types. We can
21741 // "legalize" wide vector types into multiple interleaved accesses as long as
21742 // the vector types are divisible by 128.
21743 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21744 return false;
21745
21746 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21747
21748 Value *Op0 = SVI->getOperand(0);
21749 Value *Op1 = SVI->getOperand(1);
21750 IRBuilder<> Builder(SI);
21751
21752 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21753 // vectors to integer vectors.
21754 if (EltTy->isPointerTy()) {
21755 Type *IntTy = DL.getIntPtrType(EltTy);
21756
21757 // Convert to the corresponding integer vector.
21758 auto *IntVecTy =
21760 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21761 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21762
21763 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21764 }
21765
21766 // The base address of the store.
21767 Value *BaseAddr = SI->getPointerOperand();
21768
21769 if (NumStores > 1) {
21770 // If we're going to generate more than one store, reset the lane length
21771 // and sub-vector type to something legal.
21772 LaneLen /= NumStores;
21773 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21774 }
21775
21776 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21777
21778 auto Mask = SVI->getShuffleMask();
21779
21780 auto createStoreIntrinsic = [&](Value *BaseAddr,
21781 SmallVectorImpl<Value *> &Shuffles) {
21782 if (Subtarget->hasNEON()) {
21783 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21784 Intrinsic::arm_neon_vst3,
21785 Intrinsic::arm_neon_vst4};
21786 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21787 Type *Tys[] = {PtrTy, SubVecTy};
21788
21790 Ops.push_back(BaseAddr);
21791 append_range(Ops, Shuffles);
21792 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21793 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21794 } else {
21795 assert((Factor == 2 || Factor == 4) &&
21796 "expected interleave factor of 2 or 4 for MVE");
21797 Intrinsic::ID StoreInts =
21798 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21799 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21800 Type *Tys[] = {PtrTy, SubVecTy};
21801
21803 Ops.push_back(BaseAddr);
21804 append_range(Ops, Shuffles);
21805 for (unsigned F = 0; F < Factor; F++) {
21806 Ops.push_back(Builder.getInt32(F));
21807 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21808 Ops.pop_back();
21809 }
21810 }
21811 };
21812
21813 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21814 // If we're generating more than one store, compute the base address of
21815 // subsequent stores as an offset from the previous.
21816 if (StoreCount > 0)
21817 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21818 BaseAddr, LaneLen * Factor);
21819
21820 SmallVector<Value *, 4> Shuffles;
21821
21822 // Split the shufflevector operands into sub vectors for the new vstN call.
21823 for (unsigned i = 0; i < Factor; i++) {
21824 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21825 if (Mask[IdxI] >= 0) {
21826 Shuffles.push_back(Builder.CreateShuffleVector(
21827 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21828 } else {
21829 unsigned StartMask = 0;
21830 for (unsigned j = 1; j < LaneLen; j++) {
21831 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21832 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21833 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21834 break;
21835 }
21836 }
21837 // Note: If all elements in a chunk are undefs, StartMask=0!
21838 // Note: Filling undef gaps with random elements is ok, since
21839 // those elements were being written anyway (with undefs).
21840 // In the case of all undefs we default to using elements from 0.
21841 // Note: StartMask cannot be negative; this is checked in
21842 // isReInterleaveMask.
21843 Shuffles.push_back(Builder.CreateShuffleVector(
21844 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21845 }
21846 }
21847
21848 createStoreIntrinsic(BaseAddr, Shuffles);
21849 }
21850 return true;
21851}
21852
21860
21862 uint64_t &Members) {
21863 if (auto *ST = dyn_cast<StructType>(Ty)) {
21864 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21865 uint64_t SubMembers = 0;
21866 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21867 return false;
21868 Members += SubMembers;
21869 }
21870 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21871 uint64_t SubMembers = 0;
21872 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21873 return false;
21874 Members += SubMembers * AT->getNumElements();
21875 } else if (Ty->isFloatTy()) {
21876 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21877 return false;
21878 Members = 1;
21879 Base = HA_FLOAT;
21880 } else if (Ty->isDoubleTy()) {
21881 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21882 return false;
21883 Members = 1;
21884 Base = HA_DOUBLE;
21885 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21886 Members = 1;
21887 switch (Base) {
21888 case HA_FLOAT:
21889 case HA_DOUBLE:
21890 return false;
21891 case HA_VECT64:
21892 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21893 case HA_VECT128:
21894 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21895 case HA_UNKNOWN:
21896 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21897 case 64:
21898 Base = HA_VECT64;
21899 return true;
21900 case 128:
21901 Base = HA_VECT128;
21902 return true;
21903 default:
21904 return false;
21905 }
21906 }
21907 }
21908
21909 return (Members > 0 && Members <= 4);
21910}
21911
21912/// Return the correct alignment for the current calling convention.
21914 Type *ArgTy, const DataLayout &DL) const {
21915 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21916 if (!ArgTy->isVectorTy())
21917 return ABITypeAlign;
21918
21919 // Avoid over-aligning vector parameters. It would require realigning the
21920 // stack and waste space for no real benefit.
21921 MaybeAlign StackAlign = DL.getStackAlignment();
21922 assert(StackAlign && "data layout string is missing stack alignment");
21923 return std::min(ABITypeAlign, *StackAlign);
21924}
21925
21926/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21927/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21928/// passing according to AAPCS rules.
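/// For example, a C struct such as "struct S { float x, y, z; };" is a
/// homogeneous aggregate of three floats and is passed in three consecutive
/// single-precision registers under AAPCS-VFP.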
21930 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21931 const DataLayout &DL) const {
21932 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21934 return false;
21935
21937 uint64_t Members = 0;
21938 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21939 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21940
21941 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21942 return IsHA || IsIntArray;
21943}
21944
21946 const Constant *PersonalityFn) const {
21947 // Platforms which do not use SjLj EH may return values in these registers
21948 // via the personality function.
21950 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21951}
21952
21954 const Constant *PersonalityFn) const {
21955 // Platforms which do not use SjLj EH may return values in these registers
21956 // via the personality function.
21958 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21959}
21960
21961void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21962 // Update IsSplitCSR in ARMFunctionInfo.
21963 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21964 AFI->setIsSplitCSR(true);
21965}
21966
21967void ARMTargetLowering::insertCopiesSplitCSR(
21968 MachineBasicBlock *Entry,
21969 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21970 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21971 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21972 if (!IStart)
21973 return;
21974
21975 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21976 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21977 MachineBasicBlock::iterator MBBI = Entry->begin();
21978 for (const MCPhysReg *I = IStart; *I; ++I) {
21979 const TargetRegisterClass *RC = nullptr;
21980 if (ARM::GPRRegClass.contains(*I))
21981 RC = &ARM::GPRRegClass;
21982 else if (ARM::DPRRegClass.contains(*I))
21983 RC = &ARM::DPRRegClass;
21984 else
21985 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21986
21987 Register NewVR = MRI->createVirtualRegister(RC);
21988 // Create copy from CSR to a virtual register.
21989 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21990 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21991 // nounwind. If we want to generalize this later, we may need to emit
21992 // CFI pseudo-instructions.
21993 assert(Entry->getParent()->getFunction().hasFnAttribute(
21994 Attribute::NoUnwind) &&
21995 "Function should be nounwind in insertCopiesSplitCSR!");
21996 Entry->addLiveIn(*I);
21997 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21998 .addReg(*I);
21999
22000 // Insert the copy-back instructions right before the terminator.
22001 for (auto *Exit : Exits)
22002 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
22003 TII->get(TargetOpcode::COPY), *I)
22004 .addReg(NewVR);
22005 }
22006}
22007
22012
22014 return Subtarget->hasMVEIntegerOps();
22015}
22016
22019 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22020 if (!VTy)
22021 return false;
22022
22023 auto *ScalarTy = VTy->getScalarType();
22024 unsigned NumElements = VTy->getNumElements();
22025
22026 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22027 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22028 return false;
22029
22030 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22031 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22032 return Subtarget->hasMVEFloatOps();
22033
22035 return false;
22036
22037 return Subtarget->hasMVEIntegerOps() &&
22038 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22039 ScalarTy->isIntegerTy(32));
22040}
22041
22043 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
22044 return RCRegs;
22045}
22046
22049 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22050 Value *Accumulator) const {
22051
22053
22054 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22055
22056 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
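// For types wider than 128 bits (e.g. an <8 x float> complex operation),
// the inputs are split into two halves below, each half is lowered to its
// own VCMLA/VCMUL or VCADD intrinsic, and the results are concatenated.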
22057
22058 if (TyWidth > 128) {
22059 int Stride = Ty->getNumElements() / 2;
22060 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22061 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22062 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22063 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22064
22065 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22066 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22067 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22068 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22069 Value *LowerSplitAcc = nullptr;
22070 Value *UpperSplitAcc = nullptr;
22071
22072 if (Accumulator) {
22073 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22074 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22075 }
22076
22077 auto *LowerSplitInt = createComplexDeinterleavingIR(
22078 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22079 auto *UpperSplitInt = createComplexDeinterleavingIR(
22080 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22081
22082 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22083 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22084 }
22085
22086 auto *IntTy = Type::getInt32Ty(B.getContext());
22087
22088 ConstantInt *ConstRotation = nullptr;
22089 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22090 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22091
22092 if (Accumulator)
22093 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22094 {ConstRotation, Accumulator, InputB, InputA});
22095 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22096 {ConstRotation, InputB, InputA});
22097 }
22098
22099 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22100 // 1 means the value is not halved.
22101 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22102
22104 ConstRotation = ConstantInt::get(IntTy, 0);
22106 ConstRotation = ConstantInt::get(IntTy, 1);
22107
22108 if (!ConstRotation)
22109 return nullptr; // Invalid rotation for arm_mve_vcaddq
22110
22111 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22112 {ConstHalving, ConstRotation, InputA, InputB});
22113 }
22114
22115 return nullptr;
22116}
unsigned const MachineRegisterInfo * MRI
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool isNegatedInteger(SDValue Op)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
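A sketch of the integer mapping, using locally defined stand-in enums rather than the real ISD::CondCode and ARMCC::CondCodes types: signed orderings map to GT/GE/LT/LE, unsigned orderings to HI/HS/LO/LS.

  // Illustrative only; the enum values are stand-ins, not LLVM's definitions.
  #include <stdexcept>

  enum class DAGCond { EQ, NE, GT, GE, LT, LE, UGT, UGE, ULT, ULE };
  enum class ARMCond { EQ, NE, GT, GE, LT, LE, HI, HS, LO, LS };

  static ARMCond intCCToARMCC(DAGCond CC) {
    switch (CC) {
    case DAGCond::EQ:  return ARMCond::EQ;
    case DAGCond::NE:  return ARMCond::NE;
    case DAGCond::GT:  return ARMCond::GT;  // signed >
    case DAGCond::GE:  return ARMCond::GE;  // signed >=
    case DAGCond::LT:  return ARMCond::LT;  // signed <
    case DAGCond::LE:  return ARMCond::LE;  // signed <=
    case DAGCond::UGT: return ARMCond::HI;  // unsigned >
    case DAGCond::UGE: return ARMCond::HS;  // unsigned >=
    case DAGCond::ULT: return ARMCond::LO;  // unsigned <
    case DAGCond::ULE: return ARMCond::LS;  // unsigned <=
    }
    throw std::logic_error("unhandled condition code");
  }

  int main() { return intCCToARMCC(DAGCond::UGT) == ARMCond::HI ? 0 : 1; }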
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
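The point of the distribution is that each resulting multiply can fold into a multiply-accumulate (MLA/VMLA), forwarding the accumulator between the two. A scalar model of the algebraic rewrite, for illustration only (the actual combine operates on DAG nodes and checks profitability first):

  #include <cassert>

  // Before the combine: one add feeding one multiply.
  static int beforeForm(int A, int B, int C) { return (A + B) * C; }

  // After the combine: A*C then B*C added on top, which maps naturally onto a
  // multiply followed by a multiply-accumulate.
  static int afterForm(int A, int B, int C) { return A * C + B * C; }

  int main() {
    for (int A = -3; A <= 3; ++A)
      for (int B = -3; B <= 3; ++B)
        for (int C = -3; C <= 3; ++C)
          assert(beforeForm(A, B, C) == afterForm(A, B, C));
  }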
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
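For orientation, on two 4-element inputs the canonical first/second result masks are VTRN <0,4,2,6> / <1,5,3,7>, VUZP <0,2,4,6> / <1,3,5,7>, and VZIP <0,4,1,5> / <2,6,3,7>. Below is a simplified, standalone sketch of the VTRN check only; the in-tree helpers additionally accept undef lanes and handle types that are split into halves.

  #include <cassert>
  #include <cstddef>

  // Does Mask select the canonical NEON VTRN result WhichResult (0 or 1) for
  // NumElts lanes?  VTRN result masks are <0,n,2,n+2,...> and <1,n+1,3,n+3,...>
  // over two n-element inputs.
  static bool isVTRNMaskSketch(const int *Mask, size_t NumElts, unsigned WhichResult) {
    for (size_t i = 0; i < NumElts; i += 2)
      if (Mask[i] != (int)(i + WhichResult) ||
          Mask[i + 1] != (int)(i + NumElts + WhichResult))
        return false;
    return true;
  }

  int main() {
    int Trn0[4] = {0, 4, 2, 6}; // first VTRN result of two v4 inputs
    int Trn1[4] = {1, 5, 3, 7}; // second VTRN result
    assert(isVTRNMaskSketch(Trn0, 4, 0) && isVTRNMaskSketch(Trn1, 4, 1));
  }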
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
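A truncating saturate folds an explicit clamp-plus-truncate into a single saturating narrow (VQMOVN-style). A scalar model of the signed i16-to-i8 case, for illustration only:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // What the IR typically looks like before the combine: clamp each lane to the
  // destination range, then truncate.
  static int8_t clampThenTrunc(int16_t x) {
    int16_t clamped = std::min<int16_t>(std::max<int16_t>(x, -128), 127);
    return (int8_t)clamped;
  }

  // What a saturating-narrow instruction computes per lane.
  static int8_t saturatingNarrow(int16_t x) {
    if (x < -128) return -128;
    if (x > 127) return 127;
    return (int8_t)x;
  }

  int main() {
    for (int v = INT16_MIN; v <= INT16_MAX; ++v)
      assert(clampThenTrunc((int16_t)v) == saturatingNarrow((int16_t)v));
  }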
This file defines a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
Module.h This file contains the declarations for the Module class.
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
#define MAKE_CASE(V)
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
This file describes how to lower LLVM code to machine code.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5995
APInt bitcastToAPInt() const
Definition APFloat.h:1335
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1314
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1202
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1599
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1762
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1258
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:859
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:852
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1657
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
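A few of the APInt helpers listed above, exercised in isolation as a usage illustration (compiles against the LLVM headers; it is not part of this file):

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    using llvm::APInt;
    // Bit-pattern constructors used heavily by the lowering code.
    assert(APInt::getLowBitsSet(32, 8) == APInt(32, 0x000000FFu));
    assert(APInt::getHighBitsSet(32, 8) == APInt(32, 0xFF000000u));
    assert(APInt::getSplat(32, APInt(8, 0xAA)) == APInt(32, 0xAAAAAAAAu));
    // Queries.
    assert(APInt::getAllOnes(16).isAllOnes());
    assert(APInt(32, 0x80).isPowerOf2() && APInt(32, 0x80).logBase2() == 7);
    assert(APInt(32, 0xF0).countr_zero() == 4 && APInt(32, 0xF0).countl_zero() == 24);
    return 0;
  }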
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
virtual const ARMBaseRegisterInfo & getRegisterInfo() const =0
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool isTargetWindows() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
Align getDualLoadStoreAlignment() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: 'sub y, (xor x, -1)' and 'add (add x, 1), y'. The variant with two add's is IR...
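The equivalence is plain two's-complement arithmetic: ~x == -x - 1, so y - (~x) == y + x + 1 == (x + 1) + y. A quick exhaustive check over 8-bit values, for illustration:

  #include <cassert>
  #include <cstdint>

  int main() {
    // sub y, (xor x, -1)  versus  add (add x, 1), y  -- identical modulo 2^8.
    for (unsigned xi = 0; xi < 256; ++xi)
      for (unsigned yi = 0; yi < 256; ++yi) {
        uint8_t x = (uint8_t)xi, y = (uint8_t)yi;
        uint8_t subForm = (uint8_t)(y - (uint8_t)(x ^ 0xFF));
        uint8_t addForm = (uint8_t)((uint8_t)(x + 1) + y);
        assert(subForm == addForm);
      }
  }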
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:138
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:899
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:277
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:207
bool isBigEndian() const
Definition DataLayout.h:208
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:237
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:295
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:687
const Argument * const_arg_iterator
Definition Function.h:73
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
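These MVT helpers compose naturally when a lowering needs a related value type. A small sketch (not taken from this file) that derives a same-length vector with doubled integer elements:

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

static MVT widenVectorElements(MVT VecVT) {
  // e.g. v4i16 -> v4i32: double the scalar width, keep the element count.
  assert(VecVT.isVector() && VecVT.isInteger() && "expected an integer vector");
  MVT EltVT = VecVT.getVectorElementType();
  MVT WideElt = MVT::getIntegerVT(unsigned(EltVT.getScalarSizeInBits()) * 2);
  return MVT::getVectorVT(WideElt, VecVT.getVectorNumElements());
}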
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
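Taken together, these MachineBasicBlock operations form the usual EmitInstrWithCustomInserter shape: create a continuation block, move the tail of the current block into it, and rewire the CFG. A hedged sketch (the helper name and structure are illustrative, not from this file):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <iterator>
using namespace llvm;

// Split the block containing MI so that everything after MI lives in a new
// continuation block, which inherits MBB's successors.
static MachineBasicBlock *splitAfter(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *ContBB =
      MF->CreateMachineBasicBlock(MBB->getBasicBlock());
  MF->insert(std::next(MBB->getIterator()), ContBB);
  ContBB->splice(ContBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(ContBB);
  return ContBB;
}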
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
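CreateFixedObject plus DAG.getFrameIndex is the standard route for turning an incoming stack argument into an addressable SDValue. A hedged fragment (DAG, Chain, dl, and ArgOffset are assumed from the surrounding argument-lowering code):

// Materialize and load a 4-byte incoming stack argument at ArgOffset.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
int FI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/ArgOffset,
                               /*IsImmutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
SDValue ArgVal = DAG.getLoad(MVT::i32, dl, Chain, FIN,
                             MachinePointerInfo::getFixedStack(MF, FI));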
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
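These MachineInstrBuilder calls are all chained onto BuildMI when expanding pseudo-instructions. A hedged fragment (Opcode, the registers, and OrigMI are placeholders, not taken from this file):

// Emit: DestReg = <Opcode> SrcReg, #Imm, reusing OrigMI's memory operands.
BuildMI(*MBB, InsertPt, DL, TII->get(Opcode), DestReg)
    .addReg(SrcReg)
    .addImm(Imm)
    .cloneMemRefs(OrigMI);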
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
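A MachineMemOperand is normally built from a MachinePointerInfo plus an OR of the flags above, then attached to the node or instruction that touches memory. A hedged fragment (FI and the 32-bit scalar type are assumptions):

MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo::getFixedStack(MF, FI),
    MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
    LLT::scalar(32), Align(4));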
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
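Accessors such as getOpcode, getOperand, and hasOneUse are the vocabulary of DAG combines. A small hypothetical matcher (not from this file) that recognizes (add x, (shl y, C)):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Returns the constant shift amount if N is (add x, (shl y, C)) and the
// shift has no other users; returns -1 otherwise.
static int64_t matchAddOfShiftedOperand(SDNode *N) {
  if (N->getOpcode() != ISD::ADD)
    return -1;
  SDValue RHS = N->getOperand(1);
  if (RHS.getOpcode() != ISD::SHL || !RHS.hasOneUse())
    return -1;
  if (auto *C = dyn_cast<ConstantSDNode>(RHS.getOperand(1)))
    return C->getSExtValue();
  return -1;
}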
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
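These SelectionDAG factory methods are how a custom lowering builds its replacement nodes. A minimal sketch (illustrative only, not ARM's actual lowering) that expands a 32-bit integer abs as (x + (x >> 31)) ^ (x >> 31):

static SDValue lowerABS32(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();           // assumed to be MVT::i32 here
  SDValue X = Op.getOperand(0);
  SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, X,
                             DAG.getConstant(31, dl, VT));
  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, X, Sign);
  return DAG.getNode(ISD::XOR, dl, VT, Add, Sign);
}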
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
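Shuffle lowering usually starts by pulling the mask out of the node and classifying it with helpers such as isSplatMask. A hedged fragment:

if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N)) {
  ArrayRef<int> Mask = SVN->getMask();
  if (ShuffleVectorSDNode::isSplatMask(Mask)) {
    // Every lane reads the same source element; a duplicate-lane (VDUP-style)
    // lowering applies instead of a general permute.
  }
}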
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:127
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
const unsigned char * bytes_begin() const
Definition StringRef.h:124
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl)
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
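The TargetLoweringBase hooks above are typically invoked from a target's TargetLowering constructor to describe what the hardware supports. A generic sketch of the idiom (the register class, types, and actions are illustrative, not a summary of ARM's configuration):

// Inside a hypothetical MyTargetLowering constructor:
addRegisterClass(MVT::i32, &MyTarget::GPRRegClass);       // i32 lives in GPRs
setOperationAction(ISD::SDIV, MVT::i32, Expand);          // no native divide
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);     // lowered by hand
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setSchedulingPreference(Sched::RegPressure);
computeRegisterProperties(Subtarget.getRegisterInfo());   // Subtarget assumed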
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return true if N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
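makeLibCall is the usual escape hatch when an operation has no instruction: look up the RTLIB entry for the source and result types, then emit the call. A hedged sketch for softening an FP_TO_SINT node (assumes this runs inside a TargetLowering member with Op bound to that node):

SDLoc dl(Op);
EVT SrcVT = Op.getOperand(0).getValueType();
EVT RetVT = Op.getValueType();
RTLIB::Libcall LC = RTLIB::getFPTOSINT(SrcVT, RetVT);
TargetLowering::MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Result =
    makeLibCall(DAG, LC, RetVT, Op.getOperand(0), CallOptions, dl);
SDValue Lowered = Result.first;   // Result.second carries the output chain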
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:439
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:296
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative (RWPI).
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
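These ARM_AM helpers answer the recurring question of whether a constant fits an immediate field. A small sketch (the helper name is hypothetical):

#include "MCTargetDesc/ARMAddressingModes.h"
#include <cstdint>
using namespace llvm;

// Can Imm be encoded directly as a data-processing immediate?
static bool isEncodableImmediate(uint32_t Imm, bool IsThumb2) {
  if (IsThumb2)
    return ARM_AM::getT2SOImmVal(Imm) != -1;  // Thumb-2 modified immediate
  return ARM_AM::getSOImmVal(Imm) != -1;      // ARM shifter_operand immediate
}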
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Flow Guard check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:531
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:712
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:815
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:732
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:707
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:933
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:719
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
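Predicates such as isNormalLoad are how DAG combines filter out the loads they are allowed to rewrite. A hedged fragment:

// Only consider simple, non-extending, unindexed loads with a single user.
if (auto *Ld = dyn_cast<LoadSDNode>(N)) {
  if (ISD::isNormalLoad(Ld) && Ld->isSimple() && Ld->hasOneUse()) {
    SDValue Chain = Ld->getChain();
    SDValue Ptr = Ld->getBasePtr();
    // ... build the replacement load from Chain and Ptr here ...
  }
}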
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
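These RTLIB helpers are queried before emitting a runtime-library call; UNKNOWN_LIBCALL means no routine exists for the given type pair. A minimal sketch:
  RTLIB::Libcall LC = RTLIB::getFPTOSINT(MVT::f64, MVT::i32);
  if (LC != RTLIB::UNKNOWN_LIBCALL) {
    // At this point lowering code would emit the call, e.g. via makeLibCall.
  }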
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
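These range helpers wrap the corresponding <algorithm> functions; a self-contained sketch:
  SmallVector<int, 8> Vals = {1, 2, 3, 4};
  bool AllPos  = llvm::all_of(Vals, [](int V) { return V > 0; });        // true
  bool AnyEven = llvm::any_of(Vals, [](int V) { return V % 2 == 0; });   // true
  auto It      = llvm::find_if(Vals, [](int V) { return V > 2; });       // points at 3
  auto NumOdd  = llvm::count_if(Vals, [](int V) { return V % 2 != 0; }); // 2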
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:293
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
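A hedged sketch of how a CCAssignFn such as CC_ARM_AAPCS is normally consumed: a CCState walks the incoming arguments and records one CCValAssign per value. The surrounding names (CallConv, isVarArg, Ins, DAG) follow the usual LowerFormalArguments pattern and are assumptions here:
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CC_ARM_AAPCS);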
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1516
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
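Most predicable ARM instructions are built with this idiom; a sketch assuming MBB, MBBI, dl, TII, and the registers/immediate are already in scope:
  BuildMI(MBB, MBBI, dl, TII->get(ARM::ADDri), DestReg)
      .addReg(BaseReg)
      .addImm(Imm)
      .add(predOps(ARMCC::AL))  // always-executed predicate
      .add(condCodeOp());       // no CPSR definition (see condCodeOp below)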
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
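A few concrete values for these bit-pattern predicates; all of them are constexpr, so the checks below compile away:
  static_assert(isMask_32(0x000000FFu), "contiguous ones starting at bit 0");
  static_assert(isShiftedMask_32(0x00FF0000u), "contiguous ones, shifted up");
  static_assert(isPowerOf2_64(64), "exactly one bit set");
  static_assert(isUIntN(12, 4095), "fits in an unsigned 12-bit field");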
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:236
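Concrete results for the bit-counting helpers on 32-bit values:
  int TrailZ = llvm::countr_zero(8u);    // 3: 0b1000 has three trailing zeros
  int TrailO = llvm::countr_one(0x7u);   // 3: three trailing ones
  int LeadZ  = llvm::countl_zero(1u);    // 31 leading zeros in a 32-bit value
  unsigned L = llvm::Log2_32(32);        // 5
  int Width  = llvm::bit_width(255u);    // 8 bits needed to represent 255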
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
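A typical use in a target DAG combine, checking whether the second operand of a (possibly vector) node is a known constant; N is an assumed SDNode being combined:
  if (ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1))) {
    const APInt &Imm = C->getAPIntValue();
    // Imm is the scalar or splatted constant; the combine can key off it,
    // e.g. Imm.isMask() for bitfield-extract style folds.
  }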
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
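Two worked values for the alignment helpers:
  uint64_t Padded = alignTo(13, Align(8));        // 16: next multiple of 8
  Align Common = commonAlignment(Align(16), 4);   // Align(4): an offset of 4 limits it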
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
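For example, on a v4i32 shuffle the mask {1, 0, 3, 2} swaps the two lanes inside each 64-bit half, which is what VREV64 does; the expected results below are stated as assumptions:
  int Mask[] = {1, 0, 3, 2};
  bool IsRev64 = isVREVMask(Mask, MVT::v4i32, 64);  // expected: true
  bool IsRev32 = isVREVMask(Mask, MVT::v4i32, 32);  // expected: false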
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
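For instance:
  SmallVector<int, 16> Mask =
      createSequentialMask(/*Start=*/4, /*NumInts=*/4, /*NumUndefs=*/2);
  // Mask == {4, 5, 6, 7, -1, -1}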
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the vector type has a power-of-2 number of elements.
Definition ValueTypes.h:470
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
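A small sketch of the EVT queries above; Ctx is an assumed LLVMContext:
  EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 4);        // v4f32
  unsigned NumElts = VT.getVectorNumElements();       // 4
  EVT IntVT = VT.changeVectorElementTypeToInteger();  // v4i32
  bool Is128 = VT.is128BitVector();                   // true
  EVT HalfVT = VT.getHalfNumVectorElementsVT(Ctx);    // v2f32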
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
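A self-contained sketch of the KnownBits operations listed above:
  KnownBits L = KnownBits::makeConstant(APInt(8, 0x0F));
  KnownBits R = KnownBits::makeConstant(APInt(8, 0x01));
  KnownBits Sum = KnownBits::add(L, R);   // fully known: 0x10
  KnownBits Ext = Sum.zext(16);           // new high bits are known zero
  KnownBits Both = L.intersectWith(R);    // keeps only bits both agree on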
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
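A sketch of how these factory functions describe the memory operand of a load created during lowering; DAG, dl, Chain, and Addr are assumed to be in scope:
  MachinePointerInfo CPInfo =
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
  SDValue Load =
      DAG.getLoad(MVT::i32, dl, Chain, Addr, CPInfo.getWithOffset(4));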
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
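A hedged sketch of how these setters are usually chained when emitting a call from lowering code; RetTy, Callee, Args, and isSigned are assumptions standing in for the real callee description:
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);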
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...