NVPTXISelDAGToDAG.cpp
1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
15#include "NVPTXUtilities.h"
18#include "llvm/IR/GlobalValue.h"
20#include "llvm/IR/IntrinsicsNVPTX.h"
23#include "llvm/Support/Debug.h"
28
29using namespace llvm;
30
31#define DEBUG_TYPE "nvptx-isel"
32#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
33
34static cl::opt<bool>
35 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
36 cl::desc("Enable reciprocal sqrt optimization"));
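// For reference, this cl::opt can be toggled on the llc command line; an
// illustrative invocation (flag name taken from the definition above, the
// remaining options are only an example) would be:
//
//   llc -march=nvptx64 -mcpu=sm_70 -nvptx-rsqrt-approx-opt=false input.ll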
37
38/// createNVPTXISelDag - This pass converts a legalized DAG into a
39/// NVPTX-specific DAG, ready for instruction scheduling.
40FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
41 llvm::CodeGenOptLevel OptLevel) {
42 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
43}
44
45NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
46 CodeGenOptLevel OptLevel)
47 : SelectionDAGISelLegacy(
48 ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}
49
50char NVPTXDAGToDAGISelLegacy::ID = 0;
51
52INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
53
54NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
55 CodeGenOptLevel OptLevel)
56 : SelectionDAGISel(tm, OptLevel), TM(tm) {
57 doMulWide = (OptLevel > CodeGenOptLevel::None);
58}
59
60bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
61 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
62 return SelectionDAGISel::runOnMachineFunction(MF);
63}
64
65int NVPTXDAGToDAGISel::getDivF32Level() const {
66 return Subtarget->getTargetLowering()->getDivF32Level();
67}
68
69bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
70 return Subtarget->getTargetLowering()->usePrecSqrtF32();
71}
72
73bool NVPTXDAGToDAGISel::useF32FTZ() const {
74 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
75}
76
77bool NVPTXDAGToDAGISel::allowFMA() const {
78 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
79 return TL->allowFMA(*MF, OptLevel);
80}
81
82bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
83 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
84 return TL->allowUnsafeFPMath(*MF);
85}
86
87bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
88
89/// Select - Select instructions not customized! Used for
90/// expanded, promoted and normal instructions.
91void NVPTXDAGToDAGISel::Select(SDNode *N) {
92
93 if (N->isMachineOpcode()) {
94 N->setNodeId(-1);
95 return; // Already selected.
96 }
97
98 switch (N->getOpcode()) {
99 case ISD::LOAD:
100 case ISD::ATOMIC_LOAD:
101 if (tryLoad(N))
102 return;
103 break;
104 case ISD::STORE:
105 case ISD::ATOMIC_STORE:
106 if (tryStore(N))
107 return;
108 break;
109 case ISD::EXTRACT_VECTOR_ELT:
110 if (tryEXTRACT_VECTOR_ELEMENT(N))
111 return;
112 break;
113 case NVPTXISD::SETP_F16X2:
114 SelectSETP_F16X2(N);
115 return;
116 case NVPTXISD::SETP_BF16X2:
117 SelectSETP_BF16X2(N);
118 return;
119 case NVPTXISD::LoadV2:
120 case NVPTXISD::LoadV4:
121 if (tryLoadVector(N))
122 return;
123 break;
124 case NVPTXISD::LDGV2:
125 case NVPTXISD::LDGV4:
126 case NVPTXISD::LDUV2:
127 case NVPTXISD::LDUV4:
128 if (tryLDGLDU(N))
129 return;
130 break;
131 case NVPTXISD::StoreV2:
132 case NVPTXISD::StoreV4:
133 if (tryStoreVector(N))
134 return;
135 break;
136 case NVPTXISD::LoadParam:
137 case NVPTXISD::LoadParamV2:
138 case NVPTXISD::LoadParamV4:
139 if (tryLoadParam(N))
140 return;
141 break;
142 case NVPTXISD::StoreRetval:
143 case NVPTXISD::StoreRetvalV2:
144 case NVPTXISD::StoreRetvalV4:
145 if (tryStoreRetval(N))
146 return;
147 break;
148 case NVPTXISD::StoreParam:
149 case NVPTXISD::StoreParamV2:
150 case NVPTXISD::StoreParamV4:
151 case NVPTXISD::StoreParamS32:
152 case NVPTXISD::StoreParamU32:
153 if (tryStoreParam(N))
154 return;
155 break;
156 case ISD::INTRINSIC_WO_CHAIN:
157 if (tryIntrinsicNoChain(N))
158 return;
159 break;
160 case ISD::INTRINSIC_W_CHAIN:
161 if (tryIntrinsicChain(N))
162 return;
163 break;
338 if (tryTextureIntrinsic(N))
339 return;
340 break;
506 if (trySurfaceIntrinsic(N))
507 return;
508 break;
509 case ISD::AND:
510 case ISD::SRA:
511 case ISD::SRL:
512 // Try to select BFE
513 if (tryBFE(N))
514 return;
515 break;
516 case ISD::ADDRSPACECAST:
517 SelectAddrSpaceCast(N);
518 return;
519 case ISD::ConstantFP:
520 if (tryConstantFP(N))
521 return;
522 break;
523 case ISD::CopyToReg: {
524 if (N->getOperand(1).getValueType() == MVT::i128) {
525 SelectV2I64toI128(N);
526 return;
527 }
528 break;
529 }
530 case ISD::CopyFromReg: {
531 if (N->getOperand(1).getValueType() == MVT::i128) {
532 SelectI128toV2I64(N);
533 return;
534 }
535 break;
536 }
537 default:
538 break;
539 }
540 SelectCode(N);
541}
542
543bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
544 unsigned IID = N->getConstantOperandVal(1);
545 switch (IID) {
546 default:
547 return false;
548 case Intrinsic::nvvm_ldg_global_f:
549 case Intrinsic::nvvm_ldg_global_i:
550 case Intrinsic::nvvm_ldg_global_p:
551 case Intrinsic::nvvm_ldu_global_f:
552 case Intrinsic::nvvm_ldu_global_i:
553 case Intrinsic::nvvm_ldu_global_p:
554 return tryLDGLDU(N);
555 }
556}
557
558// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
559// have to load them into an .(b)f16 register first.
560bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
561 if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
562 return false;
563 SDValue Val = CurDAG->getTargetConstantFP(
564 cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
565 SDNode *LoadConstF16 = CurDAG->getMachineNode(
566 (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
567 : NVPTX::LOAD_CONST_BF16),
568 SDLoc(N), N->getValueType(0), Val);
569 ReplaceNode(N, LoadConstF16);
570 return true;
571}
572
573// Map an ISD::CondCode value to the appropriate CmpMode expected by
574// NVPTXInstPrinter::printCmpMode().
575static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
576 using NVPTX::PTXCmpMode::CmpMode;
577 unsigned PTXCmpMode = [](ISD::CondCode CC) {
578 switch (CC) {
579 default:
580 llvm_unreachable("Unexpected condition code.");
581 case ISD::SETOEQ:
582 return CmpMode::EQ;
583 case ISD::SETOGT:
584 return CmpMode::GT;
585 case ISD::SETOGE:
586 return CmpMode::GE;
587 case ISD::SETOLT:
588 return CmpMode::LT;
589 case ISD::SETOLE:
590 return CmpMode::LE;
591 case ISD::SETONE:
592 return CmpMode::NE;
593 case ISD::SETO:
594 return CmpMode::NUM;
595 case ISD::SETUO:
596 return CmpMode::NotANumber;
597 case ISD::SETUEQ:
598 return CmpMode::EQU;
599 case ISD::SETUGT:
600 return CmpMode::GTU;
601 case ISD::SETUGE:
602 return CmpMode::GEU;
603 case ISD::SETULT:
604 return CmpMode::LTU;
605 case ISD::SETULE:
606 return CmpMode::LEU;
607 case ISD::SETUNE:
608 return CmpMode::NEU;
609 case ISD::SETEQ:
610 return CmpMode::EQ;
611 case ISD::SETGT:
612 return CmpMode::GT;
613 case ISD::SETGE:
614 return CmpMode::GE;
615 case ISD::SETLT:
616 return CmpMode::LT;
617 case ISD::SETLE:
618 return CmpMode::LE;
619 case ISD::SETNE:
620 return CmpMode::NE;
621 }
622 }(CondCode.get());
623
624 if (FTZ)
625 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
626
627 return PTXCmpMode;
628}
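// Worked example (illustrative): ISD::SETOLT maps to CmpMode::LT above, and
// when useF32FTZ() returns true the FTZ_FLAG bit is OR'ed in, so
// NVPTXInstPrinter::printCmpMode() prints the comparison as "lt.ftz", e.g.
// in an instruction of the form "setp.lt.ftz.f16x2 ...".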
629
630bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
631 unsigned PTXCmpMode =
632 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
633 SDLoc DL(N);
634 SDNode *SetP = CurDAG->getMachineNode(
635 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
636 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
637 ReplaceNode(N, SetP);
638 return true;
639}
640
641bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
642 unsigned PTXCmpMode =
643 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
644 SDLoc DL(N);
645 SDNode *SetP = CurDAG->getMachineNode(
646 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
647 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
648 ReplaceNode(N, SetP);
649 return true;
650}
651
652// Find all instances of extract_vector_elt that use this v2f16 vector
653// and coalesce them into a scattering move instruction.
654bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
655 SDValue Vector = N->getOperand(0);
656
657 // We only care about 16x2 as it's the only real vector type we
658 // need to deal with.
659 MVT VT = Vector.getSimpleValueType();
660 if (!Isv2x16VT(VT))
661 return false;
662 // Find and record all uses of this vector that extract element 0 or 1.
663 SmallVector<SDNode *, 4> E0, E1;
664 for (auto *U : Vector.getNode()->uses()) {
665 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
666 continue;
667 if (U->getOperand(0) != Vector)
668 continue;
669 if (const ConstantSDNode *IdxConst =
670 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
671 if (IdxConst->getZExtValue() == 0)
672 E0.push_back(U);
673 else if (IdxConst->getZExtValue() == 1)
674 E1.push_back(U);
675 else
676 llvm_unreachable("Invalid vector index.");
677 }
678 }
679
680 // There's no point scattering f16x2 if we only ever access one
681 // element of it.
682 if (E0.empty() || E1.empty())
683 return false;
684
685 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
686 // into f16,f16 SplitF16x2(V)
687 MVT EltVT = VT.getVectorElementType();
688 SDNode *ScatterOp =
689 CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
690 for (auto *Node : E0)
691 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
692 for (auto *Node : E1)
693 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
694
695 return true;
696}
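// Illustrative sketch of the pattern this matches (hypothetical IR):
//
//   %v  = load <2 x half>, ptr %p
//   %e0 = extractelement <2 x half> %v, i32 0
//   %e1 = extractelement <2 x half> %v, i32 1
//
// Both extracts are rewritten to the two results of a single I32toV2I16 node
// rather than selecting an independent extraction for each element.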
697
698static unsigned int getCodeAddrSpace(MemSDNode *N) {
699 const Value *Src = N->getMemOperand()->getValue();
700
701 if (!Src)
702 return NVPTX::PTXLdStInstCode::GENERIC;
703
704 if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
705 switch (PT->getAddressSpace()) {
706 case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
707 case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
708 case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
709 case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
710 case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
711 case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
712 default: break;
713 }
714 }
715 return NVPTX::PTXLdStInstCode::GENERIC;
716}
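// For example, a load whose pointer operand is in LLVM address space 1
// (ADDRESS_SPACE_GLOBAL) is classified as PTXLdStInstCode::GLOBAL and is
// later printed with the ".global" state-space qualifier; a pointer with no
// known underlying value falls back to GENERIC. (Address-space numbering
// follows the NVPTX convention: 0 generic, 1 global, 3 shared, 4 const,
// 5 local.)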
717
718namespace {
719
720struct OperationOrderings {
721 NVPTX::Ordering InstructionOrdering, FenceOrdering;
722 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
723 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
724 : InstructionOrdering(IO), FenceOrdering(FO) {}
725};
726
727static OperationOrderings
728getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
729 AtomicOrdering Ordering = N->getSuccessOrdering();
730 auto CodeAddrSpace = getCodeAddrSpace(N);
731
732 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
733 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
734
735 // clang-format off
736
737 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
738 // Note: uses of Relaxed in the Atomic column of this table refer
739 // to LLVM AtomicOrdering::Monotonic.
740 //
741 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
742 // |---------|----------|--------------------|------------|------------------------------|
743 // | No | No | All | plain | .weak |
744 // | No | Yes | Generic,Shared, | .volatile | .volatile |
745 // | | | Global [0] | | |
746 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
747 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
748 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
749 // | | | Global [0] | | |
750 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
751 // | | | Global [0] | | |
752 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
753 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
754 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
755 // | | | | | or .volatile (PTX 8.1-) |
756 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
757 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
758 // | | | / Global [0] | | |
759
760 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
761 // by following the ABI proven sound in:
762 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
763 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
764 //
765 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
766 // |------------------------------------------------------|-------------------------------|
767 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
768 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
769 // |------------------------------------------------------|-------------------------------|
770 // | cuda::atomic_load | fence.sc.<scope>; |
771 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
772 // |------------------------------------------------------|-------------------------------|
773 // | cuda::atomic_store | fence.sc.<scope>; |
774 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
775 // |------------------------------------------------------|-------------------------------|
776 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
777 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
778
779 // clang-format on
780
781 // [0]: volatile and atomics are only supported on global or shared
782 // memory locations, accessed via generic/shared/global pointers.
783 // MMIO is only supported on global memory locations,
784 // accessed via generic/global pointers.
785 // TODO: Implement MMIO access via generic pointer to global.
786 // Currently implemented for global pointers only.
787
788 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
789 // PTX instructions fails to preserve their C++ side-effects.
790 //
791 // Example (https://github.com/llvm/llvm-project/issues/62057):
792 //
793 // void example() {
794 // std::atomic<bool> True = true;
795 // while (True.load(std::memory_order_relaxed));
796 // }
797 //
798 // A C++ program that calls "example" is well-defined: the infinite loop
799 // performs an atomic operation. By lowering volatile/atomics to
800 // "weak" memory operations, we are transforming the above into:
801 //
802 // void undefined_behavior() {
803 // bool True = true;
804 // while (True);
805 // }
806 //
807 // which exhibits undefined behavior in both C++ and PTX.
808 //
809 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
810 // behavior due to lack of Independent Forward Progress. Lowering these
811 // to weak memory operations in sm_60- is therefore fine.
812 //
813 // TODO: lower atomic and volatile operations to memory locations
814 // in local, const, and param to two PTX instructions in sm_70+:
815 // - the "weak" memory instruction we are currently lowering to, and
816 // - some other instruction that preserves the side-effect, e.g.,
817 // a dead dummy volatile load.
818 if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
819 CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
820 CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
821 return NVPTX::Ordering::NotAtomic;
822 }
823
824 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
825 // supported on sm_60 and older; this includes volatile atomics.
826 if (!(Ordering == AtomicOrdering::NotAtomic ||
827 Ordering == AtomicOrdering::Unordered ||
828 Ordering == AtomicOrdering::Monotonic) &&
829 !HasMemoryOrdering) {
830 report_fatal_error(
831 formatv("PTX does not support \"atomic\" for orderings different than"
832 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
833 "is: \"{}\".",
834 toIRString(Ordering)));
835 }
836
837 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
838 // the volatile semantics and preserve the atomic ones.
839
840 // PTX volatile and PTX atomics are not available for statespace that differ
841 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
842 // atomics is undefined if the generic address does not refer to a .global or
843 // .shared memory location.
844 bool AddrGenericOrGlobalOrShared =
845 (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
846 CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
847 CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
848 if (!AddrGenericOrGlobalOrShared)
849 return NVPTX::Ordering::NotAtomic;
850
851 bool UseRelaxedMMIO =
852 HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
853
854 switch (Ordering) {
855 case AtomicOrdering::NotAtomic:
856 return N->isVolatile() ? NVPTX::Ordering::Volatile
857 : NVPTX::Ordering::NotAtomic;
858 case AtomicOrdering::Unordered:
859 // We lower unordered in the exact same way as 'monotonic' to respect
860 // LLVM IR atomicity requirements.
861 case AtomicOrdering::Monotonic:
862 if (N->isVolatile())
863 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
864 : NVPTX::Ordering::Volatile;
865 else
866 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
867 : NVPTX::Ordering::Volatile;
868 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
869 // Acquire.
870 case AtomicOrdering::Acquire:
871 if (!N->readMem())
872 report_fatal_error(
873 formatv("PTX only supports Acquire Ordering on reads: {}",
874 N->getOperationName()));
875 return NVPTX::Ordering::Acquire;
876 case AtomicOrdering::Release:
877 if (!N->writeMem())
878 report_fatal_error(
879 formatv("PTX only supports Release Ordering on writes: {}",
880 N->getOperationName()));
881 return NVPTX::Ordering::Release;
882 case AtomicOrdering::AcquireRelease: {
883 report_fatal_error(
884 formatv("NVPTX does not support AcquireRelease Ordering on "
885 "read-modify-write "
886 "yet and PTX does not support it on loads or stores: {}",
887 N->getOperationName()));
888 }
889 case AtomicOrdering::SequentiallyConsistent: {
890 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
891 // sequence including a "fence.sc.sco" and the memory instruction with an
892 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
893 // whether the memory operation is a read, write, or read-modify-write.
894 //
895 // This sets the ordering of the fence to SequentiallyConsistent, and
896 // sets the corresponding ordering for the instruction.
897 NVPTX::Ordering InstrOrder;
898 if (N->readMem())
899 InstrOrder = NVPTX::Ordering::Acquire;
900 else if (N->writeMem())
901 InstrOrder = NVPTX::Ordering::Release;
902 else
903 report_fatal_error(
904 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
905 "read-modify-writes yet: {}",
906 N->getOperationName()));
907 return OperationOrderings(InstrOrder,
908 NVPTX::Ordering::SequentiallyConsistent);
909 }
910 }
911 report_fatal_error(
912 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
913 toIRString(Ordering)));
914}
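// Putting the table above into practice (illustrative example, not taken from
// a test): on sm_70+ an IR instruction such as
//
//   %v = load atomic i32, ptr addrspace(1) %p seq_cst, align 4
//
// is selected as the two-instruction sequence "fence.sc.sys;" followed by
// "ld.acquire.sys.global.u32", i.e. a SequentiallyConsistent fence plus an
// Acquire load, matching what getOperationOrderings() returns above.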
915
916} // namespace
917
918static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
919 unsigned CodeAddrSpace, MachineFunction *F) {
920 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
921 // space.
922 //
923 // We have two ways of identifying invariant loads: Loads may be explicitly
924 // marked as invariant, or we may infer them to be invariant.
925 //
926 // We currently infer invariance for loads from
927 // - constant global variables, and
928 // - kernel function pointer params that are noalias (i.e. __restrict) and
929 // never written to.
930 //
931 // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
932 // not during the SelectionDAG phase).
933 //
934 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
935 // explicitly invariant loads because these are how clang tells us to use ldg
936 // when the user uses a builtin.
937 if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
938 return false;
939
940 if (N->isInvariant())
941 return true;
942
943 bool IsKernelFn = isKernelFunction(F->getFunction());
944
945 // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
946 // because the former looks through phi nodes while the latter does not. We
947 // need to look through phi nodes to handle pointer induction variables.
948 SmallVector<const Value *, 8> Objs;
949 getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
950
951 return all_of(Objs, [&](const Value *V) {
952 if (auto *A = dyn_cast<const Argument>(V))
953 return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
954 if (auto *GV = dyn_cast<const GlobalVariable>(V))
955 return GV->isConstant();
956 return false;
957 });
958}
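// Illustrative example (hypothetical IR): a load like
//
//   %v = load i32, ptr addrspace(1) %p, !invariant.load !0
//
// satisfies the conditions above (global address space, explicitly invariant)
// and is emitted as a non-coherent "ld.global.nc" via tryLDGLDU() rather than
// a plain ld.global.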
959
960NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL,
961 SDValue &Chain,
962 MemSDNode *N) {
963 // Some memory instructions - loads, stores, atomics - need an extra fence
964 // instruction. Get the memory order of the instruction, and that of its
965 // fence, if any.
966 auto [InstructionOrdering, FenceOrdering] =
967 getOperationOrderings(N, Subtarget);
968
969 // If a fence is required before the operation, insert it:
970 switch (NVPTX::Ordering(FenceOrdering)) {
971 case NVPTX::Ordering::NotAtomic:
972 break;
973 case NVPTX::Ordering::SequentiallyConsistent: {
974 unsigned Op = Subtarget->hasMemoryOrdering()
975 ? NVPTX::atomic_thread_fence_seq_cst_sys
976 : NVPTX::INT_MEMBAR_SYS;
977 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
978 break;
979 }
980 default:
981 report_fatal_error(
982 formatv("Unexpected fence ordering: \"{}\".",
983 OrderingToCString(NVPTX::Ordering(FenceOrdering))));
984 }
985
986 return InstructionOrdering;
987}
988
989bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
990 unsigned IID = N->getConstantOperandVal(0);
991 switch (IID) {
992 default:
993 return false;
994 case Intrinsic::nvvm_texsurf_handle_internal:
995 SelectTexSurfHandle(N);
996 return true;
997 }
998}
999
1000void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
1001 // Op 0 is the intrinsic ID
1002 SDValue Wrapper = N->getOperand(1);
1003 SDValue GlobalVal = Wrapper.getOperand(0);
1004 ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
1005 MVT::i64, GlobalVal));
1006}
1007
1008void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
1009 SDValue Src = N->getOperand(0);
1010 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
1011 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
1012 unsigned DstAddrSpace = CastN->getDestAddressSpace();
1013 assert(SrcAddrSpace != DstAddrSpace &&
1014 "addrspacecast must be between different address spaces");
1015
1016 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
1017 // Specific to generic
1018 unsigned Opc;
1019 switch (SrcAddrSpace) {
1020 default: report_fatal_error("Bad address space in addrspacecast");
1021 case ADDRESS_SPACE_GLOBAL:
1022 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
1023 break;
1024 case ADDRESS_SPACE_SHARED:
1025 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
1026 ? NVPTX::cvta_shared_6432
1027 : NVPTX::cvta_shared_64)
1028 : NVPTX::cvta_shared;
1029 break;
1030 case ADDRESS_SPACE_CONST:
1031 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
1032 ? NVPTX::cvta_const_6432
1033 : NVPTX::cvta_const_64)
1034 : NVPTX::cvta_const;
1035 break;
1036 case ADDRESS_SPACE_LOCAL:
1037 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
1038 ? NVPTX::cvta_local_6432
1039 : NVPTX::cvta_local_64)
1040 : NVPTX::cvta_local;
1041 break;
1042 }
1043 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
1044 Src));
1045 return;
1046 } else {
1047 // Generic to specific
1048 if (SrcAddrSpace != 0)
1049 report_fatal_error("Cannot cast between two non-generic address spaces");
1050 unsigned Opc;
1051 switch (DstAddrSpace) {
1052 default: report_fatal_error("Bad address space in addrspacecast");
1053 case ADDRESS_SPACE_GLOBAL:
1054 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
1055 break;
1056 case ADDRESS_SPACE_SHARED:
1057 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
1058 ? NVPTX::cvta_to_shared_3264
1059 : NVPTX::cvta_to_shared_64)
1060 : NVPTX::cvta_to_shared;
1061 break;
1062 case ADDRESS_SPACE_CONST:
1063 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
1064 ? NVPTX::cvta_to_const_3264
1065 : NVPTX::cvta_to_const_64)
1066 : NVPTX::cvta_to_const;
1067 break;
1068 case ADDRESS_SPACE_LOCAL:
1069 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
1070 ? NVPTX::cvta_to_local_3264
1071 : NVPTX::cvta_to_local_64)
1072 : NVPTX::cvta_to_local;
1073 break;
1074 case ADDRESS_SPACE_PARAM:
1075 Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
1076 : NVPTX::nvvm_ptr_gen_to_param;
1077 break;
1078 }
1079 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
1080 Src));
1081 return;
1082 }
1083}
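// As a concrete sketch (illustrative), casting a shared-memory pointer to the
// generic address space in 64-bit mode:
//
//   %g = addrspacecast ptr addrspace(3) %s to ptr
//
// selects NVPTX::cvta_shared_64 above and prints as
// "cvta.shared.u64 %rd<M>, %rd<N>;", while the opposite direction selects the
// corresponding cvta.to.shared form.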
1084
1085// Helper function template to reduce amount of boilerplate code for
1086// opcode selection.
1087static std::optional<unsigned>
1088pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
1089 unsigned Opcode_i16, unsigned Opcode_i32,
1090 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
1091 std::optional<unsigned> Opcode_f64) {
1092 switch (VT) {
1093 case MVT::i1:
1094 case MVT::i8:
1095 return Opcode_i8;
1096 case MVT::i16:
1097 return Opcode_i16;
1098 case MVT::i32:
1099 return Opcode_i32;
1100 case MVT::i64:
1101 return Opcode_i64;
1102 case MVT::f16:
1103 case MVT::bf16:
1104 return Opcode_i16;
1105 case MVT::v2f16:
1106 case MVT::v2bf16:
1107 case MVT::v2i16:
1108 case MVT::v4i8:
1109 return Opcode_i32;
1110 case MVT::f32:
1111 return Opcode_f32;
1112 case MVT::f64:
1113 return Opcode_f64;
1114 default:
1115 return std::nullopt;
1116 }
1117}
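// Usage sketch: 16-bit floating-point types deliberately reuse the i16/i32
// opcodes because NVPTX has no separate f16 register class. For instance
// (using the LD_* opcodes that tryLoad() below passes in),
//
//   pickOpcodeForVT(MVT::f16, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
//                   NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
//                   NVPTX::LD_f32_avar, NVPTX::LD_f64_avar)
//
// returns NVPTX::LD_i16_avar, and MVT::v2f16 returns NVPTX::LD_i32_avar.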
1118
1119static int getLdStRegType(EVT VT) {
1120 if (VT.isFloatingPoint())
1121 switch (VT.getSimpleVT().SimpleTy) {
1122 case MVT::f16:
1123 case MVT::bf16:
1124 case MVT::v2f16:
1125 case MVT::v2bf16:
1126 return NVPTX::PTXLdStInstCode::Untyped;
1127 default:
1128 return NVPTX::PTXLdStInstCode::Float;
1129 }
1130 else
1131 return NVPTX::PTXLdStInstCode::Unsigned;
1132}
1133
1134bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1135 MemSDNode *LD = cast<MemSDNode>(N);
1136 assert(LD->readMem() && "Expected load");
1137
1138 // do not support pre/post inc/dec
1139 LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
1140 if (PlainLoad && PlainLoad->isIndexed())
1141 return false;
1142
1143 EVT LoadedVT = LD->getMemoryVT();
1144 if (!LoadedVT.isSimple())
1145 return false;
1146
1147 // Address Space Setting
1148 unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
1149 if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
1150 return tryLDGLDU(N);
1151 }
1152 unsigned int PointerSize =
1153 CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
1154
1155 SDLoc DL(N);
1156 SDValue Chain = N->getOperand(0);
1157 auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD);
1158
1159 // Type Setting: fromType + fromTypeWidth
1160 //
1161 // Sign : ISD::SEXTLOAD
1162 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1163 // type is integer
1164 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1165 MVT SimpleVT = LoadedVT.getSimpleVT();
1166 MVT ScalarVT = SimpleVT.getScalarType();
1167 // Read at least 8 bits (predicates are stored as 8-bit values)
1168 unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
1169 unsigned int FromType;
1170
1171 // Vector Setting
1172 unsigned int VecType = NVPTX::PTXLdStInstCode::Scalar;
1173 if (SimpleVT.isVector()) {
1174 assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
1175 "Unexpected vector type");
1176 // v2f16/v2bf16/v2i16 is loaded using ld.b32
1177 FromTypeWidth = 32;
1178 }
1179
1180 if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1181 FromType = NVPTX::PTXLdStInstCode::Signed;
1182 else
1183 FromType = getLdStRegType(ScalarVT);
1184
1185 // Create the machine instruction DAG
1186 SDValue N1 = N->getOperand(1);
1187 SDValue Addr;
1189 std::optional<unsigned> Opcode;
1190 MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1191
1193 getI32Imm(CodeAddrSpace, DL),
1194 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1195 getI32Imm(FromTypeWidth, DL)});
1196
1197 if (SelectDirectAddr(N1, Addr)) {
1198 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
1199 NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
1200 NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
1201 if (!Opcode)
1202 return false;
1203 Ops.append({Addr, Chain});
1204 } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
1205 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
1206 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
1207 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
1208 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
1209 if (!Opcode)
1210 return false;
1211 Ops.append({Base, Offset, Chain});
1212 } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
1213 : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
1214 if (PointerSize == 64)
1215 Opcode =
1216 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
1217 NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
1218 NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
1219 else
1220 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
1221 NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
1222 NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
1223 if (!Opcode)
1224 return false;
1225 Ops.append({Base, Offset, Chain});
1226 } else {
1227 if (PointerSize == 64)
1228 Opcode =
1229 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
1230 NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
1231 NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
1232 else
1233 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
1234 NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
1235 NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
1236 if (!Opcode)
1237 return false;
1238 Ops.append({N1, Chain});
1239 }
1240
1241 SDNode *NVPTXLD =
1242 CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
1243 if (!NVPTXLD)
1244 return false;
1245
1246 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1247 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1248
1249 ReplaceNode(N, NVPTXLD);
1250 return true;
1251}
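// End-to-end sketch (illustrative): for "%v = load i32, ptr addrspace(1) %p"
// with a plain register pointer, the code above picks LD_i32_areg(_64) and
// builds its operand list as {ordering, addrspace, vectype, fromtype,
// fromtypewidth, pointer, chain}, which the instruction printer turns into
// something like "ld.global.u32 %r<N>, [%rd<M>];".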
1252
1253bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1254 MemSDNode *MemSD = cast<MemSDNode>(N);
1255 EVT LoadedVT = MemSD->getMemoryVT();
1256 if (!LoadedVT.isSimple())
1257 return false;
1258
1259 // Address Space Setting
1260 unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
1261 if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1262 return tryLDGLDU(N);
1263 }
1264 unsigned int PointerSize =
1265 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1266
1267 SDLoc DL(N);
1268 SDValue Chain = N->getOperand(0);
1269 auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
1270
1271 // Vector Setting
1272 MVT SimpleVT = LoadedVT.getSimpleVT();
1273
1274 // Type Setting: fromType + fromTypeWidth
1275 //
1276 // Sign : ISD::SEXTLOAD
1277 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1278 // type is integer
1279 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1280 MVT ScalarVT = SimpleVT.getScalarType();
1281 // Read at least 8 bits (predicates are stored as 8-bit values)
1282 unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
1283 unsigned int FromType;
1284 // The last operand holds the original LoadSDNode::getExtensionType() value
1285 unsigned ExtensionType = cast<ConstantSDNode>(
1286 N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1287 if (ExtensionType == ISD::SEXTLOAD)
1288 FromType = NVPTX::PTXLdStInstCode::Signed;
1289 else
1290 FromType = getLdStRegType(ScalarVT);
1291
1292 unsigned VecType;
1293
1294 switch (N->getOpcode()) {
1295 case NVPTXISD::LoadV2:
1296 VecType = NVPTX::PTXLdStInstCode::V2;
1297 break;
1298 case NVPTXISD::LoadV4:
1299 VecType = NVPTX::PTXLdStInstCode::V4;
1300 break;
1301 default:
1302 return false;
1303 }
1304
1305 EVT EltVT = N->getValueType(0);
1306
1307 // v8x16 is a special case. PTX doesn't have an ld.v8.x16
1308 // instruction. Instead, we split the vector into v2x16 chunks and
1309 // load them with ld.v4.b32.
1310 if (Isv2x16VT(EltVT)) {
1311 assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
1312 EltVT = MVT::i32;
1313 FromType = NVPTX::PTXLdStInstCode::Untyped;
1314 FromTypeWidth = 32;
1315 }
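// For instance (illustrative), a "load <8 x half>" reaches this point as a
// NVPTXISD::LoadV4 of four v2f16 elements; after the adjustment above it is
// selected as a single "ld.v4.b32" whose four 32-bit results each hold one
// packed f16x2 pair.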
1316
1317 SDValue Op1 = N->getOperand(1);
1319 std::optional<unsigned> Opcode;
1320 SDNode *LD;
1321
1323 getI32Imm(CodeAddrSpace, DL),
1324 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1325 getI32Imm(FromTypeWidth, DL)});
1326
1327 if (SelectDirectAddr(Op1, Addr)) {
1328 switch (N->getOpcode()) {
1329 default:
1330 return false;
1331 case NVPTXISD::LoadV2:
1332 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1333 NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1334 NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1335 NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1336 break;
1337 case NVPTXISD::LoadV4:
1338 Opcode =
1339 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1340 NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
1341 std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
1342 break;
1343 }
1344 if (!Opcode)
1345 return false;
1346 Ops.append({Addr, Chain});
1347 } else if (PointerSize == 64
1348 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
1349 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1350 switch (N->getOpcode()) {
1351 default:
1352 return false;
1353 case NVPTXISD::LoadV2:
1354 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1355 NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1356 NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1357 NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1358 break;
1359 case NVPTXISD::LoadV4:
1360 Opcode =
1361 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1362 NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
1363 std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
1364 break;
1365 }
1366 if (!Opcode)
1367 return false;
1368 Ops.append({Base, Offset, Chain});
1369 } else if (PointerSize == 64
1370 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1371 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1372 if (PointerSize == 64) {
1373 switch (N->getOpcode()) {
1374 default:
1375 return false;
1376 case NVPTXISD::LoadV2:
1377 Opcode =
1379 NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
1380 NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
1381 NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
1382 break;
1383 case NVPTXISD::LoadV4:
1384 Opcode = pickOpcodeForVT(
1385 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1386 NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
1387 NVPTX::LDV_f32_v4_ari_64, std::nullopt);
1388 break;
1389 }
1390 } else {
1391 switch (N->getOpcode()) {
1392 default:
1393 return false;
1394 case NVPTXISD::LoadV2:
1395 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1396 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1397 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1398 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1399 break;
1400 case NVPTXISD::LoadV4:
1401 Opcode =
1402 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1403 NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
1404 std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
1405 break;
1406 }
1407 }
1408 if (!Opcode)
1409 return false;
1410 Ops.append({Base, Offset, Chain});
1411 } else {
1412 if (PointerSize == 64) {
1413 switch (N->getOpcode()) {
1414 default:
1415 return false;
1416 case NVPTXISD::LoadV2:
1417 Opcode = pickOpcodeForVT(
1418 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1419 NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1420 NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1421 NVPTX::LDV_f64_v2_areg_64);
1422 break;
1423 case NVPTXISD::LoadV4:
1424 Opcode = pickOpcodeForVT(
1425 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1426 NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
1427 NVPTX::LDV_f32_v4_areg_64, std::nullopt);
1428 break;
1429 }
1430 } else {
1431 switch (N->getOpcode()) {
1432 default:
1433 return false;
1434 case NVPTXISD::LoadV2:
1435 Opcode =
1436 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1437 NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1438 NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
1439 NVPTX::LDV_f64_v2_areg);
1440 break;
1441 case NVPTXISD::LoadV4:
1442 Opcode =
1443 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1444 NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
1445 std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
1446 break;
1447 }
1448 }
1449 if (!Opcode)
1450 return false;
1451 Ops.append({Op1, Chain});
1452 }
1453 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1454
1455 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1456 CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1457
1458 ReplaceNode(N, LD);
1459 return true;
1460}
1461
1462bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1463 SDValue Op1;
1464 MemSDNode *Mem;
1465 bool IsLDG = true;
1466
1467 // If this is an LDG intrinsic, the address is the third operand. If it's an
1468 // LDG/LDU SD node (from custom vector handling), then it's the second operand.
1469 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1470 Op1 = N->getOperand(2);
1471 Mem = cast<MemIntrinsicSDNode>(N);
1472 unsigned IID = N->getConstantOperandVal(1);
1473 switch (IID) {
1474 default:
1475 return false;
1476 case Intrinsic::nvvm_ldg_global_f:
1477 case Intrinsic::nvvm_ldg_global_i:
1478 case Intrinsic::nvvm_ldg_global_p:
1479 IsLDG = true;
1480 break;
1481 case Intrinsic::nvvm_ldu_global_f:
1482 case Intrinsic::nvvm_ldu_global_i:
1483 case Intrinsic::nvvm_ldu_global_p:
1484 IsLDG = false;
1485 break;
1486 }
1487 } else {
1488 Op1 = N->getOperand(1);
1489 Mem = cast<MemSDNode>(N);
1490 }
1491
1492 EVT OrigType = N->getValueType(0);
1493 EVT EltVT = Mem->getMemoryVT();
1494 unsigned NumElts = 1;
1495 if (EltVT.isVector()) {
1496 NumElts = EltVT.getVectorNumElements();
1497 EltVT = EltVT.getVectorElementType();
1498 // vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
1499 if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
1500 (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
1501 (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
1502 assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1503 EltVT = OrigType;
1504 NumElts /= 2;
1505 } else if (OrigType == MVT::v4i8) {
1506 EltVT = OrigType;
1507 NumElts = 1;
1508 }
1509 }
1510
1511 // Build the "promoted" result VTList for the load. If we are really loading
1512 // i8s, then the return type will be promoted to i16 since we do not expose
1513 // 8-bit registers in NVPTX.
1514 EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1515 SmallVector<EVT, 5> InstVTs;
1516 for (unsigned i = 0; i != NumElts; ++i) {
1517 InstVTs.push_back(NodeVT);
1518 }
1519 InstVTs.push_back(MVT::Other);
1520 SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1521 SDValue Chain = N->getOperand(0);
1522
1523 std::optional<unsigned> Opcode;
1524 SDLoc DL(N);
1525 SDNode *LD;
1527
1528 if (SelectDirectAddr(Op1, Addr)) {
1529 switch (N->getOpcode()) {
1530 default:
1531 return false;
1532 case ISD::LOAD:
1534 if (IsLDG)
1535 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1536 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1537 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1538 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1539 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1540 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1541 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1542 else
1543 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1544 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1545 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1546 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1547 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1548 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1549 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1550 break;
1551 case NVPTXISD::LoadV2:
1552 case NVPTXISD::LDGV2:
1553 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1554 NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1555 NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1556 NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1557 NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1558 NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1559 NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1560 break;
1561 case NVPTXISD::LDUV2:
1562 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1563 NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1564 NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1565 NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1566 NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1567 NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1568 NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1569 break;
1570 case NVPTXISD::LoadV4:
1571 case NVPTXISD::LDGV4:
1572 Opcode = pickOpcodeForVT(
1573 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1574 NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1575 NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1576 NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1577 break;
1578 case NVPTXISD::LDUV4:
1579 Opcode = pickOpcodeForVT(
1580 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1581 NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1582 NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1583 NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1584 break;
1585 }
1586 if (!Opcode)
1587 return false;
1588 SDValue Ops[] = { Addr, Chain };
1589 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1590 } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1591 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1592 if (TM.is64Bit()) {
1593 switch (N->getOpcode()) {
1594 default:
1595 return false;
1596 case ISD::LOAD:
1598 if (IsLDG)
1599 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1600 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1601 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1602 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1603 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1604 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1605 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1606 else
1607 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1608 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1609 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1610 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1611 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1612 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1613 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1614 break;
1615 case NVPTXISD::LoadV2:
1616 case NVPTXISD::LDGV2:
1617 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1618 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1619 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1620 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1621 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1622 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1623 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1624 break;
1625 case NVPTXISD::LDUV2:
1626 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1627 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1628 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1629 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1630 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1631 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1632 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1633 break;
1634 case NVPTXISD::LoadV4:
1635 case NVPTXISD::LDGV4:
1636 Opcode = pickOpcodeForVT(
1637 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1638 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1639 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1640 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1641 break;
1642 case NVPTXISD::LDUV4:
1643 Opcode = pickOpcodeForVT(
1644 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1645 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1646 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1647 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1648 break;
1649 }
1650 } else {
1651 switch (N->getOpcode()) {
1652 default:
1653 return false;
1654 case ISD::LOAD:
1656 if (IsLDG)
1657 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1658 NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1659 NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1660 NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1661 NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1662 NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1663 NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1664 else
1665 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1666 NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1667 NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1668 NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1669 NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1670 NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1671 NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1672 break;
1673 case NVPTXISD::LoadV2:
1674 case NVPTXISD::LDGV2:
1675 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1676 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1677 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1678 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1679 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1680 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1681 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1682 break;
1683 case NVPTXISD::LDUV2:
1684 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1685 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1686 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1687 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1688 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1689 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1690 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1691 break;
1692 case NVPTXISD::LoadV4:
1693 case NVPTXISD::LDGV4:
1694 Opcode = pickOpcodeForVT(
1695 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1696 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1697 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1698 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1699 break;
1700 case NVPTXISD::LDUV4:
1701 Opcode = pickOpcodeForVT(
1702 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1703 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1704 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1705 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1706 break;
1707 }
1708 }
1709 if (!Opcode)
1710 return false;
1711 SDValue Ops[] = {Base, Offset, Chain};
1712 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1713 } else {
1714 if (TM.is64Bit()) {
1715 switch (N->getOpcode()) {
1716 default:
1717 return false;
1718 case ISD::LOAD:
1720 if (IsLDG)
1721 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1722 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1723 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1724 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1725 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1726 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1727 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1728 else
1729 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1730 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1731 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1732 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1733 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1734 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1735 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1736 break;
1737 case NVPTXISD::LoadV2:
1738 case NVPTXISD::LDGV2:
1739 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1740 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1741 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1742 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1743 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1744 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1745 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1746 break;
1747 case NVPTXISD::LDUV2:
1748 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1749 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1750 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1751 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1752 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1753 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1754 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1755 break;
1756 case NVPTXISD::LoadV4:
1757 case NVPTXISD::LDGV4:
1758 Opcode = pickOpcodeForVT(
1759 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1760 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1761 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1762 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1763 break;
1764 case NVPTXISD::LDUV4:
1765 Opcode = pickOpcodeForVT(
1766 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1767 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1768 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1769 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1770 break;
1771 }
1772 } else {
1773 switch (N->getOpcode()) {
1774 default:
1775 return false;
1776 case ISD::LOAD:
1778 if (IsLDG)
1779 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1780 NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1781 NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1782 NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1783 NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1784 NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1785 NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1786 else
1787 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1788 NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1789 NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1790 NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1791 NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1792 NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1793 NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1794 break;
1795 case NVPTXISD::LoadV2:
1796 case NVPTXISD::LDGV2:
1797 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1798 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1799 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1800 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1801 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1802 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1803 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1804 break;
1805 case NVPTXISD::LDUV2:
1806 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1807 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1808 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1809 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1810 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1811 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1812 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1813 break;
1814 case NVPTXISD::LoadV4:
1815 case NVPTXISD::LDGV4:
1816 Opcode = pickOpcodeForVT(
1817 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1818 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1819 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1820 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1821 break;
1822 case NVPTXISD::LDUV4:
1823 Opcode = pickOpcodeForVT(
1824 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1825 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1826 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1827 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1828 break;
1829 }
1830 }
1831 if (!Opcode)
1832 return false;
1833 SDValue Ops[] = { Op1, Chain };
1834 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1835 }
1836
1837 // For automatic generation of LDG (through SelectLoad[Vector], not the
1838 // intrinsics), we may have an extending load like:
1839 //
1840 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1841 //
1842 // In this case, the matching logic above will select a load for the original
1843 // memory type (in this case, i8) and our types will not match (the node needs
1844 // to return an i32 in this case). Our LDG/LDU nodes do not support the
1845 // concept of sign-/zero-extension, so emulate it here by adding an explicit
1846 // CVT instruction. Ptxas should clean up any redundancies here.
1847
1848 LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1849
1850 if (OrigType != EltVT &&
1851 (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
1852 // We have an extending-load. The instruction we selected operates on the
1853 // smaller type, but the SDNode we are replacing has the larger type. We
1854 // need to emit a CVT to make the types match.
1855 unsigned CvtOpc =
1856 GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
1857
1858 // For each output value, apply the manual sign/zero-extension and make sure
1859 // all users of the load go through that CVT.
1860 for (unsigned i = 0; i != NumElts; ++i) {
1861 SDValue Res(LD, i);
1862 SDValue OrigVal(N, i);
1863
1864 SDNode *CvtNode =
1865 CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1867 DL, MVT::i32));
1868 ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1869 }
1870 }
1871
1872 ReplaceNode(N, LD);
1873 return true;
1874}
1875
1876bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1877 MemSDNode *ST = cast<MemSDNode>(N);
1878 assert(ST->writeMem() && "Expected store");
1879 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1880 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1881 assert((PlainStore || AtomicStore) && "Expected store");
1882
1883 // do not support pre/post inc/dec
1884 if (PlainStore && PlainStore->isIndexed())
1885 return false;
1886
1887 EVT StoreVT = ST->getMemoryVT();
1888 if (!StoreVT.isSimple())
1889 return false;
1890
1891 // Address Space Setting
1892 unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1893 unsigned int PointerSize =
1894 CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1895
1896 SDLoc DL(N);
1897 SDValue Chain = ST->getChain();
1898 auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST);
1899
1900 // Vector Setting
1901 MVT SimpleVT = StoreVT.getSimpleVT();
1902 unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
1903
1904 // Type Setting: toType + toTypeWidth
1905 // - for integer type, always use 'u'
1906 MVT ScalarVT = SimpleVT.getScalarType();
1907 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1908 if (SimpleVT.isVector()) {
1909 assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
1910 "Unexpected vector type");
1911 // v2x16 is stored using st.b32
1912 ToTypeWidth = 32;
1913 }
1914
1915 unsigned int ToType = getLdStRegType(ScalarVT);
1916
1917 // Create the machine instruction DAG
1918 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1919 SDValue BasePtr = ST->getBasePtr();
1920 SDValue Addr;
1922 std::optional<unsigned> Opcode;
1923 MVT::SimpleValueType SourceVT =
1924 Value.getNode()->getSimpleValueType(0).SimpleTy;
1925
1927 getI32Imm(CodeAddrSpace, DL),
1928 getI32Imm(VecType, DL), getI32Imm(ToType, DL),
1929 getI32Imm(ToTypeWidth, DL)});
1930
1931 if (SelectDirectAddr(BasePtr, Addr)) {
1932 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1933 NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1934 NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1935 if (!Opcode)
1936 return false;
1937 Ops.append({Addr, Chain});
1938 } else if (PointerSize == 64
1939 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1940 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1941 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1942 NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1943 NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1944 if (!Opcode)
1945 return false;
1946 Ops.append({Base, Offset, Chain});
1947 } else if (PointerSize == 64
1948 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1949 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1950 if (PointerSize == 64)
1951 Opcode =
1952 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1953 NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
1954 NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1955 else
1956 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1957 NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1958 NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1959 if (!Opcode)
1960 return false;
1961 Ops.append({Base, Offset, Chain});
1962 } else {
1963 if (PointerSize == 64)
1964 Opcode =
1965 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1966 NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1967 NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1968 else
1969 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1970 NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1971 NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1972 if (!Opcode)
1973 return false;
1974 Ops.append({BasePtr, Chain});
1975 }
1976
1977 SDNode *NVPTXST =
1978 CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1979
1980 if (!NVPTXST)
1981 return false;
1982
1983 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1984 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1985 ReplaceNode(N, NVPTXST);
1986 return true;
1987}
1988
1989bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1990 SDValue Op1 = N->getOperand(1);
1992 std::optional<unsigned> Opcode;
1993 SDNode *ST;
1994 EVT EltVT = Op1.getValueType();
1995 MemSDNode *MemSD = cast<MemSDNode>(N);
1996 EVT StoreVT = MemSD->getMemoryVT();
1997
1998 // Address Space Setting
1999 unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
2000 if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
2001 report_fatal_error("Cannot store to pointer that points to constant "
2002 "memory space");
2003 }
2004 unsigned int PointerSize =
2005 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
2006
2007 SDLoc DL(N);
2008 SDValue Chain = N->getOperand(0);
2009 auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
2010
2011 // Type Setting: toType + toTypeWidth
2012 // - for integer type, always use 'u'
2013 assert(StoreVT.isSimple() && "Store value is not simple");
2014 MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
2015 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
2016 unsigned ToType = getLdStRegType(ScalarVT);
2017
2019 SDValue N2;
2020 unsigned VecType;
2021
2022 switch (N->getOpcode()) {
2023 case NVPTXISD::StoreV2:
2024 VecType = NVPTX::PTXLdStInstCode::V2;
2025 Ops.append({N->getOperand(1), N->getOperand(2)});
2026 N2 = N->getOperand(3);
2027 break;
2028 case NVPTXISD::StoreV4:
2029 VecType = NVPTX::PTXLdStInstCode::V4;
2030 Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3),
2031 N->getOperand(4)});
2032 N2 = N->getOperand(5);
2033 break;
2034 default:
2035 return false;
2036 }
2037
2038 // v8x16 is a special case. PTX doesn't have st.v8.x16
2039 // instruction. Instead, we split the vector into v2x16 chunks and
2040 // store them with st.v4.b32.
2041 if (Isv2x16VT(EltVT)) {
2042 assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
2043 EltVT = MVT::i32;
2044 ToType = NVPTX::PTXLdStInstCode::Untyped;
2045 ToTypeWidth = 32;
2046 }
2047
2048 Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL),
2049 getI32Imm(VecType, DL), getI32Imm(ToType, DL),
2050 getI32Imm(ToTypeWidth, DL)});
2051
2052 if (SelectDirectAddr(N2, Addr)) {
2053 switch (N->getOpcode()) {
2054 default:
2055 return false;
2056 case NVPTXISD::StoreV2:
2057 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2058 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
2059 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
2060 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
2061 break;
2062 case NVPTXISD::StoreV4:
2063 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2064 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
2065 NVPTX::STV_i32_v4_avar, std::nullopt,
2066 NVPTX::STV_f32_v4_avar, std::nullopt);
2067 break;
2068 }
2069 Ops.push_back(Addr);
2070 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
2071 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
2072 switch (N->getOpcode()) {
2073 default:
2074 return false;
2075 case NVPTXISD::StoreV2:
2076 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2077 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
2078 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
2079 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
2080 break;
2081 case NVPTXISD::StoreV4:
2082 Opcode =
2083 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
2084 NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
2085 std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
2086 break;
2087 }
2088 Ops.append({Base, Offset});
2089 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
2090 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
2091 if (PointerSize == 64) {
2092 switch (N->getOpcode()) {
2093 default:
2094 return false;
2095 case NVPTXISD::StoreV2:
2096 Opcode =
2098 NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
2099 NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
2100 NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
2101 break;
2102 case NVPTXISD::StoreV4:
2103 Opcode = pickOpcodeForVT(
2104 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2105 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
2106 NVPTX::STV_f32_v4_ari_64, std::nullopt);
2107 break;
2108 }
2109 } else {
2110 switch (N->getOpcode()) {
2111 default:
2112 return false;
2113 case NVPTXISD::StoreV2:
2114 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2115 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2116 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2117 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2118 break;
2119 case NVPTXISD::StoreV4:
2120 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2121 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
2122 NVPTX::STV_i32_v4_ari, std::nullopt,
2123 NVPTX::STV_f32_v4_ari, std::nullopt);
2124 break;
2125 }
2126 }
2127 Ops.append({Base, Offset});
2128 } else {
2129 if (PointerSize == 64) {
2130 switch (N->getOpcode()) {
2131 default:
2132 return false;
2133 case NVPTXISD::StoreV2:
2134 Opcode = pickOpcodeForVT(
2135 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2136 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2137 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2138 NVPTX::STV_f64_v2_areg_64);
2139 break;
2140 case NVPTXISD::StoreV4:
2141 Opcode = pickOpcodeForVT(
2142 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2143 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
2144 NVPTX::STV_f32_v4_areg_64, std::nullopt);
2145 break;
2146 }
2147 } else {
2148 switch (N->getOpcode()) {
2149 default:
2150 return false;
2151 case NVPTXISD::StoreV2:
2152 Opcode =
2153 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2154 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2155 NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
2156 NVPTX::STV_f64_v2_areg);
2157 break;
2158 case NVPTXISD::StoreV4:
2159 Opcode =
2160 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2161 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
2162 std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
2163 break;
2164 }
2165 }
2166 Ops.push_back(N2);
2167 }
2168
2169 if (!Opcode)
2170 return false;
2171
2172 Ops.push_back(Chain);
2173
2174 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2175
2176 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2177 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2178
2179 ReplaceNode(N, ST);
2180 return true;
2181}
2182
2183bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2184 SDValue Chain = Node->getOperand(0);
2185 SDValue Offset = Node->getOperand(2);
2186 SDValue Glue = Node->getOperand(3);
2187 SDLoc DL(Node);
2188 MemSDNode *Mem = cast<MemSDNode>(Node);
2189
2190 unsigned VecSize;
2191 switch (Node->getOpcode()) {
2192 default:
2193 return false;
2194 case NVPTXISD::LoadParam:
2195 VecSize = 1;
2196 break;
2197 case NVPTXISD::LoadParamV2:
2198 VecSize = 2;
2199 break;
2200 case NVPTXISD::LoadParamV4:
2201 VecSize = 4;
2202 break;
2203 }
2204
2205 EVT EltVT = Node->getValueType(0);
2206 EVT MemVT = Mem->getMemoryVT();
2207
2208 std::optional<unsigned> Opcode;
2209
2210 switch (VecSize) {
2211 default:
2212 return false;
2213 case 1:
2214 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2215 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2216 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2217 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2218 break;
2219 case 2:
2220 Opcode =
2221 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2222 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2223 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
2224 NVPTX::LoadParamMemV2F64);
2225 break;
2226 case 4:
2227 Opcode =
2228 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2229 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
2230 std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
2231 break;
2232 }
2233 if (!Opcode)
2234 return false;
2235
2236 SDVTList VTs;
2237 if (VecSize == 1) {
2238 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2239 } else if (VecSize == 2) {
2240 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2241 } else {
2242 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2243 VTs = CurDAG->getVTList(EVTs);
2244 }
2245
2246 unsigned OffsetVal = Offset->getAsZExtVal();
2247
2248 SmallVector<SDValue, 2> Ops(
2249 {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
2250
2251 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2252 return true;
2253}
2254
2255bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2256 SDLoc DL(N);
2257 SDValue Chain = N->getOperand(0);
2258 SDValue Offset = N->getOperand(1);
2259 unsigned OffsetVal = Offset->getAsZExtVal();
2260 MemSDNode *Mem = cast<MemSDNode>(N);
2261
2262 // How many elements do we have?
2263 unsigned NumElts = 1;
2264 switch (N->getOpcode()) {
2265 default:
2266 return false;
2267 case NVPTXISD::StoreRetval:
2268 NumElts = 1;
2269 break;
2270 case NVPTXISD::StoreRetvalV2:
2271 NumElts = 2;
2272 break;
2273 case NVPTXISD::StoreRetvalV4:
2274 NumElts = 4;
2275 break;
2276 }
2277
2278 // Build vector of operands
2279 SmallVector<SDValue, 6> Ops;
2280 for (unsigned i = 0; i < NumElts; ++i)
2281 Ops.push_back(N->getOperand(i + 2));
2282 Ops.append({CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain});
2283
2284 // Determine target opcode
2285 // If we have an i1, use an 8-bit store. The lowering code in
2286 // NVPTXISelLowering will have already emitted an upcast.
2287 std::optional<unsigned> Opcode = 0;
2288 switch (NumElts) {
2289 default:
2290 return false;
2291 case 1:
2292 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2293 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2294 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2295 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2296 if (Opcode == NVPTX::StoreRetvalI8) {
2297 // Fine tune the opcode depending on the size of the operand.
2298 // This helps to avoid creating redundant COPY instructions in
2299 // InstrEmitter::AddRegisterOperand().
2300 switch (Ops[0].getSimpleValueType().SimpleTy) {
2301 default:
2302 break;
2303 case MVT::i32:
2304 Opcode = NVPTX::StoreRetvalI8TruncI32;
2305 break;
2306 case MVT::i64:
2307 Opcode = NVPTX::StoreRetvalI8TruncI64;
2308 break;
2309 }
2310 }
2311 break;
2312 case 2:
2313 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2314 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2315 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2316 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2317 break;
2318 case 4:
2319 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2320 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2321 NVPTX::StoreRetvalV4I32, std::nullopt,
2322 NVPTX::StoreRetvalV4F32, std::nullopt);
2323 break;
2324 }
2325 if (!Opcode)
2326 return false;
2327
2328 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2329 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2330 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2331
2332 ReplaceNode(N, Ret);
2333 return true;
2334}
2335
2336// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
2337#define getOpcV2H(ty, opKind0, opKind1) \
2338 NVPTX::StoreParamV2##ty##_##opKind0##opKind1
2339
2340#define getOpcV2H1(ty, opKind0, isImm1) \
2341 (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
2342
2343#define getOpcodeForVectorStParamV2(ty, isimm) \
2344 (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
2345
2346#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
2347 NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
2348
2349#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
2350 (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
2351 : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
2352
2353#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
2354 (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
2355 : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
2356
2357#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
2358 (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
2359 : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
2360
2361#define getOpcodeForVectorStParamV4(ty, isimm) \
2362 (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
2363 : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
2364
2365#define getOpcodeForVectorStParam(n, ty, isimm) \
2366 (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
2367 : getOpcodeForVectorStParamV4(ty, isimm)
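// For example, getOpcodeForVectorStParamV2(I32, isimm) with isimm = {true, false}
// expands to NVPTX::StoreParamV2I32_ir: an immediate first operand ('i') and a
// register second operand ('r').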
2368
2369 static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
2370 unsigned NumElts,
2371 MVT::SimpleValueType MemTy,
2372 SelectionDAG *CurDAG, SDLoc DL) {
2373 // Determine which inputs are registers and which are immediates, and
2374 // rebuild the immediate inputs as target constants.
2375 SmallVector<bool, 4> IsImm(NumElts, false);
2376 for (unsigned i = 0; i < NumElts; i++) {
2377 IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
2378 if (IsImm[i]) {
2379 SDValue Imm = Ops[i];
2380 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2381 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2382 const ConstantFP *CF = ConstImm->getConstantFPValue();
2383 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2384 } else {
2385 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2386 const ConstantInt *CI = ConstImm->getConstantIntValue();
2387 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2388 }
2389 Ops[i] = Imm;
2390 }
2391 }
2392
2393 // Get opcode for MemTy, size, and register/immediate operand ordering
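// e.g. NumElts == 4, MemTy == MVT::f32 and IsImm == {false, false, true, false}
// resolves to NVPTX::StoreParamV4F32_rrir.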
2394 switch (MemTy) {
2395 case MVT::i8:
2396 return getOpcodeForVectorStParam(NumElts, I8, IsImm);
2397 case MVT::i16:
2398 return getOpcodeForVectorStParam(NumElts, I16, IsImm);
2399 case MVT::i32:
2400 return getOpcodeForVectorStParam(NumElts, I32, IsImm);
2401 case MVT::i64:
2402 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2403 return getOpcodeForVectorStParamV2(I64, IsImm);
2404 case MVT::f32:
2405 return getOpcodeForVectorStParam(NumElts, F32, IsImm);
2406 case MVT::f64:
2407 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2408 return getOpcodeForVectorStParamV2(F64, IsImm);
2409
2410 // These cases don't support immediates, just use the all register version
2411 // and generate moves.
2412 case MVT::i1:
2413 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
2414 : NVPTX::StoreParamV4I8_rrrr;
2415 case MVT::f16:
2416 case MVT::bf16:
2417 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
2418 : NVPTX::StoreParamV4I16_rrrr;
2419 case MVT::v2f16:
2420 case MVT::v2bf16:
2421 case MVT::v2i16:
2422 case MVT::v4i8:
2423 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
2424 : NVPTX::StoreParamV4I32_rrrr;
2425 default:
2426 llvm_unreachable("Cannot select st.param for unknown MemTy");
2427 }
2428}
2429
2430bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2431 SDLoc DL(N);
2432 SDValue Chain = N->getOperand(0);
2433 SDValue Param = N->getOperand(1);
2434 unsigned ParamVal = Param->getAsZExtVal();
2435 SDValue Offset = N->getOperand(2);
2436 unsigned OffsetVal = Offset->getAsZExtVal();
2437 MemSDNode *Mem = cast<MemSDNode>(N);
2438 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2439
2440 // How many elements do we have?
2441 unsigned NumElts;
2442 switch (N->getOpcode()) {
2443 default:
2444 llvm_unreachable("Unexpected opcode");
2445 case NVPTXISD::StoreParamU32:
2446 case NVPTXISD::StoreParamS32:
2447 case NVPTXISD::StoreParam:
2448 NumElts = 1;
2449 break;
2450 case NVPTXISD::StoreParamV2:
2451 NumElts = 2;
2452 break;
2453 case NVPTXISD::StoreParamV4:
2454 NumElts = 4;
2455 break;
2456 }
2457
2458 // Build vector of operands
2459 SmallVector<SDValue, 8> Ops;
2460 for (unsigned i = 0; i < NumElts; ++i)
2461 Ops.push_back(N->getOperand(i + 3));
2462 Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
2463 CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
2464
2465 // Determine target opcode
2466 // If we have an i1, use an 8-bit store. The lowering code in
2467 // NVPTXISelLowering will have already emitted an upcast.
2468 std::optional<unsigned> Opcode;
2469 switch (N->getOpcode()) {
2470 default:
2471 switch (NumElts) {
2472 default:
2473 llvm_unreachable("Unexpected NumElts");
2474 case 1: {
2475 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2476 SDValue Imm = Ops[0];
2477 if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
2478 (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
2479 // Convert immediate to target constant
2480 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2481 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2482 const ConstantFP *CF = ConstImm->getConstantFPValue();
2483 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2484 } else {
2485 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2486 const ConstantInt *CI = ConstImm->getConstantIntValue();
2487 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2488 }
2489 Ops[0] = Imm;
2490 // Use immediate version of store param
2491 Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
2492 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
2493 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
2494 NVPTX::StoreParamF64_i);
2495 } else
2496 Opcode =
2497 pickOpcodeForVT(MemTy,
2498 NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
2499 NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
2500 NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
2501 if (Opcode == NVPTX::StoreParamI8_r) {
2502 // Fine tune the opcode depending on the size of the operand.
2503 // This helps to avoid creating redundant COPY instructions in
2504 // InstrEmitter::AddRegisterOperand().
2505 switch (Ops[0].getSimpleValueType().SimpleTy) {
2506 default:
2507 break;
2508 case MVT::i32:
2509 Opcode = NVPTX::StoreParamI8TruncI32_r;
2510 break;
2511 case MVT::i64:
2512 Opcode = NVPTX::StoreParamI8TruncI64_r;
2513 break;
2514 }
2515 }
2516 break;
2517 }
2518 case 2:
2519 case 4: {
2520 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2521 Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
2522 break;
2523 }
2524 }
2525 break;
2526 // Special case: if we have a sign-extend/zero-extend node, insert the
2527 // conversion instruction first, and use that as the value operand to
2528 // the selected StoreParam node.
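// For example, a 16-bit value being passed through a 32-bit StoreParam slot is
// first widened with CVT_u32_u16 (or CVT_s32_s16 for the signed form) and the
// widened result is then stored with StoreParamI32_r.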
2529 case NVPTXISD::StoreParamU32: {
2530 Opcode = NVPTX::StoreParamI32_r;
2531 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2532 MVT::i32);
2533 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2534 MVT::i32, Ops[0], CvtNone);
2535 Ops[0] = SDValue(Cvt, 0);
2536 break;
2537 }
2538 case NVPTXISD::StoreParamS32: {
2539 Opcode = NVPTX::StoreParamI32_r;
2540 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2541 MVT::i32);
2542 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2543 MVT::i32, Ops[0], CvtNone);
2544 Ops[0] = SDValue(Cvt, 0);
2545 break;
2546 }
2547 }
2548
2549 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2550 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2551 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2552 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2553
2554 ReplaceNode(N, Ret);
2555 return true;
2556}
2557
2558bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2559 unsigned Opc = 0;
2560
2561 switch (N->getOpcode()) {
2562 default: return false;
2564 Opc = NVPTX::TEX_1D_F32_S32_RR;
2565 break;
2567 Opc = NVPTX::TEX_1D_F32_F32_RR;
2568 break;
2570 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2571 break;
2573 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2574 break;
2576 Opc = NVPTX::TEX_1D_S32_S32_RR;
2577 break;
2579 Opc = NVPTX::TEX_1D_S32_F32_RR;
2580 break;
2582 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2583 break;
2585 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2586 break;
2588 Opc = NVPTX::TEX_1D_U32_S32_RR;
2589 break;
2591 Opc = NVPTX::TEX_1D_U32_F32_RR;
2592 break;
2594 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2595 break;
2597 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2598 break;
2600 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2601 break;
2603 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2604 break;
2606 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2607 break;
2609 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2610 break;
2612 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2613 break;
2615 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2616 break;
2618 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2619 break;
2621 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2622 break;
2624 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2625 break;
2627 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2628 break;
2630 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2631 break;
2633 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2634 break;
2636 Opc = NVPTX::TEX_2D_F32_S32_RR;
2637 break;
2639 Opc = NVPTX::TEX_2D_F32_F32_RR;
2640 break;
2642 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2643 break;
2645 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2646 break;
2648 Opc = NVPTX::TEX_2D_S32_S32_RR;
2649 break;
2651 Opc = NVPTX::TEX_2D_S32_F32_RR;
2652 break;
2654 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2655 break;
2657 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2658 break;
2660 Opc = NVPTX::TEX_2D_U32_S32_RR;
2661 break;
2663 Opc = NVPTX::TEX_2D_U32_F32_RR;
2664 break;
2666 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2667 break;
2669 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2670 break;
2672 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2673 break;
2675 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2676 break;
2678 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2679 break;
2681 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2682 break;
2684 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2685 break;
2687 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2688 break;
2690 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2691 break;
2693 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2694 break;
2696 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2697 break;
2699 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2700 break;
2702 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2703 break;
2705 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2706 break;
2708 Opc = NVPTX::TEX_3D_F32_S32_RR;
2709 break;
2711 Opc = NVPTX::TEX_3D_F32_F32_RR;
2712 break;
2714 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2715 break;
2717 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2718 break;
2720 Opc = NVPTX::TEX_3D_S32_S32_RR;
2721 break;
2723 Opc = NVPTX::TEX_3D_S32_F32_RR;
2724 break;
2726 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2727 break;
2729 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2730 break;
2732 Opc = NVPTX::TEX_3D_U32_S32_RR;
2733 break;
2735 Opc = NVPTX::TEX_3D_U32_F32_RR;
2736 break;
2738 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2739 break;
2741 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2742 break;
2744 Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2745 break;
2747 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2748 break;
2750 Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2751 break;
2753 Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2754 break;
2756 Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2757 break;
2759 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2760 break;
2762 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2763 break;
2765 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2766 break;
2768 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2769 break;
2771 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2772 break;
2774 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2775 break;
2777 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2778 break;
2780 Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2781 break;
2783 Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2784 break;
2786 Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2787 break;
2789 Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2790 break;
2792 Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2793 break;
2795 Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2796 break;
2798 Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2799 break;
2801 Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2802 break;
2804 Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2805 break;
2807 Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2808 break;
2810 Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2811 break;
2813 Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2814 break;
2816 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2817 break;
2819 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2820 break;
2822 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2823 break;
2825 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2826 break;
2828 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2829 break;
2831 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2832 break;
2834 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2835 break;
2837 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2838 break;
2840 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2841 break;
2843 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2844 break;
2846 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2847 break;
2849 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2850 break;
2852 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2853 break;
2855 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2856 break;
2858 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2859 break;
2861 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2862 break;
2864 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2865 break;
2867 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2868 break;
2870 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2871 break;
2873 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2874 break;
2876 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2877 break;
2879 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2880 break;
2882 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2883 break;
2885 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2886 break;
2888 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2889 break;
2891 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2892 break;
2894 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2895 break;
2897 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2898 break;
2900 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2901 break;
2903 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2904 break;
2906 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2907 break;
2909 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2910 break;
2912 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2913 break;
2915 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2916 break;
2918 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2919 break;
2921 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2922 break;
2924 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2925 break;
2927 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2928 break;
2930 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2931 break;
2933 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2934 break;
2936 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2937 break;
2939 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2940 break;
2942 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2943 break;
2945 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2946 break;
2948 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2949 break;
2951 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2952 break;
2954 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2955 break;
2957 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2958 break;
2960 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2961 break;
2963 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
2964 break;
2966 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
2967 break;
2969 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
2970 break;
2972 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
2973 break;
2975 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
2976 break;
2978 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
2979 break;
2981 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
2982 break;
2984 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
2985 break;
2987 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
2988 break;
2990 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
2991 break;
2993 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
2994 break;
2996 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
2997 break;
2999 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
3000 break;
3002 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
3003 break;
3005 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
3006 break;
3008 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
3009 break;
3011 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
3012 break;
3014 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
3015 break;
3017 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
3018 break;
3020 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
3021 break;
3023 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
3024 break;
3026 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
3027 break;
3029 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
3030 break;
3032 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
3033 break;
3035 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
3036 break;
3038 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
3039 break;
3041 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
3042 break;
3044 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
3045 break;
3047 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
3048 break;
3050 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
3051 break;
3053 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
3054 break;
3056 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
3057 break;
3059 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
3060 break;
3062 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
3063 break;
3065 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
3066 break;
3068 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R;
3069 break;
3071 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R;
3072 break;
3074 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R;
3075 break;
3077 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R;
3078 break;
3080 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R;
3081 break;
3083 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R;
3084 break;
3085 }
3086
3087 // Copy over operands
3088 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3089 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3090
3091 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3092 return true;
3093}
3094
3095bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
3096 unsigned Opc = 0;
3097 switch (N->getOpcode()) {
3098 default: return false;
3100 Opc = NVPTX::SULD_1D_I8_CLAMP_R;
3101 break;
3103 Opc = NVPTX::SULD_1D_I16_CLAMP_R;
3104 break;
3106 Opc = NVPTX::SULD_1D_I32_CLAMP_R;
3107 break;
3109 Opc = NVPTX::SULD_1D_I64_CLAMP_R;
3110 break;
3112 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
3113 break;
3115 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
3116 break;
3118 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
3119 break;
3121 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
3122 break;
3124 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
3125 break;
3127 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
3128 break;
3130 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
3131 break;
3133 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
3134 break;
3136 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
3137 break;
3139 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
3140 break;
3142 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
3143 break;
3145 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
3146 break;
3148 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
3149 break;
3151 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
3152 break;
3154 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
3155 break;
3157 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
3158 break;
3160 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
3161 break;
3163 Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
3164 break;
3166 Opc = NVPTX::SULD_2D_I8_CLAMP_R;
3167 break;
3169 Opc = NVPTX::SULD_2D_I16_CLAMP_R;
3170 break;
3172 Opc = NVPTX::SULD_2D_I32_CLAMP_R;
3173 break;
3175 Opc = NVPTX::SULD_2D_I64_CLAMP_R;
3176 break;
3178 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
3179 break;
3181 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
3182 break;
3184 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
3185 break;
3187 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
3188 break;
3190 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
3191 break;
3193 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
3194 break;
3196 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
3197 break;
3199 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
3200 break;
3202 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
3203 break;
3205 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
3206 break;
3208 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
3209 break;
3211 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
3212 break;
3214 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
3215 break;
3217 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
3218 break;
3220 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
3221 break;
3223 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
3224 break;
3226 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
3227 break;
3229 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
3230 break;
3232 Opc = NVPTX::SULD_3D_I8_CLAMP_R;
3233 break;
3235 Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3236 break;
3238 Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3239 break;
3241 Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3242 break;
3244 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3245 break;
3247 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3248 break;
3250 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3251 break;
3253 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3254 break;
3256 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3257 break;
3259 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3260 break;
3262 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3263 break;
3265 Opc = NVPTX::SULD_1D_I8_TRAP_R;
3266 break;
3268 Opc = NVPTX::SULD_1D_I16_TRAP_R;
3269 break;
3271 Opc = NVPTX::SULD_1D_I32_TRAP_R;
3272 break;
3274 Opc = NVPTX::SULD_1D_I64_TRAP_R;
3275 break;
3277 Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3278 break;
3280 Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3281 break;
3283 Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3284 break;
3286 Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3287 break;
3289 Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3290 break;
3292 Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3293 break;
3295 Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3296 break;
3298 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3299 break;
3301 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3302 break;
3304 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3305 break;
3307 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3308 break;
3310 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3311 break;
3313 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3314 break;
3316 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3317 break;
3319 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3320 break;
3322 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3323 break;
3325 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3326 break;
3328 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3329 break;
3331 Opc = NVPTX::SULD_2D_I8_TRAP_R;
3332 break;
3334 Opc = NVPTX::SULD_2D_I16_TRAP_R;
3335 break;
3337 Opc = NVPTX::SULD_2D_I32_TRAP_R;
3338 break;
3340 Opc = NVPTX::SULD_2D_I64_TRAP_R;
3341 break;
3343 Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3344 break;
3346 Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3347 break;
3349 Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3350 break;
3352 Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3353 break;
3355 Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3356 break;
3358 Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3359 break;
3361 Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3362 break;
3364 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3365 break;
3367 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3368 break;
3370 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3371 break;
3373 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3374 break;
3376 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3377 break;
3379 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3380 break;
3382 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3383 break;
3385 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3386 break;
3388 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3389 break;
3391 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3392 break;
3394 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3395 break;
3397 Opc = NVPTX::SULD_3D_I8_TRAP_R;
3398 break;
3400 Opc = NVPTX::SULD_3D_I16_TRAP_R;
3401 break;
3403 Opc = NVPTX::SULD_3D_I32_TRAP_R;
3404 break;
3406 Opc = NVPTX::SULD_3D_I64_TRAP_R;
3407 break;
3409 Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3410 break;
3412 Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3413 break;
3415 Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3416 break;
3418 Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3419 break;
3421 Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3422 break;
3424 Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3425 break;
3427 Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3428 break;
3430 Opc = NVPTX::SULD_1D_I8_ZERO_R;
3431 break;
3433 Opc = NVPTX::SULD_1D_I16_ZERO_R;
3434 break;
3436 Opc = NVPTX::SULD_1D_I32_ZERO_R;
3437 break;
3439 Opc = NVPTX::SULD_1D_I64_ZERO_R;
3440 break;
3442 Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3443 break;
3445 Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3446 break;
3448 Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3449 break;
3451 Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3452 break;
3454 Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3455 break;
3457 Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3458 break;
3460 Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3461 break;
3463 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3464 break;
3466 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3467 break;
3469 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3470 break;
3472 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3473 break;
3475 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3476 break;
3478 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3479 break;
3481 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3482 break;
3484 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3485 break;
3487 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3488 break;
3490 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3491 break;
3493 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3494 break;
3496 Opc = NVPTX::SULD_2D_I8_ZERO_R;
3497 break;
3499 Opc = NVPTX::SULD_2D_I16_ZERO_R;
3500 break;
3502 Opc = NVPTX::SULD_2D_I32_ZERO_R;
3503 break;
3505 Opc = NVPTX::SULD_2D_I64_ZERO_R;
3506 break;
3508 Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3509 break;
3511 Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3512 break;
3514 Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3515 break;
3517 Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3518 break;
3520 Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3521 break;
3523 Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3524 break;
3526 Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3527 break;
3529 Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3530 break;
3532 Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3533 break;
3535 Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3536 break;
3538 Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3539 break;
3541 Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3542 break;
3544 Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3545 break;
3547 Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3548 break;
3550 Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3551 break;
3553 Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3554 break;
3556 Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3557 break;
3559 Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3560 break;
3562 Opc = NVPTX::SULD_3D_I8_ZERO_R;
3563 break;
3565 Opc = NVPTX::SULD_3D_I16_ZERO_R;
3566 break;
3568 Opc = NVPTX::SULD_3D_I32_ZERO_R;
3569 break;
3571 Opc = NVPTX::SULD_3D_I64_ZERO_R;
3572 break;
3574 Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3575 break;
3577 Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3578 break;
3580 Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3581 break;
3583 Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3584 break;
3586 Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3587 break;
3589 Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3590 break;
3592 Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3593 break;
3594 }
3595
3596 // Copy over operands
3597 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3598 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3599
3600 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3601 return true;
3602}
3603
3604
3605/// SelectBFE - Look for instruction sequences that can be made more efficient
3606/// by using the 'bfe' (bit-field extract) PTX instruction
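/// For example, (and (srl x, 4), 255) on an i32 value can be selected as
/// bfe.u32 x, 4, 8 (start bit 4, field length 8).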
3607bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3608 SDLoc DL(N);
3609 SDValue LHS = N->getOperand(0);
3610 SDValue RHS = N->getOperand(1);
3611 SDValue Len;
3612 SDValue Start;
3613 SDValue Val;
3614 bool IsSigned = false;
3615
3616 if (N->getOpcode() == ISD::AND) {
3617 // Canonicalize the operands
3618 // We want 'and %val, %mask'
3619 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3620 std::swap(LHS, RHS);
3621 }
3622
3623 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3624 if (!Mask) {
3625 // We need a constant mask on the RHS of the AND
3626 return false;
3627 }
3628
3629 // Extract the mask bits
3630 uint64_t MaskVal = Mask->getZExtValue();
3631 if (!isMask_64(MaskVal)) {
3632 // We *could* handle shifted masks here, but doing so would require an
3633 // 'and' operation to fix up the low-order bits so we would trade
3634 // shr+and for bfe+and, which has the same throughput
3635 return false;
3636 }
3637
3638 // How many bits are in our mask?
3639 int64_t NumBits = countr_one(MaskVal);
3640 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3641
3642 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3643 // We have a 'srl/and' pair, extract the effective start bit and length
3644 Val = LHS.getNode()->getOperand(0);
3645 Start = LHS.getNode()->getOperand(1);
3646 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3647 if (StartConst) {
3648 uint64_t StartVal = StartConst->getZExtValue();
3649 // How many "good" bits do we have left? "good" is defined here as bits
3650 // that exist in the original value, not shifted in.
3651 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3652 if (NumBits > GoodBits) {
3653 // Do not handle the case where bits have been shifted in. In theory
3654 // we could handle this, but the cost is likely higher than just
3655 // emitting the srl/and pair.
3656 return false;
3657 }
3658 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3659 } else {
3660 // Do not handle the case where the shift amount (can be zero if no srl
3661 // was found) is not constant. We could handle this case, but it would
3662 // require run-time logic that would be more expensive than just
3663 // emitting the srl/and pair.
3664 return false;
3665 }
3666 } else {
3667 // Do not handle the case where the LHS of the and is not a shift. While
3668 // it would be trivial to handle this case, it would just transform
3669 // 'and' -> 'bfe', but 'and' has higher throughput.
3670 return false;
3671 }
3672 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3673 if (LHS->getOpcode() == ISD::AND) {
3674 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3675 if (!ShiftCnst) {
3676 // Shift amount must be constant
3677 return false;
3678 }
3679
3680 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3681
3682 SDValue AndLHS = LHS->getOperand(0);
3683 SDValue AndRHS = LHS->getOperand(1);
3684
3685 // Canonicalize the AND to have the mask on the RHS
3686 if (isa<ConstantSDNode>(AndLHS)) {
3687 std::swap(AndLHS, AndRHS);
3688 }
3689
3690 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3691 if (!MaskCnst) {
3692 // Mask must be constant
3693 return false;
3694 }
3695
3696 uint64_t MaskVal = MaskCnst->getZExtValue();
3697 uint64_t NumZeros;
3698 uint64_t NumBits;
3699 if (isMask_64(MaskVal)) {
3700 NumZeros = 0;
3701 // The number of bits in the result bitfield will be the number of
3702 // trailing ones (the AND) minus the number of bits we shift off
3703 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3704 } else if (isShiftedMask_64(MaskVal)) {
3705 NumZeros = llvm::countr_zero(MaskVal);
3706 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3707 // The number of bits in the result bitfield will be the number of
3708 // trailing zeros plus the number of set bits in the mask minus the
3709 // number of bits we shift off
3710 NumBits = NumZeros + NumOnes - ShiftAmt;
3711 } else {
3712 // This is not a mask we can handle
3713 return false;
3714 }
3715
3716 if (ShiftAmt < NumZeros) {
3717 // Handling this case would require extra logic that would make this
3718 // transformation non-profitable
3719 return false;
3720 }
3721
3722 Val = AndLHS;
3723 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3724 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3725 } else if (LHS->getOpcode() == ISD::SHL) {
3726 // Here, we have a pattern like:
3727 //
3728 // (sra (shl val, NN), MM)
3729 // or
3730 // (srl (shl val, NN), MM)
3731 //
3732 // If MM >= NN, we can efficiently optimize this with bfe
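// For example (illustrative): (srl (shl x, 8), 16) on an i32 value keeps bits
// 8..23 of x, so it maps to bfe.u32 x, 8, 16 below: Start = 16 - 8 = 8 and
// Len = 32 - 16 = 16.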
3733 Val = LHS->getOperand(0);
3734
3735 SDValue ShlRHS = LHS->getOperand(1);
3736 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3737 if (!ShlCnst) {
3738 // Shift amount must be constant
3739 return false;
3740 }
3741 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3742
3743 SDValue ShrRHS = RHS;
3744 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3745 if (!ShrCnst) {
3746 // Shift amount must be constant
3747 return false;
3748 }
3749 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3750
3751 // To avoid extra codegen and be profitable, we need Outer >= Inner
3752 if (OuterShiftAmt < InnerShiftAmt) {
3753 return false;
3754 }
3755
3756 // If the outer shift is more than the type size, we have no bitfield to
3757 // extract (since we also check that the inner shift is <= the outer shift,
3758 // this also implies that the inner shift is < the type size).
3759 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3760 return false;
3761 }
3762
3763 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3764 MVT::i32);
3765 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3766 DL, MVT::i32);
3767
3768 if (N->getOpcode() == ISD::SRA) {
3769 // If we have an arithmetic right shift, we need to use the signed bfe
3770 // variant.
3771 IsSigned = true;
3772 }
3773 } else {
3774 // No can do...
3775 return false;
3776 }
3777 } else {
3778 // No can do...
3779 return false;
3780 }
3781
3782
3783 unsigned Opc;
3784 // For the BFE operations we form here from "and" and "srl", always use the
3785 // unsigned variants.
3786 if (Val.getValueType() == MVT::i32) {
3787 if (IsSigned) {
3788 Opc = NVPTX::BFE_S32rii;
3789 } else {
3790 Opc = NVPTX::BFE_U32rii;
3791 }
3792 } else if (Val.getValueType() == MVT::i64) {
3793 if (IsSigned) {
3794 Opc = NVPTX::BFE_S64rii;
3795 } else {
3796 Opc = NVPTX::BFE_U64rii;
3797 }
3798 } else {
3799 // We cannot handle this type
3800 return false;
3801 }
3802
3803 SDValue Ops[] = {
3804 Val, Start, Len
3805 };
3806
3807 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3808 return true;
3809}
3810
3811// SelectDirectAddr - Match a direct address for DAG.
3812// A direct address could be a globaladdress or externalsymbol.
3813bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3814 // Return true if TGA or ES.
3815 if (N.getOpcode() == ISD::TargetGlobalAddress ||
3816 N.getOpcode() == ISD::TargetExternalSymbol) {
3817 Address = N;
3818 return true;
3819 }
3820 if (N.getOpcode() == NVPTXISD::Wrapper) {
3821 Address = N.getOperand(0);
3822 return true;
3823 }
3824 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3825 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3826 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3827 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3828 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3829 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3830 }
3831 return false;
3832}
3833
3834// symbol+offset
3835bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3836 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3837 if (Addr.getOpcode() == ISD::ADD) {
3838 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3839 SDValue base = Addr.getOperand(0);
3840 if (SelectDirectAddr(base, Base)) {
3841 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3842 mvt);
3843 return true;
3844 }
3845 }
3846 }
3847 return false;
3848}
3849
3850// symbol+offset
3851bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3852 SDValue &Base, SDValue &Offset) {
3853 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3854}
3855
3856// symbol+offset
3857bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3858 SDValue &Base, SDValue &Offset) {
3859 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3860}
3861
3862// register+offset
3863bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3864 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3865 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3866 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3867 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3868 return true;
3869 }
3870 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3871 Addr.getOpcode() == ISD::TargetGlobalAddress)
3872 return false; // direct calls.
3873
3874 if (Addr.getOpcode() == ISD::ADD) {
3875 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3876 return false;
3877 }
3878 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3879 if (FrameIndexSDNode *FIN =
3880 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3881 // Constant offset from frame ref.
3882 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3883 else
3884 Base = Addr.getOperand(0);
3885
3886 // Offset must fit in a 32-bit signed int in PTX [register+offset] address
3887 // mode
3888 if (!CN->getAPIntValue().isSignedIntN(32))
3889 return false;
3890
3891 Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(OpNode),
3892 MVT::i32);
3893 return true;
3894 }
3895 }
3896 return false;
3897}
3898
3899// register+offset
3900bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3901 SDValue &Base, SDValue &Offset) {
3902 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3903}
3904
3905// register+offset
3906bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3907 SDValue &Base, SDValue &Offset) {
3908 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3909}
3910
3911bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3912 unsigned int spN) const {
3913 const Value *Src = nullptr;
3914 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3915 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3916 return true;
3917 Src = mN->getMemOperand()->getValue();
3918 }
3919 if (!Src)
3920 return false;
3921 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3922 return (PT->getAddressSpace() == spN);
3923 return false;
3924}
3925
3926/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3927/// inline asm expressions.
3928 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3929 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
3930 std::vector<SDValue> &OutOps) {
3931 SDValue Op0, Op1;
3932 switch (ConstraintID) {
3933 default:
3934 return true;
3935 case InlineAsm::ConstraintCode::m: // memory
3936 if (SelectDirectAddr(Op, Op0)) {
3937 OutOps.push_back(Op0);
3938 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3939 return false;
3940 }
3941 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3942 OutOps.push_back(Op0);
3943 OutOps.push_back(Op1);
3944 return false;
3945 }
3946 break;
3947 }
3948 return true;
3949}
3950
3951void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
3952 // Lower a CopyToReg with two 64-bit inputs
3953 // Dst:i128, lo:i64, hi:i64
3954 //
3955 // CopyToReg Dst, lo, hi;
3956 //
3957 // ==>
3958 //
3959 // tmp = V2I64toI128 {lo, hi};
3960 // CopyToReg Dst, tmp;
3961 SDValue Dst = N->getOperand(1);
3962 SDValue Lo = N->getOperand(2);
3963 SDValue Hi = N->getOperand(3);
3964
3965 SDLoc DL(N);
3966 SDNode *Mov =
3967 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
3968
3969 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
3970 NewOps[0] = N->getOperand(0);
3971 NewOps[1] = Dst;
3972 NewOps[2] = SDValue(Mov, 0);
3973 if (N->getNumOperands() == 5)
3974 NewOps[3] = N->getOperand(4);
3975 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
3976
3977 ReplaceNode(N, NewValue.getNode());
3978}
3979
3980void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
3981 // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
3982 // Dst:i128, Src:i128
3983 //
3984 // {lo, hi} = CopyFromReg Src
3985 //
3986 // ==>
3987 //
3988 // {lo, hi} = I128toV2I64 Src
3989 //
3990 SDValue Ch = N->getOperand(0);
3991 SDValue Src = N->getOperand(1);
3992 SDValue Glue = N->getOperand(2);
3993 SDLoc DL(N);
3994
3995 // Add Glue and Ch to the operands and results to avoid breaking the
3996 // execution order.
3997 SDNode *Mov = CurDAG->getMachineNode(
3998 NVPTX::I128toV2I64, DL,
3999 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
4000 {Src, Ch, Glue});
4001
4002 ReplaceNode(N, Mov);
4003}
4004
4005/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
4006/// conversion from \p SrcTy to \p DestTy.
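/// For example, GetConvertOpcode(MVT::i32, MVT::i16, Ld) returns
/// NVPTX::CVT_s32_s16 when \p Ld is a sign-extending load and
/// NVPTX::CVT_u32_u16 otherwise.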
4007unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
4008 LoadSDNode *LdNode) {
4009 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
4010 switch (SrcTy.SimpleTy) {
4011 default:
4012 llvm_unreachable("Unhandled source type");
4013 case MVT::i8:
4014 switch (DestTy.SimpleTy) {
4015 default:
4016 llvm_unreachable("Unhandled dest type");
4017 case MVT::i16:
4018 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
4019 case MVT::i32:
4020 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
4021 case MVT::i64:
4022 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
4023 }
4024 case MVT::i16:
4025 switch (DestTy.SimpleTy) {
4026 default:
4027 llvm_unreachable("Unhandled dest type");
4028 case MVT::i8:
4029 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
4030 case MVT::i32:
4031 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
4032 case MVT::i64:
4033 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
4034 }
4035 case MVT::i32:
4036 switch (DestTy.SimpleTy) {
4037 default:
4038 llvm_unreachable("Unhandled dest type");
4039 case MVT::i8:
4040 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
4041 case MVT::i16:
4042 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
4043 case MVT::i64:
4044 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
4045 }
4046 case MVT::i64:
4047 switch (DestTy.SimpleTy) {
4048 default:
4049 llvm_unreachable("Unhandled dest type");
4050 case MVT::i8:
4051 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
4052 case MVT::i16:
4053 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
4054 case MVT::i32:
4055 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
4056 }
4057 case MVT::f16:
4058 switch (DestTy.SimpleTy) {
4059 default:
4060 llvm_unreachable("Unhandled dest type");
4061 case MVT::f32:
4062 return NVPTX::CVT_f32_f16;
4063 case MVT::f64:
4064 return NVPTX::CVT_f64_f16;
4065 }
4066 }
4067}
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static const LLT F64
static const LLT F32
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
uint64_t Addr
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define getOpcodeForVectorStParam(n, ty, isimm)
static unsigned int getCodeAddrSpace(MemSDNode *N)
static int getLdStRegType(EVT VT)
static unsigned pickOpcodeForVectorStParam(SmallVector< SDValue, 8 > &Ops, unsigned NumElts, MVT::SimpleValueType MemTy, SelectionDAG *CurDAG, SDLoc DL)
#define getOpcodeForVectorStParamV2(ty, isimm)
static cl::opt< bool > EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden, cl::desc("Enable reciprocal sqrt optimization"))
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ)
static std::optional< unsigned > pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16, unsigned Opcode_i32, std::optional< unsigned > Opcode_i64, unsigned Opcode_f32, std::optional< unsigned > Opcode_f64)
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F)
if(VerifyEach)
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
#define PASS_NAME
Value * RHS
Value * LHS
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
This is an SDNode representing atomic operations.
const SDValue & getVal() const
const ConstantFP * getConstantFPValue() const
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:269
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
const ConstantInt * getConstantIntValue() const
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:410
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
Record instruction ordering so we can query their relative positions within a function.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
This class is used to represent ISD::LOAD nodes.
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Machine Value Type.
SimpleValueType SimpleTy
bool isVector() const
Return true if this is a vector value type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
A description of a memory reference used in the backend.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
EVT getMemoryVT() const
Return the type of the in-memory value.
NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
bool SelectInlineAsmMemoryOperand(const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector< SDValue > &OutOps) override
SelectInlineAsmMemoryOperand - Implement addressing mode selection for inline asm expressions.
const NVPTXSubtarget * Subtarget
const NVPTXTargetLowering * getTargetLowering() const override
bool hasRelaxedMMIO() const
bool hasMemoryOrdering() const
bool useF32FTZ(const MachineFunction &MF) const
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool allowUnsafeFPMath(MachineFunction &MF) const
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const SDValue & getOperand(unsigned Num) const
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
MachineFunction * MF
CodeGenOptLevel OptLevel
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
virtual bool runOnMachineFunction(MachineFunction &mf)
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:489
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:743
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT)
Definition: SelectionDAG.h:724
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:692
bool empty() const
Definition: SmallVector.h:95
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:697
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
This class is used to represent ISD::STORE nodes.
const SDValue & getValue() const
unsigned getPointerSizeInBits(unsigned AS) const
LLVM Value Representation.
Definition: Value.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1284
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1074
@ TargetExternalSymbol
Definition: ISDOpcodes.h:175
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1280
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ TargetGlobalAddress
TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or anything else with this node...
Definition: ISDOpcodes.h:170
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
Definition: ISDOpcodes.h:209
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:190
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:937
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
Definition: ISDOpcodes.h:198
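Selection routines dispatch on these generic opcodes before looking at operands; matching a small tree of them is the usual first step. A minimal sketch that recognizes the (and (srl X, C1), C2) shape a bit-field-extract pattern starts from (looksLikeBitfieldExtract is hypothetical):
#include "llvm/CodeGen/SelectionDAGNodes.h"
static bool looksLikeBitfieldExtract(llvm::SDValue Op) {
  if (Op.getOpcode() != llvm::ISD::AND)
    return false;
  llvm::SDValue Shift = Op.getOperand(0);
  return Shift.getOpcode() == llvm::ISD::SRL &&
         llvm::isa<llvm::ConstantSDNode>(Shift.getOperand(1)) &&
         llvm::isa<llvm::ConstantSDNode>(Op.getOperand(1));
}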
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1578
std::string OrderingToCString(Ordering Order)
@ RelaxedMMIO
Definition: NVPTX.h:124
@ Acquire
Definition: NVPTX.h:118
@ Relaxed
Definition: NVPTX.h:116
@ NotAtomic
Definition: NVPTX.h:113
@ Volatile
Definition: NVPTX.h:123
@ Release
Definition: NVPTX.h:119
@ SequentiallyConsistent
Definition: NVPTX.h:121
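These NVPTX::Ordering values name the PTX memory-ordering qualifiers (relaxed, acquire, release, and so on). A minimal sketch of how LLVM atomic orderings could be mapped onto them; this covers only an illustrative subset, is not the pass's actual lowering policy, and assumes NVPTX.h is the target-local header that defines the enum:
#include "NVPTX.h" // target-local header defining NVPTX::Ordering (assumption)
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/ErrorHandling.h"
static llvm::NVPTX::Ordering toPTXOrdering(llvm::AtomicOrdering AO) {
  using llvm::AtomicOrdering;
  using llvm::NVPTX::Ordering;
  switch (AO) {
  case AtomicOrdering::NotAtomic:              return Ordering::NotAtomic;
  case AtomicOrdering::Monotonic:              return Ordering::Relaxed;
  case AtomicOrdering::Acquire:                return Ordering::Acquire;
  case AtomicOrdering::Release:                return Ordering::Release;
  case AtomicOrdering::SequentiallyConsistent: return Ordering::SequentiallyConsistent;
  default:
    llvm_unreachable("ordering not handled in this sketch");
  }
}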
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool Isv2x16VT(EVT VT)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition: bit.h:307
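drop_begin and all_of combine nicely for operand checks such as "every non-chain operand is a constant". A minimal sketch (allNonChainOpsAreConstants is hypothetical and assumes operand 0 is the chain):
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
static bool allNonChainOpsAreConstants(const llvm::SDNode *N) {
  return llvm::all_of(llvm::drop_begin(N->ops()), [](const llvm::SDUse &U) {
    return llvm::isa<llvm::ConstantSDNode>(U.getNode());
  });
}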
@ ADDRESS_SPACE_GENERIC
Definition: NVPTXBaseInfo.h:22
@ ADDRESS_SPACE_LOCAL
Definition: NVPTXBaseInfo.h:26
@ ADDRESS_SPACE_CONST
Definition: NVPTXBaseInfo.h:25
@ ADDRESS_SPACE_GLOBAL
Definition: NVPTXBaseInfo.h:23
@ ADDRESS_SPACE_PARAM
Definition: NVPTXBaseInfo.h:29
@ ADDRESS_SPACE_SHARED
Definition: NVPTXBaseInfo.h:24
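These enumerators are the PTX state spaces a generic pointer can be cast into. A minimal sketch that checks one direction of an ISD::ADDRSPACECAST (isGenericToGlobalCast is hypothetical; the enumerators come from the target-local NVPTXBaseInfo.h listed above, and the include path is an assumption):
#include "MCTargetDesc/NVPTXBaseInfo.h" // target-local ADDRESS_SPACE_* enum
#include "llvm/CodeGen/SelectionDAGNodes.h"
static bool isGenericToGlobalCast(const llvm::SDNode *N) {
  const auto *ASC = llvm::dyn_cast<llvm::AddrSpaceCastSDNode>(N);
  return ASC && ASC->getSrcAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
         ASC->getDestAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL;
}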
auto formatv(const char *Fmt, Ts &&...Vals) -> formatv_object< decltype(std::make_tuple(support::detail::build_format_adapter(std::forward< Ts >(Vals))...))>
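formatv builds a string lazily from a format template, which is convenient for symbol or diagnostic names. A one-line sketch (makeSymName and its naming scheme are hypothetical):
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FormatVariadic.h"
#include <string>
static std::string makeSymName(unsigned Idx, llvm::StringRef Kind) {
  return llvm::formatv("surf_{0}_{1}", Idx, Kind).str();
}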
FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOptLevel OptLevel)
createNVPTXISelDag - This pass converts a legalized DAG into a NVPTX-specific DAG,...
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition: MathExtras.h:285
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit with the remainder zero (64 bit version).
Definition: MathExtras.h:273
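Together these predicates and counters recover the start/length pair a bit-field extract needs from a contiguous mask. A minimal sketch (maskToStartLen is hypothetical):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
static bool maskToStartLen(uint64_t Mask, unsigned &Start, unsigned &Len) {
  if (!llvm::isShiftedMask_64(Mask))
    return false;                        // not one contiguous run of ones
  Start = llvm::countr_zero(Mask);       // zeros below the run
  Len = llvm::countr_one(Mask >> Start); // length of the run itself
  return true;
}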
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
AtomicOrdering
Atomic ordering for LLVM's memory model.
bool isKernelFunction(const Function &F)
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions and return multiple objects.
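getUnderlyingObjects and isKernelFunction are the kind of IR-level queries a backend uses to prove where a pointer can point, for example "does this pointer only ever come from arguments of a kernel". A minimal sketch, not this file's exact heuristic (pointsOnlyToKernelArgs is hypothetical; isKernelFunction comes from the target-local NVPTXUtilities.h):
#include "NVPTXUtilities.h" // target-local: isKernelFunction
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
static bool pointsOnlyToKernelArgs(const llvm::Value *Ptr) {
  llvm::SmallVector<const llvm::Value *, 8> Objs;
  llvm::getUnderlyingObjects(Ptr, Objs);
  return !Objs.empty() && llvm::all_of(Objs, [](const llvm::Value *V) {
    const auto *Arg = llvm::dyn_cast<llvm::Argument>(V);
    return Arg && llvm::isKernelFunction(*Arg->getParent());
  });
}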
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
This represents a list of ValueType's that has been intern'd by a SelectionDAG.