//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOptLevel OptLevel) {
  return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
}

NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
                                                 CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}

char NVPTXDAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOptLevel OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > CodeGenOptLevel::None);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::SETP_BF16X2:
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  // ... (the remaining NVPTXISD::Tex*/Tld4* texture opcode labels are elided
  // from this listing) ...
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  // ... (the remaining NVPTXISD::Suld* surface opcode labels are elided from
  // this listing) ...
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  case ISD::CopyToReg: {
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
      return;
    }
    break;
  }
  case ISD::CopyFromReg: {
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
      return;
    }
    break;
  }
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
  SDNode *LoadConstF16 = CurDAG->getMachineNode(
      (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
                                      : NVPTX::LOAD_CONST_BF16),
      SDLoc(N), N->getValueType(0), Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}
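
// Illustrative sketch (not part of the upstream file): selecting the node
//   t1: f16 = ConstantFP<0.5>
// produces a LOAD_CONST_F16 machine node carrying 0.5 as a target constant,
// which is later printed as a 16-bit register move along the lines of
//   mov.b16 %rs1, 0x3800;
// (0x3800 is the IEEE half encoding of 0.5; exact register names vary).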

// Map ISD::CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode()
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}
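
// Example (illustrative): getPTXCmpMode for SETOLT with FTZ enabled returns
// CmpMode::LT | FTZ_FLAG, which printCmpMode() renders as "lt.ftz", while the
// unordered SETULT maps to CmpMode::LTU ("ltu"). Note that the ordered and
// ordering-agnostic variants (e.g. SETOLT vs. SETLT) deliberately share one
// CmpMode.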

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about 16x2 as it's the only real vector type we
  // need to deal with.
  MVT VT = Vector.getSimpleValueType();
  if (!Isv2x16VT(VT))
    return false;
  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  // Merge (f16 extractelt(V, 0), f16 extractelt(V, 1))
  // into f16,f16 SplitF16x2(V)
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}
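
// Illustrative sketch of the DAG rewrite this performs:
//   t1: v2f16 = ...
//   t2: f16 = extract_vector_elt t1, Constant:i64<0>
//   t3: f16 = extract_vector_elt t1, Constant:i64<1>
// becomes a single two-result machine node
//   t4, t5 = I32toV2I16 t1
// with all users of t2/t3 rewired to t4/t5, so the pair is split exactly once.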

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static unsigned int getCodeMemorySemantic(MemSDNode *N,
                                          const NVPTXSubtarget *Subtarget) {
  AtomicOrdering Ordering = N->getSuccessOrdering();
  auto CodeAddrSpace = getCodeAddrSpace(N);

  bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
  bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();

  // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
  // TODO: lowering for AcquireRelease Operations: for now, we error.
  //

  // clang-format off

  // Lowering for non-SequentiallyConsistent Operations
  //
  // | Atomic  | Volatile | Statespace         | PTX sm_60- | PTX sm_70+                   |
  // |---------|----------|--------------------|------------|------------------------------|
  // | No      | No       | All                | plain      | .weak                        |
  // | No      | Yes      | Generic,Shared,    | .volatile  | .volatile                    |
  // |         |          |   Global [0]       |            |                              |
  // | No      | Yes      | Local,Const,Param  | plain [1]  | .weak [1]                    |
  // | Unorder | Yes/No   | All                | == Relaxed | == Relaxed                   |
  // | Relaxed | No       | Generic,Shared,    | .volatile  | <atomic sem>                 |
  // |         |          |   Global [0]       |            |                              |
  // | Other   | No       | Generic,Shared,    | Error [2]  | <atomic sem>                 |
  // |         |          |   Global [0]       |            |                              |
  // | Yes     | No       | Local,Const,Param  | plain [1]  | .weak [1]                    |
  // | Relaxed | Yes      | Generic,Shared [0] | .volatile  | .volatile                    |
  // | Relaxed | Yes      | Global [0]         | .volatile  | .mmio.relaxed.sys (PTX 8.2+) |
  // |         |          |                    |            |  or .volatile (PTX 8.1-)     |
  // | Relaxed | Yes      | Local,Const,Param  | plain [1]  | .weak [1]                    |
  // | Other   | Yes      | Generic,Shared,    | Error [2]  | <atomic sem> [3]             |
  // |         |          |   Global [0]       |            |                              |

  // clang-format on

  // [0]: volatile and atomics are only supported on global or shared
  //      memory locations, accessed via generic/shared/global pointers.
  //      MMIO is only supported on global memory locations,
  //      accessed via generic/global pointers.
  //      TODO: Implement MMIO access via generic pointer to global.
  //      Currently implemented for global pointers only.

  // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
  //      PTX instructions fails to preserve their C++ side-effects.
  //
  //      Example (https://github.com/llvm/llvm-project/issues/62057):
  //
  //          void example() {
  //            std::atomic<bool> True = true;
  //            while (True.load(std::memory_order_relaxed));
  //          }
  //
  //      A C++ program that calls "example" is well-defined: the infinite loop
  //      performs an atomic operation. By lowering volatile/atomics to
  //      "weak" memory operations, we are transforming the above into:
  //
  //          void undefined_behavior() {
  //            bool True = true;
  //            while (True);
  //          }
  //
  //      which exhibits undefined behavior in both C++ and PTX.
  //
  //      Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
  //      behavior due to lack of Independent Forward Progress. Lowering these
  //      to weak memory operations in sm_60- is therefore fine.
  //
  //      TODO: lower atomic and volatile operations to memory locations
  //      in local, const, and param to two PTX instructions in sm_70+:
  //        - the "weak" memory instruction we are currently lowering to, and
  //        - some other instruction that preserves the side-effect, e.g.,
  //          a dead dummy volatile load.

  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
      CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
      CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
    return NVPTX::PTXLdStInstCode::NotAtomic;
  }

  // [2]: Atomics with Ordering different than Unordered or Relaxed are not
  //      supported on sm_60 and older; this includes volatile atomics.
  if (!(Ordering == AtomicOrdering::NotAtomic ||
        Ordering == AtomicOrdering::Unordered ||
        Ordering == AtomicOrdering::Monotonic) &&
      !HasMemoryOrdering) {
    SmallString<256> Msg;
    raw_svector_ostream OS(Msg);
    OS << "PTX does not support \"atomic\" for orderings different than "
          "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
       << toIRString(Ordering) << "\".";
    report_fatal_error(OS.str());
  }

  // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
  //      the volatile semantics and preserve the atomic ones.

  // PTX volatile and PTX atomics are not available for statespaces that differ
  // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
  // atomics is undefined if the generic address does not refer to a .global or
  // .shared memory location.
  bool AddrGenericOrGlobalOrShared =
      (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
       CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
       CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
  bool UseRelaxedMMIO =
      HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;

  switch (Ordering) {
  case AtomicOrdering::NotAtomic:
    return N->isVolatile() && AddrGenericOrGlobalOrShared
               ? NVPTX::PTXLdStInstCode::Volatile
               : NVPTX::PTXLdStInstCode::NotAtomic;
  case AtomicOrdering::Unordered:
    // We lower unordered in the exact same way as 'monotonic' to respect
    // LLVM IR atomicity requirements.
  case AtomicOrdering::Monotonic:
    if (N->isVolatile())
      return UseRelaxedMMIO                ? NVPTX::PTXLdStInstCode::RelaxedMMIO
             : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
                                           : NVPTX::PTXLdStInstCode::NotAtomic;
    else
      return HasMemoryOrdering             ? NVPTX::PTXLdStInstCode::Relaxed
             : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
                                           : NVPTX::PTXLdStInstCode::NotAtomic;
  case AtomicOrdering::Acquire:
    if (!N->readMem()) {
      SmallString<256> Msg;
      raw_svector_ostream OS(Msg);
      OS << "PTX only supports Acquire Ordering on reads: "
         << N->getOperationName();
      N->print(OS);
      report_fatal_error(OS.str());
    }
    return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
                                       : NVPTX::PTXLdStInstCode::NotAtomic;
  case AtomicOrdering::Release:
    if (!N->writeMem()) {
      SmallString<256> Msg;
      raw_svector_ostream OS(Msg);
      OS << "PTX only supports Release Ordering on writes: "
         << N->getOperationName();
      N->print(OS);
      report_fatal_error(OS.str());
    }
    return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
                                       : NVPTX::PTXLdStInstCode::NotAtomic;
  case AtomicOrdering::AcquireRelease: {
    SmallString<256> Msg;
    raw_svector_ostream OS(Msg);
    OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
       << N->getOperationName();
    N->print(OS);
    report_fatal_error(OS.str());
  }
  case AtomicOrdering::SequentiallyConsistent: {
    // TODO: support AcquireRelease and SequentiallyConsistent
    SmallString<256> Msg;
    raw_svector_ostream OS(Msg);
    OS << "NVPTX backend does not support AtomicOrdering \""
       << toIRString(Ordering) << "\" yet.";
    report_fatal_error(OS.str());
  }
  }

  llvm_unreachable("unexpected unhandled case");
}
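
// Example (assumed mapping, for illustration): an LLVM IR
//   %v = load atomic i32, ptr addrspace(1) %p monotonic, align 4
// reaches here with Ordering == Monotonic and a GLOBAL statespace. On sm_70+
// with PTX memory ordering available this returns Relaxed (printed roughly as
// "ld.relaxed.sys.global.u32"); on sm_60- it degrades to Volatile
// ("ld.volatile.global.u32"), per the table above.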

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}
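
// Example (illustrative): for a CUDA kernel such as
//   __global__ void k(const float *__restrict__ in, float *out);
// the parameter `in` is a noalias, read-only kernel argument, so loads through
// it pass the Argument check above and are selected as ld.global.nc (LDG)
// even without !invariant.load metadata.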

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_shared_6432
                                : NVPTX::cvta_shared_64)
                         : NVPTX::cvta_shared;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_const_6432
                                : NVPTX::cvta_const_64)
                         : NVPTX::cvta_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_local_6432
                                : NVPTX::cvta_local_64)
                         : NVPTX::cvta_local;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_shared_3264
                                : NVPTX::cvta_to_shared_64)
                         : NVPTX::cvta_to_shared;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_const_3264
                                : NVPTX::cvta_to_const_64)
                         : NVPTX::cvta_to_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_local_3264
                                : NVPTX::cvta_to_local_64)
                         : NVPTX::cvta_to_local;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
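
// Example (illustrative): on a 64-bit target,
//   %g = addrspacecast ptr addrspace(5) %p to ptr
// (local -> generic) selects cvta_local_64, i.e. "cvta.local.u64", while the
// inverse generic -> local cast selects cvta_to_local_64 ("cvta.to.local.u64").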

// Helper function template to reduce amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
  case MVT::bf16:
    return Opcode_i16;
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2i16:
  case MVT::v4i8:
    return Opcode_i32;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}
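
// Usage sketch (illustrative): packed 16-bit pairs and v4i8 move as one 32-bit
// word, so they reuse the i32 opcode:
//   std::optional<unsigned> Opc =
//       pickOpcodeForVT(MVT::v2f16, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
//                       NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
//                       NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
//   // *Opc == NVPTX::LD_i32_avar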

static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Memory Semantic Setting
  unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2f16/v2bf16/v2i16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(fromType, dl),
                     getI32Imm(fromTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(fromType, dl),
                     getI32Imm(fromTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(fromType, dl),
                     getI32Imm(fromTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(fromType, dl),
                     getI32Imm(fromTypeWidth, dl),
                     N1,
                     Chain};
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
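
// Example (illustrative): a plain i32 load from a global pointer on a 64-bit
// target typically takes the ADDRri64 path, producing LD_i32_ari_64 with
// operands {MemSem, AddrSpace, VecType, FromType, FromTypeWidth, Base, Offset,
// Chain}; the printer then emits something like
//   ld.global.u32 %r1, [%rd1];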

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Memory Semantic Setting
  unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have a ld.v8.x16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    VecType = NVPTX::PTXLdStInstCode::V2;
    FromTypeWidth = 32;
  }

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
                     getI32Imm(CodeAddrSpace, DL),
                     getI32Imm(VecType, DL),
                     getI32Imm(FromType, DL),
                     getI32Imm(FromTypeWidth, DL),
                     Addr,
                     Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
                     getI32Imm(CodeAddrSpace, DL),
                     getI32Imm(VecType, DL),
                     getI32Imm(FromType, DL),
                     getI32Imm(FromTypeWidth, DL),
                     Base,
                     Offset,
                     Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
                     getI32Imm(CodeAddrSpace, DL),
                     getI32Imm(VecType, DL),
                     getI32Imm(FromType, DL),
                     getI32Imm(FromTypeWidth, DL),
                     Base,
                     Offset,
                     Chain};

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
                     getI32Imm(CodeAddrSpace, DL),
                     getI32Imm(VecType, DL),
                     getI32Imm(FromType, DL),
                     getI32Imm(FromTypeWidth, DL),
                     Op1,
                     Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = N->getConstantOperandVal(1);
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;
  EVT OrigType = N->getValueType(0);

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // Vectors of 16-bit types are loaded/stored as multiples of v2x16 elements.
    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
        (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = OrigType;
      NumElts /= 2;
    } else if (OrigType == MVT::v4i8) {
      EltVT = OrigType;
      NumElts = 1;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::ATOMIC_LOAD:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::ATOMIC_LOAD:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::ATOMIC_LOAD:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::ATOMIC_LOAD:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::ATOMIC_LOAD:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (in this case, i8) and our types will not match (the node needs
  // to return an i32 in this case). Our LDG/LDU nodes do not support the
  // concept of sign-/zero-extension, so emulate it here by adding an explicit
  // CVT instruction. Ptxas should clean up any redundancies here.

  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT &&
      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

    // For each output value, apply the manual sign/zero-extension and make sure
    // all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
                                                           DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Memory Semantic Setting
  unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2x16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(CodeMemorySem, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(CodeMemorySem, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(CodeMemorySem, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(CodeMemorySem, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
2011
2012bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
2013 SDValue Chain = N->getOperand(0);
2014 SDValue Op1 = N->getOperand(1);
2016 std::optional<unsigned> Opcode;
2017 SDLoc DL(N);
2018 SDNode *ST;
2019 EVT EltVT = Op1.getValueType();
2020 MemSDNode *MemSD = cast<MemSDNode>(N);
2021 EVT StoreVT = MemSD->getMemoryVT();
2022
2023 // Address Space Setting
2024 unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
2025 if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
2026 report_fatal_error("Cannot store to pointer that points to constant "
2027 "memory space");
2028 }
2029 unsigned int PointerSize =
2031
2032 // Memory Semantic Setting
2033 unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
2034
2035 // Type Setting: toType + toTypeWidth
2036 // - for integer type, always use 'u'
2037 assert(StoreVT.isSimple() && "Store value is not simple");
2038 MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
2039 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
2040 unsigned ToType = getLdStRegType(ScalarVT);
2041
2043 SDValue N2;
2044 unsigned VecType;
2045
2046 switch (N->getOpcode()) {
2047 case NVPTXISD::StoreV2:
2049 StOps.push_back(N->getOperand(1));
2050 StOps.push_back(N->getOperand(2));
2051 N2 = N->getOperand(3);
2052 break;
2053 case NVPTXISD::StoreV4:
2055 StOps.push_back(N->getOperand(1));
2056 StOps.push_back(N->getOperand(2));
2057 StOps.push_back(N->getOperand(3));
2058 StOps.push_back(N->getOperand(4));
2059 N2 = N->getOperand(5);
2060 break;
2061 default:
2062 return false;
2063 }
2064
2065 // v8x16 is a special case. PTX doesn't have st.v8.x16
2066 // instruction. Instead, we split the vector into v2x16 chunks and
2067 // store them with st.v4.b32.
2068 if (Isv2x16VT(EltVT)) {
2069 assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
2070 EltVT = MVT::i32;
2072 ToTypeWidth = 32;
2073 }
2074
2075 StOps.push_back(getI32Imm(CodeMemorySem, DL));
2076 StOps.push_back(getI32Imm(CodeAddrSpace, DL));
2077 StOps.push_back(getI32Imm(VecType, DL));
2078 StOps.push_back(getI32Imm(ToType, DL));
2079 StOps.push_back(getI32Imm(ToTypeWidth, DL));
2080
2081 if (SelectDirectAddr(N2, Addr)) {
2082 switch (N->getOpcode()) {
2083 default:
2084 return false;
2085 case NVPTXISD::StoreV2:
2086 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2087 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
2088 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
2089 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
2090 break;
2091 case NVPTXISD::StoreV4:
2092 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2093 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
2094 NVPTX::STV_i32_v4_avar, std::nullopt,
2095 NVPTX::STV_f32_v4_avar, std::nullopt);
2096 break;
2097 }
2098 StOps.push_back(Addr);
2099 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
2100 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
2101 switch (N->getOpcode()) {
2102 default:
2103 return false;
2104 case NVPTXISD::StoreV2:
2105 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2106 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
2107 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
2108 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
2109 break;
2110 case NVPTXISD::StoreV4:
2111 Opcode =
2112 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
2113 NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
2114 std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
2115 break;
2116 }
2117 StOps.push_back(Base);
2118 StOps.push_back(Offset);
2119 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
2120 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
2121 if (PointerSize == 64) {
2122 switch (N->getOpcode()) {
2123 default:
2124 return false;
2125 case NVPTXISD::StoreV2:
2126 Opcode =
2128 NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
2129 NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
2130 NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
2131 break;
2132 case NVPTXISD::StoreV4:
2133 Opcode = pickOpcodeForVT(
2134 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2135 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
2136 NVPTX::STV_f32_v4_ari_64, std::nullopt);
2137 break;
2138 }
2139 } else {
2140 switch (N->getOpcode()) {
2141 default:
2142 return false;
2143 case NVPTXISD::StoreV2:
2144 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2145 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2146 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2147 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2148 break;
2149 case NVPTXISD::StoreV4:
2150 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2151 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
2152 NVPTX::STV_i32_v4_ari, std::nullopt,
2153 NVPTX::STV_f32_v4_ari, std::nullopt);
2154 break;
2155 }
2156 }
2157 StOps.push_back(Base);
2158 StOps.push_back(Offset);
2159 } else {
2160 if (PointerSize == 64) {
2161 switch (N->getOpcode()) {
2162 default:
2163 return false;
2164 case NVPTXISD::StoreV2:
2165 Opcode = pickOpcodeForVT(
2166 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2167 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2168 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2169 NVPTX::STV_f64_v2_areg_64);
2170 break;
2171 case NVPTXISD::StoreV4:
2172 Opcode = pickOpcodeForVT(
2173 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2174 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
2175 NVPTX::STV_f32_v4_areg_64, std::nullopt);
2176 break;
2177 }
2178 } else {
2179 switch (N->getOpcode()) {
2180 default:
2181 return false;
2182 case NVPTXISD::StoreV2:
2183 Opcode =
2184 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2185 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2186 NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
2187 NVPTX::STV_f64_v2_areg);
2188 break;
2189 case NVPTXISD::StoreV4:
2190 Opcode =
2191 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2192 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
2193 std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
2194 break;
2195 }
2196 }
2197 StOps.push_back(N2);
2198 }
2199
2200 if (!Opcode)
2201 return false;
2202
2203 StOps.push_back(Chain);
2204
2205 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
2206
2207 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2208 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2209
2210 ReplaceNode(N, ST);
2211 return true;
2212}
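// Illustrative note (not part of the upstream file): for a NVPTXISD::StoreV2
// of two f32 values through a global pointer, the path above selects one of
// the STV_f32_v2_avar/asi/ari/areg opcodes and the final PTX is roughly:
//   st.global.v2.f32 [addr], {%f1, %f2};
// StOps is ordered as: value operands, memory semantics, address-space code,
// vector arity, value type, type width, address operands, then the chain.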
2213
2214bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2215 SDValue Chain = Node->getOperand(0);
2216 SDValue Offset = Node->getOperand(2);
2217 SDValue Glue = Node->getOperand(3);
2218 SDLoc DL(Node);
2219 MemSDNode *Mem = cast<MemSDNode>(Node);
2220
2221 unsigned VecSize;
2222 switch (Node->getOpcode()) {
2223 default:
2224 return false;
2225 case NVPTXISD::LoadParam:
2226 VecSize = 1;
2227 break;
2228 case NVPTXISD::LoadParamV2:
2229 VecSize = 2;
2230 break;
2231 case NVPTXISD::LoadParamV4:
2232 VecSize = 4;
2233 break;
2234 }
2235
2236 EVT EltVT = Node->getValueType(0);
2237 EVT MemVT = Mem->getMemoryVT();
2238
2239 std::optional<unsigned> Opcode;
2240
2241 switch (VecSize) {
2242 default:
2243 return false;
2244 case 1:
2245 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2246 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2247 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2248 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2249 break;
2250 case 2:
2251 Opcode =
2252 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2253 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2254 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
2255 NVPTX::LoadParamMemV2F64);
2256 break;
2257 case 4:
2258 Opcode =
2259 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2260 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
2261 std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
2262 break;
2263 }
2264 if (!Opcode)
2265 return false;
2266
2267 SDVTList VTs;
2268 if (VecSize == 1) {
2269 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2270 } else if (VecSize == 2) {
2271 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2272 } else {
2273 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2274 VTs = CurDAG->getVTList(EVTs);
2275 }
2276
2277 unsigned OffsetVal = Offset->getAsZExtVal();
2278
2279 SmallVector<SDValue, 2> Ops;
2280 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2281 Ops.push_back(Chain);
2282 Ops.push_back(Glue);
2283
2284 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2285 return true;
2286}
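// Illustrative note: a NVPTXISD::LoadParamV2 of two i32 results at byte
// offset 8 selects LoadParamMemV2I32, which prints as something like
//   ld.param.v2.b32 {%r1, %r2}, [retval0+8];
// (register and symbol names here are hypothetical; they depend on how the
// surrounding call sequence was lowered).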
2287
2288bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2289 SDLoc DL(N);
2290 SDValue Chain = N->getOperand(0);
2291 SDValue Offset = N->getOperand(1);
2292 unsigned OffsetVal = Offset->getAsZExtVal();
2293 MemSDNode *Mem = cast<MemSDNode>(N);
2294
2295 // How many elements do we have?
2296 unsigned NumElts = 1;
2297 switch (N->getOpcode()) {
2298 default:
2299 return false;
2300 case NVPTXISD::StoreRetval:
2301 NumElts = 1;
2302 break;
2303 case NVPTXISD::StoreRetvalV2:
2304 NumElts = 2;
2305 break;
2306 case NVPTXISD::StoreRetvalV4:
2307 NumElts = 4;
2308 break;
2309 }
2310
2311 // Build vector of operands
2312 SmallVector<SDValue, 6> Ops;
2313 for (unsigned i = 0; i < NumElts; ++i)
2314 Ops.push_back(N->getOperand(i + 2));
2315 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2316 Ops.push_back(Chain);
2317
2318 // Determine target opcode
2319 // If we have an i1, use an 8-bit store. The lowering code in
2320 // NVPTXISelLowering will have already emitted an upcast.
2321 std::optional<unsigned> Opcode = 0;
2322 switch (NumElts) {
2323 default:
2324 return false;
2325 case 1:
2326 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2327 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2328 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2329 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2330 if (Opcode == NVPTX::StoreRetvalI8) {
2331 // Fine tune the opcode depending on the size of the operand.
2332 // This helps to avoid creating redundant COPY instructions in
2333 // InstrEmitter::AddRegisterOperand().
2334 switch (Ops[0].getSimpleValueType().SimpleTy) {
2335 default:
2336 break;
2337 case MVT::i32:
2338 Opcode = NVPTX::StoreRetvalI8TruncI32;
2339 break;
2340 case MVT::i64:
2341 Opcode = NVPTX::StoreRetvalI8TruncI64;
2342 break;
2343 }
2344 }
2345 break;
2346 case 2:
2347 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2348 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2349 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2350 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2351 break;
2352 case 4:
2353 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2354 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2355 NVPTX::StoreRetvalV4I32, std::nullopt,
2356 NVPTX::StoreRetvalV4F32, std::nullopt);
2357 break;
2358 }
2359 if (!Opcode)
2360 return false;
2361
2362 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2363 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2364 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2365
2366 ReplaceNode(N, Ret);
2367 return true;
2368}
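// Illustrative note: a NVPTXISD::StoreRetvalV2 of two i32 values at offset 0
// selects StoreRetvalV2I32, printing roughly as
//   st.param.v2.b32 [func_retval0+0], {%r1, %r2};
// The StoreRetvalI8TruncI32/I64 variants let an i8 result be stored straight
// out of a wider register, avoiding the extra COPY mentioned above.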
2369
2370// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
2371#define getOpcV2H(ty, opKind0, opKind1) \
2372 NVPTX::StoreParamV2##ty##_##opKind0##opKind1
2373
2374#define getOpcV2H1(ty, opKind0, isImm1) \
2375 (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
2376
2377#define getOpcodeForVectorStParamV2(ty, isimm) \
2378 (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
2379
2380#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
2381 NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
2382
2383#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
2384 (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
2385 : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
2386
2387#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
2388 (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
2389 : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
2390
2391#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
2392 (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
2393 : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
2394
2395#define getOpcodeForVectorStParamV4(ty, isimm) \
2396 (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
2397 : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
2398
2399#define getOpcodeForVectorStParam(n, ty, isimm) \
2400 (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
2401 : getOpcodeForVectorStParamV4(ty, isimm)
2402
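// Worked expansion (for illustration): with isimm = {true, false},
//   getOpcodeForVectorStParamV2(I32, isimm)
// resolves via getOpcV2H1(I32, i, false) -> getOpcV2H(I32, i, r) to
// NVPTX::StoreParamV2I32_ir, i.e. first operand immediate, second in a
// register. An all-register f32 V4 store resolves the same way to
// NVPTX::StoreParamV4F32_rrrr.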
2403 static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
2404 unsigned NumElts,
2405 MVT::SimpleValueType MemTy,
2406 SelectionDAG *CurDAG, SDLoc DL) {
2407 // Determine which inputs are registers and which are immediates; rebuild
2408 // the immediate operands as target constants
2409 SmallVector<bool, 4> IsImm(NumElts, false);
2410 for (unsigned i = 0; i < NumElts; i++) {
2411 IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
2412 if (IsImm[i]) {
2413 SDValue Imm = Ops[i];
2414 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2415 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2416 const ConstantFP *CF = ConstImm->getConstantFPValue();
2417 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2418 } else {
2419 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2420 const ConstantInt *CI = ConstImm->getConstantIntValue();
2421 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2422 }
2423 Ops[i] = Imm;
2424 }
2425 }
2426
2427 // Get opcode for MemTy, size, and register/immediate operand ordering
2428 switch (MemTy) {
2429 case MVT::i8:
2430 return getOpcodeForVectorStParam(NumElts, I8, IsImm);
2431 case MVT::i16:
2432 return getOpcodeForVectorStParam(NumElts, I16, IsImm);
2433 case MVT::i32:
2434 return getOpcodeForVectorStParam(NumElts, I32, IsImm);
2435 case MVT::i64:
2436 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2437 return getOpcodeForVectorStParamV2(I64, IsImm);
2438 case MVT::f32:
2439 return getOpcodeForVectorStParam(NumElts, F32, IsImm);
2440 case MVT::f64:
2441 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2442 return getOpcodeForVectorStParamV2(F64, IsImm);
2443
2444 // These cases don't support immediates, just use the all register version
2445 // and generate moves.
2446 case MVT::i1:
2447 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
2448 : NVPTX::StoreParamV4I8_rrrr;
2449 case MVT::f16:
2450 case MVT::bf16:
2451 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
2452 : NVPTX::StoreParamV4I16_rrrr;
2453 case MVT::v2f16:
2454 case MVT::v2bf16:
2455 case MVT::v2i16:
2456 case MVT::v4i8:
2457 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
2458 : NVPTX::StoreParamV4I32_rrrr;
2459 default:
2460 llvm_unreachable("Cannot select st.param for unknown MemTy");
2461 }
2462}
2463
2464bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2465 SDLoc DL(N);
2466 SDValue Chain = N->getOperand(0);
2467 SDValue Param = N->getOperand(1);
2468 unsigned ParamVal = Param->getAsZExtVal();
2469 SDValue Offset = N->getOperand(2);
2470 unsigned OffsetVal = Offset->getAsZExtVal();
2471 MemSDNode *Mem = cast<MemSDNode>(N);
2472 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2473
2474 // How many elements do we have?
2475 unsigned NumElts;
2476 switch (N->getOpcode()) {
2477 default:
2478 llvm_unreachable("Unexpected opcode");
2479 case NVPTXISD::StoreParamU32:
2480 case NVPTXISD::StoreParamS32:
2481 case NVPTXISD::StoreParam:
2482 NumElts = 1;
2483 break;
2484 case NVPTXISD::StoreParamV2:
2485 NumElts = 2;
2486 break;
2487 case NVPTXISD::StoreParamV4:
2488 NumElts = 4;
2489 break;
2490 }
2491
2492 // Build vector of operands
2493 SmallVector<SDValue, 8> Ops;
2494 for (unsigned i = 0; i < NumElts; ++i)
2495 Ops.push_back(N->getOperand(i + 3));
2496 Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2497 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2498 Ops.push_back(Chain);
2499 Ops.push_back(Glue);
2500
2501 // Determine target opcode
2502 // If we have an i1, use an 8-bit store. The lowering code in
2503 // NVPTXISelLowering will have already emitted an upcast.
2504 std::optional<unsigned> Opcode;
2505 switch (N->getOpcode()) {
2506 default:
2507 switch (NumElts) {
2508 default:
2509 llvm_unreachable("Unexpected NumElts");
2510 case 1: {
2511 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2512 SDValue Imm = Ops[0];
2513 if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
2514 (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
2515 // Convert immediate to target constant
2516 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2517 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2518 const ConstantFP *CF = ConstImm->getConstantFPValue();
2519 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2520 } else {
2521 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2522 const ConstantInt *CI = ConstImm->getConstantIntValue();
2523 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2524 }
2525 Ops[0] = Imm;
2526 // Use immediate version of store param
2527 Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
2528 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
2529 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
2530 NVPTX::StoreParamF64_i);
2531 } else
2532 Opcode =
2533 pickOpcodeForVT(MemTy,
2534 NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
2535 NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
2536 NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
2537 if (Opcode == NVPTX::StoreParamI8_r) {
2538 // Fine tune the opcode depending on the size of the operand.
2539 // This helps to avoid creating redundant COPY instructions in
2540 // InstrEmitter::AddRegisterOperand().
2541 switch (Ops[0].getSimpleValueType().SimpleTy) {
2542 default:
2543 break;
2544 case MVT::i32:
2545 Opcode = NVPTX::StoreParamI8TruncI32_r;
2546 break;
2547 case MVT::i64:
2548 Opcode = NVPTX::StoreParamI8TruncI64_r;
2549 break;
2550 }
2551 }
2552 break;
2553 }
2554 case 2:
2555 case 4: {
2556 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2557 Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
2558 break;
2559 }
2560 }
2561 break;
2562 // Special case: if we have a sign-extend/zero-extend node, insert the
2563 // conversion instruction first, and use that as the value operand to
2564 // the selected StoreParam node.
2565 case NVPTXISD::StoreParamU32: {
2566 Opcode = NVPTX::StoreParamI32_r;
2567 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2568 MVT::i32);
2569 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2570 MVT::i32, Ops[0], CvtNone);
2571 Ops[0] = SDValue(Cvt, 0);
2572 break;
2573 }
2574 case NVPTXISD::StoreParamS32: {
2575 Opcode = NVPTX::StoreParamI32_r;
2576 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2577 MVT::i32);
2578 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2579 MVT::i32, Ops[0], CvtNone);
2580 Ops[0] = SDValue(Cvt, 0);
2581 break;
2582 }
2583 }
2584
2585 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2586 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2587 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2588 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2589
2590 ReplaceNode(N, Ret);
2591 return true;
2592}
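// Illustrative note: a single i32 argument becomes StoreParamI32_r (or the
// _i form when the operand is a constant), printing roughly as
//   st.param.b32 [param0+0], %r1;
// For StoreParamU32/S32 the cvt.u32.u16 / cvt.s32.s16 emitted above widens
// the i16 value first, so the store itself is always 32 bits wide.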
2593
2594bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2595 unsigned Opc = 0;
2596
2597 switch (N->getOpcode()) {
2598 default: return false;
2600 Opc = NVPTX::TEX_1D_F32_S32_RR;
2601 break;
2603 Opc = NVPTX::TEX_1D_F32_F32_RR;
2604 break;
2606 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2607 break;
2609 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2610 break;
2612 Opc = NVPTX::TEX_1D_S32_S32_RR;
2613 break;
2615 Opc = NVPTX::TEX_1D_S32_F32_RR;
2616 break;
2618 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2619 break;
2621 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2622 break;
2624 Opc = NVPTX::TEX_1D_U32_S32_RR;
2625 break;
2627 Opc = NVPTX::TEX_1D_U32_F32_RR;
2628 break;
2630 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2631 break;
2633 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2634 break;
2636 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2637 break;
2639 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2640 break;
2642 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2643 break;
2645 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2646 break;
2648 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2649 break;
2651 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2652 break;
2654 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2655 break;
2657 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2658 break;
2660 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2661 break;
2663 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2664 break;
2666 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2667 break;
2669 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2670 break;
2672 Opc = NVPTX::TEX_2D_F32_S32_RR;
2673 break;
2675 Opc = NVPTX::TEX_2D_F32_F32_RR;
2676 break;
2678 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2679 break;
2681 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2682 break;
2684 Opc = NVPTX::TEX_2D_S32_S32_RR;
2685 break;
2687 Opc = NVPTX::TEX_2D_S32_F32_RR;
2688 break;
2690 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2691 break;
2693 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2694 break;
2696 Opc = NVPTX::TEX_2D_U32_S32_RR;
2697 break;
2699 Opc = NVPTX::TEX_2D_U32_F32_RR;
2700 break;
2702 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2703 break;
2705 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2706 break;
2708 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2709 break;
2711 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2712 break;
2714 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2715 break;
2717 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2718 break;
2720 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2721 break;
2723 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2724 break;
2726 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2727 break;
2729 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2730 break;
2732 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2733 break;
2735 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2736 break;
2738 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2739 break;
2741 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2742 break;
2744 Opc = NVPTX::TEX_3D_F32_S32_RR;
2745 break;
2747 Opc = NVPTX::TEX_3D_F32_F32_RR;
2748 break;
2750 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2751 break;
2753 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2754 break;
2756 Opc = NVPTX::TEX_3D_S32_S32_RR;
2757 break;
2759 Opc = NVPTX::TEX_3D_S32_F32_RR;
2760 break;
2762 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2763 break;
2765 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2766 break;
2768 Opc = NVPTX::TEX_3D_U32_S32_RR;
2769 break;
2771 Opc = NVPTX::TEX_3D_U32_F32_RR;
2772 break;
2774 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2775 break;
2777 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2778 break;
2780 Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2781 break;
2783 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2784 break;
2786 Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2787 break;
2789 Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2790 break;
2792 Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2793 break;
2795 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2796 break;
2798 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2799 break;
2801 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2802 break;
2804 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2805 break;
2807 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2808 break;
2810 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2811 break;
2813 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2814 break;
2816 Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2817 break;
2819 Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2820 break;
2822 Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2823 break;
2825 Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2826 break;
2828 Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2829 break;
2831 Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2832 break;
2834 Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2835 break;
2837 Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2838 break;
2840 Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2841 break;
2843 Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2844 break;
2846 Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2847 break;
2849 Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2850 break;
2852 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2853 break;
2855 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2856 break;
2858 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2859 break;
2861 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2862 break;
2864 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2865 break;
2867 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2868 break;
2870 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2871 break;
2873 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2874 break;
2876 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2877 break;
2879 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2880 break;
2882 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2883 break;
2885 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2886 break;
2888 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2889 break;
2891 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2892 break;
2894 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2895 break;
2897 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2898 break;
2900 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2901 break;
2903 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2904 break;
2906 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2907 break;
2909 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2910 break;
2912 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2913 break;
2915 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2916 break;
2918 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2919 break;
2921 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2922 break;
2924 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2925 break;
2927 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2928 break;
2930 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2931 break;
2933 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2934 break;
2936 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2937 break;
2939 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2940 break;
2942 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2943 break;
2945 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2946 break;
2948 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2949 break;
2951 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2952 break;
2954 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2955 break;
2957 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2958 break;
2960 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2961 break;
2963 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2964 break;
2966 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2967 break;
2969 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2970 break;
2972 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2973 break;
2975 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2976 break;
2978 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2979 break;
2981 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2982 break;
2984 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2985 break;
2987 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2988 break;
2990 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2991 break;
2993 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2994 break;
2996 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2997 break;
2999 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
3000 break;
3002 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
3003 break;
3005 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
3006 break;
3008 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
3009 break;
3011 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
3012 break;
3014 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
3015 break;
3017 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
3018 break;
3020 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
3021 break;
3023 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
3024 break;
3026 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
3027 break;
3029 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
3030 break;
3032 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
3033 break;
3035 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
3036 break;
3038 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
3039 break;
3041 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
3042 break;
3044 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
3045 break;
3047 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
3048 break;
3050 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
3051 break;
3053 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
3054 break;
3056 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
3057 break;
3059 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
3060 break;
3062 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
3063 break;
3065 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
3066 break;
3068 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
3069 break;
3071 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
3072 break;
3074 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
3075 break;
3077 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
3078 break;
3080 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
3081 break;
3083 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
3084 break;
3086 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
3087 break;
3089 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
3090 break;
3092 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
3093 break;
3095 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
3096 break;
3098 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
3099 break;
3101 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
3102 break;
3104 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R;
3105 break;
3107 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R;
3108 break;
3110 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R;
3111 break;
3113 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R;
3114 break;
3116 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R;
3117 break;
3119 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R;
3120 break;
3121 }
3122
3123 // Copy over operands
3125 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3126
3127 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3128 return true;
3129}
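// Illustrative note: the table above is a 1:1 mapping from NVPTXISD texture
// opcodes to machine opcodes; e.g. the first case pairs
// NVPTXISD::Tex1DFloatS32 with TEX_1D_F32_S32_RR, which prints roughly as
//   tex.1d.v4.f32.s32 {%f0, %f1, %f2, %f3}, [%rd_tex, %rd_samp, {%r0}];
// The _RR forms take texture and sampler handles in registers; the UNIFIED
// _R forms are used when the target textures in unified mode.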
3130
3131bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
3132 unsigned Opc = 0;
3133 switch (N->getOpcode()) {
3134 default: return false;
3136 Opc = NVPTX::SULD_1D_I8_CLAMP_R;
3137 break;
3139 Opc = NVPTX::SULD_1D_I16_CLAMP_R;
3140 break;
3142 Opc = NVPTX::SULD_1D_I32_CLAMP_R;
3143 break;
3145 Opc = NVPTX::SULD_1D_I64_CLAMP_R;
3146 break;
3148 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
3149 break;
3151 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
3152 break;
3154 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
3155 break;
3157 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
3158 break;
3160 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
3161 break;
3163 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
3164 break;
3166 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
3167 break;
3169 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
3170 break;
3172 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
3173 break;
3175 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
3176 break;
3178 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
3179 break;
3181 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
3182 break;
3184 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
3185 break;
3187 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
3188 break;
3190 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
3191 break;
3193 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
3194 break;
3196 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
3197 break;
3199 Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
3200 break;
3202 Opc = NVPTX::SULD_2D_I8_CLAMP_R;
3203 break;
3205 Opc = NVPTX::SULD_2D_I16_CLAMP_R;
3206 break;
3208 Opc = NVPTX::SULD_2D_I32_CLAMP_R;
3209 break;
3211 Opc = NVPTX::SULD_2D_I64_CLAMP_R;
3212 break;
3214 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
3215 break;
3217 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
3218 break;
3220 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
3221 break;
3223 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
3224 break;
3226 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
3227 break;
3229 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
3230 break;
3232 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
3233 break;
3235 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
3236 break;
3238 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
3239 break;
3241 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
3242 break;
3244 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
3245 break;
3247 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
3248 break;
3250 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
3251 break;
3253 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
3254 break;
3256 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
3257 break;
3259 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
3260 break;
3262 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
3263 break;
3265 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
3266 break;
3268 Opc = NVPTX::SULD_3D_I8_CLAMP_R;
3269 break;
3271 Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3272 break;
3274 Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3275 break;
3277 Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3278 break;
3280 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3281 break;
3283 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3284 break;
3286 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3287 break;
3289 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3290 break;
3292 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3293 break;
3295 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3296 break;
3298 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3299 break;
3301 Opc = NVPTX::SULD_1D_I8_TRAP_R;
3302 break;
3304 Opc = NVPTX::SULD_1D_I16_TRAP_R;
3305 break;
3307 Opc = NVPTX::SULD_1D_I32_TRAP_R;
3308 break;
3310 Opc = NVPTX::SULD_1D_I64_TRAP_R;
3311 break;
3313 Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3314 break;
3316 Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3317 break;
3319 Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3320 break;
3322 Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3323 break;
3325 Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3326 break;
3328 Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3329 break;
3331 Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3332 break;
3334 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3335 break;
3337 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3338 break;
3340 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3341 break;
3343 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3344 break;
3346 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3347 break;
3349 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3350 break;
3352 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3353 break;
3355 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3356 break;
3358 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3359 break;
3361 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3362 break;
3364 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3365 break;
3367 Opc = NVPTX::SULD_2D_I8_TRAP_R;
3368 break;
3370 Opc = NVPTX::SULD_2D_I16_TRAP_R;
3371 break;
3373 Opc = NVPTX::SULD_2D_I32_TRAP_R;
3374 break;
3376 Opc = NVPTX::SULD_2D_I64_TRAP_R;
3377 break;
3379 Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3380 break;
3382 Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3383 break;
3385 Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3386 break;
3388 Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3389 break;
3391 Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3392 break;
3394 Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3395 break;
3397 Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3398 break;
3400 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3401 break;
3403 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3404 break;
3406 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3407 break;
3409 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3410 break;
3412 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3413 break;
3415 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3416 break;
3418 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3419 break;
3421 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3422 break;
3424 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3425 break;
3427 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3428 break;
3430 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3431 break;
3433 Opc = NVPTX::SULD_3D_I8_TRAP_R;
3434 break;
3436 Opc = NVPTX::SULD_3D_I16_TRAP_R;
3437 break;
3439 Opc = NVPTX::SULD_3D_I32_TRAP_R;
3440 break;
3442 Opc = NVPTX::SULD_3D_I64_TRAP_R;
3443 break;
3445 Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3446 break;
3448 Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3449 break;
3451 Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3452 break;
3454 Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3455 break;
3457 Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3458 break;
3460 Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3461 break;
3463 Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3464 break;
3466 Opc = NVPTX::SULD_1D_I8_ZERO_R;
3467 break;
3469 Opc = NVPTX::SULD_1D_I16_ZERO_R;
3470 break;
3472 Opc = NVPTX::SULD_1D_I32_ZERO_R;
3473 break;
3475 Opc = NVPTX::SULD_1D_I64_ZERO_R;
3476 break;
3478 Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3479 break;
3481 Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3482 break;
3484 Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3485 break;
3487 Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3488 break;
3490 Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3491 break;
3493 Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3494 break;
3496 Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3497 break;
3499 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3500 break;
3502 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3503 break;
3505 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3506 break;
3508 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3509 break;
3511 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3512 break;
3514 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3515 break;
3517 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3518 break;
3520 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3521 break;
3523 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3524 break;
3526 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3527 break;
3529 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3530 break;
3532 Opc = NVPTX::SULD_2D_I8_ZERO_R;
3533 break;
3535 Opc = NVPTX::SULD_2D_I16_ZERO_R;
3536 break;
3538 Opc = NVPTX::SULD_2D_I32_ZERO_R;
3539 break;
3541 Opc = NVPTX::SULD_2D_I64_ZERO_R;
3542 break;
3544 Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3545 break;
3547 Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3548 break;
3550 Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3551 break;
3553 Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3554 break;
3556 Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3557 break;
3559 Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3560 break;
3562 Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3563 break;
3565 Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3566 break;
3568 Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3569 break;
3571 Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3572 break;
3574 Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3575 break;
3577 Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3578 break;
3580 Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3581 break;
3583 Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3584 break;
3586 Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3587 break;
3589 Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3590 break;
3592 Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3593 break;
3595 Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3596 break;
3598 Opc = NVPTX::SULD_3D_I8_ZERO_R;
3599 break;
3601 Opc = NVPTX::SULD_3D_I16_ZERO_R;
3602 break;
3604 Opc = NVPTX::SULD_3D_I32_ZERO_R;
3605 break;
3607 Opc = NVPTX::SULD_3D_I64_ZERO_R;
3608 break;
3610 Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3611 break;
3613 Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3614 break;
3616 Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3617 break;
3619 Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3620 break;
3622 Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3623 break;
3625 Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3626 break;
3628 Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3629 break;
3630 }
3631
3632 // Copy over operands
3634 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3635
3636 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3637 return true;
3638}
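// Illustrative note: likewise a 1:1 table; e.g. NVPTXISD::Suld1DI32Clamp maps
// to SULD_1D_I32_CLAMP_R, printing roughly as
//   suld.b.1d.b32.clamp %r0, [%rd_surf, {%r1}];
// The CLAMP/TRAP/ZERO families select the PTX out-of-range handling mode
// baked into the intrinsic name.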
3639
3640
3641 /// tryBFE - Look for instruction sequences that can be made more efficient
3642 /// by using the 'bfe' (bit-field extract) PTX instruction
3643bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3644 SDLoc DL(N);
3645 SDValue LHS = N->getOperand(0);
3646 SDValue RHS = N->getOperand(1);
3647 SDValue Len;
3648 SDValue Start;
3649 SDValue Val;
3650 bool IsSigned = false;
3651
3652 if (N->getOpcode() == ISD::AND) {
3653 // Canonicalize the operands
3654 // We want 'and %val, %mask'
3655 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3656 std::swap(LHS, RHS);
3657 }
3658
3659 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3660 if (!Mask) {
3661 // We need a constant mask on the RHS of the AND
3662 return false;
3663 }
3664
3665 // Extract the mask bits
3666 uint64_t MaskVal = Mask->getZExtValue();
3667 if (!isMask_64(MaskVal)) {
3668 // We *could* handle shifted masks here, but doing so would require an
3669 // 'and' operation to fix up the low-order bits so we would trade
3670 // shr+and for bfe+and, which has the same throughput
3671 return false;
3672 }
3673
3674 // How many bits are in our mask?
3675 int64_t NumBits = countr_one(MaskVal);
3676 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3677
3678 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3679 // We have a 'srl/and' pair, extract the effective start bit and length
3680 Val = LHS.getNode()->getOperand(0);
3681 Start = LHS.getNode()->getOperand(1);
3682 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3683 if (StartConst) {
3684 uint64_t StartVal = StartConst->getZExtValue();
3685 // How many "good" bits do we have left? "good" is defined here as bits
3686 // that exist in the original value, not shifted in.
3687 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3688 if (NumBits > GoodBits) {
3689 // Do not handle the case where bits have been shifted in. In theory
3690 // we could handle this, but the cost is likely higher than just
3691 // emitting the srl/and pair.
3692 return false;
3693 }
3694 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3695 } else {
3696 // Do not handle the case where the shift amount (can be zero if no srl
3697 // was found) is not constant. We could handle this case, but it would
3698 // require run-time logic that would be more expensive than just
3699 // emitting the srl/and pair.
3700 return false;
3701 }
3702 } else {
3703 // Do not handle the case where the LHS of the and is not a shift. While
3704 // it would be trivial to handle this case, it would just transform
3705 // 'and' -> 'bfe', but 'and' has higher throughput.
3706 return false;
3707 }
3708 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3709 if (LHS->getOpcode() == ISD::AND) {
3710 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3711 if (!ShiftCnst) {
3712 // Shift amount must be constant
3713 return false;
3714 }
3715
3716 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3717
3718 SDValue AndLHS = LHS->getOperand(0);
3719 SDValue AndRHS = LHS->getOperand(1);
3720
3721 // Canonicalize the AND to have the mask on the RHS
3722 if (isa<ConstantSDNode>(AndLHS)) {
3723 std::swap(AndLHS, AndRHS);
3724 }
3725
3726 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3727 if (!MaskCnst) {
3728 // Mask must be constant
3729 return false;
3730 }
3731
3732 uint64_t MaskVal = MaskCnst->getZExtValue();
3733 uint64_t NumZeros;
3734 uint64_t NumBits;
3735 if (isMask_64(MaskVal)) {
3736 NumZeros = 0;
3737 // The number of bits in the result bitfield will be the number of
3738 // trailing ones (the AND) minus the number of bits we shift off
3739 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3740 } else if (isShiftedMask_64(MaskVal)) {
3741 NumZeros = llvm::countr_zero(MaskVal);
3742 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3743 // The number of bits in the result bitfield will be the number of
3744 // trailing zeros plus the number of set bits in the mask minus the
3745 // number of bits we shift off
3746 NumBits = NumZeros + NumOnes - ShiftAmt;
3747 } else {
3748 // This is not a mask we can handle
3749 return false;
3750 }
3751
3752 if (ShiftAmt < NumZeros) {
3753 // Handling this case would require extra logic that would make this
3754 // transformation non-profitable
3755 return false;
3756 }
3757
3758 Val = AndLHS;
3759 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3760 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3761 } else if (LHS->getOpcode() == ISD::SHL) {
3762 // Here, we have a pattern like:
3763 //
3764 // (sra (shl val, NN), MM)
3765 // or
3766 // (srl (shl val, NN), MM)
3767 //
3768 // If MM >= NN, we can efficiently optimize this with bfe
3769 Val = LHS->getOperand(0);
3770
3771 SDValue ShlRHS = LHS->getOperand(1);
3772 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3773 if (!ShlCnst) {
3774 // Shift amount must be constant
3775 return false;
3776 }
3777 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3778
3779 SDValue ShrRHS = RHS;
3780 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3781 if (!ShrCnst) {
3782 // Shift amount must be constant
3783 return false;
3784 }
3785 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3786
3787 // To avoid extra codegen and be profitable, we need Outer >= Inner
3788 if (OuterShiftAmt < InnerShiftAmt) {
3789 return false;
3790 }
3791
3792 // If the outer shift is more than the type size, we have no bitfield to
3793 // extract (since we also check that the inner shift is <= the outer shift
3794 // then this also implies that the inner shift is < the type size)
3795 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3796 return false;
3797 }
3798
3799 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3800 MVT::i32);
3801 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3802 DL, MVT::i32);
3803
3804 if (N->getOpcode() == ISD::SRA) {
3805 // If we have a arithmetic right shift, we need to use the signed bfe
3806 // variant
3807 IsSigned = true;
3808 }
3809 } else {
3810 // No can do...
3811 return false;
3812 }
3813 } else {
3814 // No can do...
3815 return false;
3816 }
3817
3818
3819 unsigned Opc;
3820 // For the BFE operations we form here from "and" and "srl", always use the
3821 // unsigned variants.
3822 if (Val.getValueType() == MVT::i32) {
3823 if (IsSigned) {
3824 Opc = NVPTX::BFE_S32rii;
3825 } else {
3826 Opc = NVPTX::BFE_U32rii;
3827 }
3828 } else if (Val.getValueType() == MVT::i64) {
3829 if (IsSigned) {
3830 Opc = NVPTX::BFE_S64rii;
3831 } else {
3832 Opc = NVPTX::BFE_U64rii;
3833 }
3834 } else {
3835 // We cannot handle this type
3836 return false;
3837 }
3838
3839 SDValue Ops[] = {
3840 Val, Start, Len
3841 };
3842
3843 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3844 return true;
3845}
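// Reference sketch (plain C++, for illustration only) of what the unsigned
// BFE selected above computes, with Start/Len as in the operands built here:
//   uint64_t bfe_u(uint64_t Val, unsigned Start, unsigned Len) {
//     uint64_t Mask = (Len >= 64) ? ~0ULL : ((1ULL << Len) - 1);
//     return (Val >> Start) & Mask;
//   }
// e.g. (and (srl %x, 8), 0xff) becomes bfe.u32 %r, %x, 8, 8.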
3846
3847// SelectDirectAddr - Match a direct address for DAG.
3848// A direct address could be a globaladdress or externalsymbol.
3849bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3850 // Return true if TGA or ES.
3851 if (N.getOpcode() == ISD::TargetGlobalAddress ||
3852 N.getOpcode() == ISD::TargetExternalSymbol) {
3853 Address = N;
3854 return true;
3855 }
3856 if (N.getOpcode() == NVPTXISD::Wrapper) {
3857 Address = N.getOperand(0);
3858 return true;
3859 }
3860 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3861 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3862 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3863 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3864 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3865 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3866 }
3867 return false;
3868}
3869
3870// symbol+offset
3871bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3872 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3873 if (Addr.getOpcode() == ISD::ADD) {
3874 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3875 SDValue base = Addr.getOperand(0);
3876 if (SelectDirectAddr(base, Base)) {
3877 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3878 mvt);
3879 return true;
3880 }
3881 }
3882 }
3883 return false;
3884}
3885
3886// symbol+offset
3887bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3888 SDValue &Base, SDValue &Offset) {
3889 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3890}
3891
3892// symbol+offset
3893bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3894 SDValue &Base, SDValue &Offset) {
3895 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3896}
3897
3898// register+offset
3899bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3900 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3901 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3902 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3903 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3904 return true;
3905 }
3906 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3907 Addr.getOpcode() == ISD::TargetGlobalAddress)
3908 return false; // direct calls.
3909
3910 if (Addr.getOpcode() == ISD::ADD) {
3911 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3912 return false;
3913 }
3914 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3915 if (FrameIndexSDNode *FIN =
3916 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3917 // Constant offset from frame ref.
3918 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3919 else
3920 Base = Addr.getOperand(0);
3921
3922 // Offset must fit in a 32-bit signed int in PTX [register+offset] address
3923 // mode
3924 if (!CN->getAPIntValue().isSignedIntN(32))
3925 return false;
3926
3927 Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(OpNode),
3928 MVT::i32);
3929 return true;
3930 }
3931 }
3932 return false;
3933}
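// Illustrative note: SelectADDRri matches (add %ptr, 16) as Base = %ptr,
// Offset = 16, i.e. the PTX addressing form [%rd1+16]. Offsets that do not
// fit in a signed 32-bit immediate are rejected here, so the load/store
// selection above falls back to the register-only (areg) opcodes.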
3934
3935// register+offset
3936bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3937 SDValue &Base, SDValue &Offset) {
3938 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3939}
3940
3941// register+offset
3942bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3943 SDValue &Base, SDValue &Offset) {
3944 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3945}
3946
3947bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3948 unsigned int spN) const {
3949 const Value *Src = nullptr;
3950 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3951 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3952 return true;
3953 Src = mN->getMemOperand()->getValue();
3954 }
3955 if (!Src)
3956 return false;
3957 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3958 return (PT->getAddressSpace() == spN);
3959 return false;
3960}
3961
3962/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3963/// inline asm expressions.
3964 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3965 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
3966 std::vector<SDValue> &OutOps) {
3967 SDValue Op0, Op1;
3968 switch (ConstraintID) {
3969 default:
3970 return true;
3971 case InlineAsm::ConstraintCode::m: // memory
3972 if (SelectDirectAddr(Op, Op0)) {
3973 OutOps.push_back(Op0);
3974 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3975 return false;
3976 }
3977 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3978 OutOps.push_back(Op0);
3979 OutOps.push_back(Op1);
3980 return false;
3981 }
3982 break;
3983 }
3984 return true;
3985}
3986
3987void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
3988 // Lower a CopyToReg with two 64-bit inputs
3989 // Dst:i128, lo:i64, hi:i64
3990 //
3991 // CopyToReg Dst, lo, hi;
3992 //
3993 // ==>
3994 //
3995 // tmp = V2I64toI128 {lo, hi};
3996 // CopyToReg Dst, tmp;
3997 SDValue Dst = N->getOperand(1);
3998 SDValue Lo = N->getOperand(2);
3999 SDValue Hi = N->getOperand(3);
4000
4001 SDLoc DL(N);
4002 SDNode *Mov =
4003 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
4004
4005 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
4006 NewOps[0] = N->getOperand(0);
4007 NewOps[1] = Dst;
4008 NewOps[2] = SDValue(Mov, 0);
4009 if (N->getNumOperands() == 5)
4010 NewOps[3] = N->getOperand(4);
4011 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
4012
4013 ReplaceNode(N, NewValue.getNode());
4014}
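// Illustrative before/after of the rewrite above (for a function that
// returns an i128 through a virtual register):
//   before: ch = CopyToReg ch, %vreg, %lo:i64, %hi:i64
//   after:  %t:i128 = V2I64toI128 %lo, %hi
//           ch = CopyToReg ch, %vreg, %t:i128
// so later passes see a single 128-bit value instead of a split pair.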
4015
4016void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
4017 // Lower CopyFromReg from a 128-bit regs to two 64-bit regs
4018 // Dst:i128, Src:i128
4019 //
4020 // {lo, hi} = CopyFromReg Src
4021 //
4022 // ==>
4023 //
4024 // {lo, hi} = I128toV2I64 Src
4025 //
4026 SDValue Ch = N->getOperand(0);
4027 SDValue Src = N->getOperand(1);
4028 SDValue Glue = N->getOperand(2);
4029 SDLoc DL(N);
4030
4031 // Add Glue and Ch to the operands and results to avoid breaking the
4032 // execution order
4033 SDNode *Mov = CurDAG->getMachineNode(
4034 NVPTX::I128toV2I64, DL,
4035 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
4036 {Src, Ch, Glue});
4037
4038 ReplaceNode(N, Mov);
4039}
4040
4041/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
4042/// conversion from \p SrcTy to \p DestTy.
4043unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
4044 LoadSDNode *LdNode) {
4045 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
4046 switch (SrcTy.SimpleTy) {
4047 default:
4048 llvm_unreachable("Unhandled source type");
4049 case MVT::i8:
4050 switch (DestTy.SimpleTy) {
4051 default:
4052 llvm_unreachable("Unhandled dest type");
4053 case MVT::i16:
4054 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
4055 case MVT::i32:
4056 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
4057 case MVT::i64:
4058 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
4059 }
4060 case MVT::i16:
4061 switch (DestTy.SimpleTy) {
4062 default:
4063 llvm_unreachable("Unhandled dest type");
4064 case MVT::i8:
4065 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
4066 case MVT::i32:
4067 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
4068 case MVT::i64:
4069 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
4070 }
4071 case MVT::i32:
4072 switch (DestTy.SimpleTy) {
4073 default:
4074 llvm_unreachable("Unhandled dest type");
4075 case MVT::i8:
4076 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
4077 case MVT::i16:
4078 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
4079 case MVT::i64:
4080 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
4081 }
4082 case MVT::i64:
4083 switch (DestTy.SimpleTy) {
4084 default:
4085 llvm_unreachable("Unhandled dest type");
4086 case MVT::i8:
4087 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
4088 case MVT::i16:
4089 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
4090 case MVT::i32:
4091 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
4092 }
4093 case MVT::f16:
4094 switch (DestTy.SimpleTy) {
4095 default:
4096 llvm_unreachable("Unhandled dest type");
4097 case MVT::f32:
4098 return NVPTX::CVT_f32_f16;
4099 case MVT::f64:
4100 return NVPTX::CVT_f64_f16;
4101 }
4102 }
4103}
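// Usage sketch (hypothetical call, for illustration): for an extending i16
// load feeding an i32 value,
//   GetConvertOpcode(MVT::i32, MVT::i16, LdNode)
// returns NVPTX::CVT_s32_s16 when LdNode is a SEXTLOAD and NVPTX::CVT_u32_u16
// otherwise, i.e. PTX cvt.s32.s16 vs. cvt.u32.u16.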
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static const LLT F64
static const LLT F32
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
uint64_t Addr
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define getOpcodeForVectorStParam(n, ty, isimm)
static unsigned int getCodeAddrSpace(MemSDNode *N)
static int getLdStRegType(EVT VT)
static unsigned pickOpcodeForVectorStParam(SmallVector< SDValue, 8 > &Ops, unsigned NumElts, MVT::SimpleValueType MemTy, SelectionDAG *CurDAG, SDLoc DL)
static unsigned int getCodeMemorySemantic(MemSDNode *N, const NVPTXSubtarget *Subtarget)
#define getOpcodeForVectorStParamV2(ty, isimm)
static cl::opt< bool > EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden, cl::desc("Enable reciprocal sqrt optimization"))
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ)
static std::optional< unsigned > pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16, unsigned Opcode_i32, std::optional< unsigned > Opcode_i64, unsigned Opcode_f32, std::optional< unsigned > Opcode_f64)
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F)
if(VerifyEach)
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
#define PASS_NAME
Value * RHS
Value * LHS
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
This is an SDNode representing atomic operations.
const SDValue & getVal() const
const ConstantFP * getConstantFPValue() const
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:269
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
const ConstantInt * getConstantIntValue() const
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:410
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
This class is used to represent ISD::LOAD nodes.
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Machine Value Type.
SimpleValueType SimpleTy
bool isVector() const
Return true if this is a vector value type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
A description of a memory reference used in the backend.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
EVT getMemoryVT() const
Return the type of the in-memory value.
NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
bool SelectInlineAsmMemoryOperand(const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector< SDValue > &OutOps) override
SelectInlineAsmMemoryOperand - Implement addressing mode selection for inline asm expressions.
const NVPTXSubtarget * Subtarget
const NVPTXTargetLowering * getTargetLowering() const override
bool hasRelaxedMMIO() const
bool hasMemoryOrdering() const
bool useF32FTZ(const MachineFunction &MF) const
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool allowUnsafeFPMath(MachineFunction &MF) const
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const SDValue & getOperand(unsigned Num) const
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
MachineFunction * MF
CodeGenOptLevel OptLevel
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
virtual bool runOnMachineFunction(MachineFunction &mf)
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:488
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:742
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT)
Definition: SelectionDAG.h:723
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:691
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getValue() const
unsigned getPointerSizeInBits(unsigned AS) const
LLVM Value Representation.
Definition: Value.h:74
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1284
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1074
@ TargetExternalSymbol
Definition: ISDOpcodes.h:175
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1280
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ TargetGlobalAddress
TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or anything else with this node, because it is a TargetOpcode node.
Definition: ISDOpcodes.h:170
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
Definition: ISDOpcodes.h:209
@ AND
Bitwise operators - and, or, xor.
Definition: ISDOpcodes.h:708
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:190
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:937
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
Definition: ISDOpcodes.h:198
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out, when considering SETFALSE (something that never exists dynamically) as 0.
Definition: ISDOpcodes.h:1578
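One practical difference between the two intrinsic node flavors is where the intrinsic ID constant sits: with no chain it is operand 0, with a chain it follows the chain as operand 1. A small sketch (assumes N is already known to be one of these node kinds):

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Returns the intrinsic ID encoded in an INTRINSIC_* node.
static unsigned getIntrinsicID(const llvm::SDNode *N) {
  bool HasChain = N->getOpcode() != llvm::ISD::INTRINSIC_WO_CHAIN;
  // The ID is a constant operand; skip the chain operand if present.
  return unsigned(N->getConstantOperandVal(HasChain ? 1 : 0));
}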
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
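drop_begin and all_of are typical of the STLExtras range helpers: they take whole ranges instead of begin/end iterator pairs. A tiny self-contained example:

#include "llvm/ADT/STLExtras.h"
#include <vector>

bool tailIsEven() {
  std::vector<int> V = {7, 2, 4, 6};
  // Skip the first element, then test the predicate on the remainder.
  return llvm::all_of(llvm::drop_begin(V),
                      [](int X) { return X % 2 == 0; });  // true
}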
bool Isv2x16VT(EVT VT)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition: bit.h:307
@ ADDRESS_SPACE_GENERIC
Definition: NVPTXBaseInfo.h:22
@ ADDRESS_SPACE_LOCAL
Definition: NVPTXBaseInfo.h:26
@ ADDRESS_SPACE_CONST
Definition: NVPTXBaseInfo.h:25
@ ADDRESS_SPACE_GLOBAL
Definition: NVPTXBaseInfo.h:23
@ ADDRESS_SPACE_PARAM
Definition: NVPTXBaseInfo.h:29
@ ADDRESS_SPACE_SHARED
Definition: NVPTXBaseInfo.h:24
FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOptLevel OptLevel)
createNVPTXISelDag - This pass converts a legalized DAG into a NVPTX-specific DAG,...
int countr_zero(T Val)
Count the number of zeros from the least significant bit to the first one bit.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition: MathExtras.h:285
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit with the remainder zero (64 bit version).
Definition: MathExtras.h:273
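These bit predicates are the raw material of mask-driven peepholes such as bitfield-extract (BFE) matching: a shifted mask identifies which AND/shift combinations mean "extract Width bits starting at Start". A worked example on the mask 0x78:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

void maskAnatomy() {
  uint64_t M = 0x78;                          // 0b01111000
  bool Shifted = llvm::isShiftedMask_64(M);   // true: one contiguous run of ones
  bool LowMask = llvm::isMask_64(M);          // false: run does not start at bit 0
  int Start = llvm::countr_zero(M);           // 3: bit index where the run begins
  int Width = llvm::countr_one(M >> Start);   // 4: length of the run
  // So (x & M) >> Start extracts a Width-bit field starting at bit Start.
  (void)Shifted; (void)LowMask; (void)Start; (void)Width;
}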
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
AtomicOrdering
Atomic ordering for LLVM's memory model.
bool isKernelFunction(const Function &F)
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions.
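getUnderlyingObjects is how address-space and aliasing peepholes prove a property of every object a pointer might refer to. A sketch with the predicate left abstract (Pred is a placeholder for, e.g., "is a global in the right address space"):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"

// True only if *every* possible underlying object satisfies Pred,
// looking through PHIs and selects up to the default lookup depth.
static bool allObjectsSatisfy(const llvm::Value *Ptr,
                              bool (*Pred)(const llvm::Value *)) {
  llvm::SmallVector<const llvm::Value *, 8> Objs;
  llvm::getUnderlyingObjects(Ptr, Objs);
  return !Objs.empty() && llvm::all_of(Objs, Pred);
}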
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
This represents a list of ValueTypes that has been interned by a SelectionDAG.