LLVM 20.0.0git
NVPTXISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
15#include "NVPTXUtilities.h"
18#include "llvm/IR/GlobalValue.h"
20#include "llvm/IR/IntrinsicsNVPTX.h"
23#include "llvm/Support/Debug.h"
27
using namespace llvm;

// Tag used by LLVM_DEBUG / -debug-only=nvptx-isel filtering in this file.
#define DEBUG_TYPE "nvptx-isel"
// Human-readable pass name reported by the pass manager.
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
32
33static cl::opt<bool>
34 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
35 cl::desc("Enable reciprocal sqrt optimization"));
36
37/// createNVPTXISelDag - This pass converts a legalized DAG into a
38/// NVPTX-specific DAG, ready for instruction scheduling.
40 llvm::CodeGenOptLevel OptLevel) {
41 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
42}
43
45 CodeGenOptLevel OptLevel)
47 ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}
48
50
52
54 CodeGenOptLevel OptLevel)
55 : SelectionDAGISel(tm, OptLevel), TM(tm) {
56 doMulWide = (OptLevel > CodeGenOptLevel::None);
57}
58
62}
63
// Query the desired precision level for f32 division lowering.
int NVPTXDAGToDAGISel::getDivF32Level() const {
  // NOTE(review): the body of this accessor was dropped by the extraction —
  // upstream delegates the query to the target lowering / TM options;
  // restore it from the original file before compiling.
}
67
// Whether f32 sqrt should be lowered with the precise (IEEE) sequence.
bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  // NOTE(review): body dropped by the extraction — upstream delegates to the
  // target lowering; restore from the original file before compiling.
}
71
// Whether f32 operations should flush denormals to zero (.ftz).
bool NVPTXDAGToDAGISel::useF32FTZ() const {
  // NOTE(review): body dropped by the extraction — upstream delegates to the
  // target lowering; restore from the original file before compiling.
}
75
76bool NVPTXDAGToDAGISel::allowFMA() const {
78 return TL->allowFMA(*MF, OptLevel);
79}
80
81bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
83 return TL->allowUnsafeFPMath(*MF);
84}
85
86bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
87
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
// NOTE(review): the doxygen extraction this text came from dropped many
// `case` labels inside the switch below (marked inline). The code as shown
// will not compile until they are restored from the original file.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // Dispatch on opcode to a custom try* selector; any selector returning
  // false falls through to the auto-generated matcher at the bottom.
  switch (N->getOpcode()) {
  case ISD::LOAD:
  // NOTE(review): a case label was dropped here (extraction gap).
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  // NOTE(review): a case label was dropped here (extraction gap).
    if (tryStore(N))
      return;
    break;
  // NOTE(review): case label dropped here (extract_vector_elt handling).
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  // NOTE(review): case label dropped here (f16x2 setp handling).
    SelectSETP_F16X2(N);
    return;
  // NOTE(review): case label dropped here (bf16x2 setp handling).
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  // NOTE(review): store-vector case labels dropped here.
    if (tryStoreVector(N))
      return;
    break;
  // NOTE(review): load-param case labels dropped here.
    if (tryLoadParam(N))
      return;
    break;
  // NOTE(review): store-retval case labels dropped here.
    if (tryStoreRetval(N))
      return;
    break;
  // NOTE(review): store-param case labels dropped here.
    if (tryStoreParam(N))
      return;
    break;
  // NOTE(review): intrinsic-without-chain case label dropped here.
    if (tryIntrinsicNoChain(N))
      return;
    break;
  // NOTE(review): intrinsic-with-chain case label dropped here.
    if (tryIntrinsicChain(N))
      return;
    break;
  // NOTE(review): a long run of texture-opcode case labels (inner lines
  // 163-336 of the original) was dropped here.
    if (tryTextureIntrinsic(N))
      return;
    break;
  // NOTE(review): a long run of surface-opcode case labels (inner lines
  // 340-504 of the original) was dropped here.
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  // NOTE(review): addrspacecast case label dropped here.
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  case ISD::CopyToReg: {
    // i128 copies are split into a pair of i64s by a custom selector.
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
      return;
    }
    break;
  }
  case ISD::CopyFromReg: {
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
      return;
    }
    break;
  }
  default:
    break;
  }
  // Fall back to the tablegen-generated matcher.
  SelectCode(N);
}
541
542bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
543 unsigned IID = N->getConstantOperandVal(1);
544 switch (IID) {
545 default:
546 return false;
547 case Intrinsic::nvvm_ldg_global_f:
548 case Intrinsic::nvvm_ldg_global_i:
549 case Intrinsic::nvvm_ldg_global_p:
550 case Intrinsic::nvvm_ldu_global_f:
551 case Intrinsic::nvvm_ldu_global_i:
552 case Intrinsic::nvvm_ldu_global_p:
553 return tryLDGLDU(N);
554 }
555}
556
557// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
558// have to load them into an .(b)f16 register first.
559bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
560 if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
561 return false;
563 cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
564 SDNode *LoadConstF16 = CurDAG->getMachineNode(
565 (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
566 : NVPTX::LOAD_CONST_BF16),
567 SDLoc(N), N->getValueType(0), Val);
568 ReplaceNode(N, LoadConstF16);
569 return true;
570}
571
572// Map ISD:CONDCODE value to appropriate CmpMode expected by
573// NVPTXInstPrinter::printCmpMode()
574static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
576 unsigned PTXCmpMode = [](ISD::CondCode CC) {
577 switch (CC) {
578 default:
579 llvm_unreachable("Unexpected condition code.");
580 case ISD::SETOEQ:
581 return CmpMode::EQ;
582 case ISD::SETOGT:
583 return CmpMode::GT;
584 case ISD::SETOGE:
585 return CmpMode::GE;
586 case ISD::SETOLT:
587 return CmpMode::LT;
588 case ISD::SETOLE:
589 return CmpMode::LE;
590 case ISD::SETONE:
591 return CmpMode::NE;
592 case ISD::SETO:
593 return CmpMode::NUM;
594 case ISD::SETUO:
595 return CmpMode::NotANumber;
596 case ISD::SETUEQ:
597 return CmpMode::EQU;
598 case ISD::SETUGT:
599 return CmpMode::GTU;
600 case ISD::SETUGE:
601 return CmpMode::GEU;
602 case ISD::SETULT:
603 return CmpMode::LTU;
604 case ISD::SETULE:
605 return CmpMode::LEU;
606 case ISD::SETUNE:
607 return CmpMode::NEU;
608 case ISD::SETEQ:
609 return CmpMode::EQ;
610 case ISD::SETGT:
611 return CmpMode::GT;
612 case ISD::SETGE:
613 return CmpMode::GE;
614 case ISD::SETLT:
615 return CmpMode::LT;
616 case ISD::SETLE:
617 return CmpMode::LE;
618 case ISD::SETNE:
619 return CmpMode::NE;
620 }
621 }(CondCode.get());
622
623 if (FTZ)
624 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
625
626 return PTXCmpMode;
627}
628
629bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
630 unsigned PTXCmpMode =
631 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
632 SDLoc DL(N);
634 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
635 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
636 ReplaceNode(N, SetP);
637 return true;
638}
639
640bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
641 unsigned PTXCmpMode =
642 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
643 SDLoc DL(N);
645 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
646 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
647 ReplaceNode(N, SetP);
648 return true;
649}
650
651// Find all instances of extract_vector_elt that use this v2f16 vector
652// and coalesce them into a scattering move instruction.
653bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
654 SDValue Vector = N->getOperand(0);
655
656 // We only care about 16x2 as it's the only real vector type we
657 // need to deal with.
658 MVT VT = Vector.getSimpleValueType();
659 if (!Isv2x16VT(VT))
660 return false;
661 // Find and record all uses of this vector that extract element 0 or 1.
663 for (auto *U : Vector.getNode()->uses()) {
664 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
665 continue;
666 if (U->getOperand(0) != Vector)
667 continue;
668 if (const ConstantSDNode *IdxConst =
669 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
670 if (IdxConst->getZExtValue() == 0)
671 E0.push_back(U);
672 else if (IdxConst->getZExtValue() == 1)
673 E1.push_back(U);
674 else
675 llvm_unreachable("Invalid vector index.");
676 }
677 }
678
679 // There's no point scattering f16x2 if we only ever access one
680 // element of it.
681 if (E0.empty() || E1.empty())
682 return false;
683
684 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
685 // into f16,f16 SplitF16x2(V)
686 MVT EltVT = VT.getVectorElementType();
687 SDNode *ScatterOp =
688 CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
689 for (auto *Node : E0)
690 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
691 for (auto *Node : E1)
692 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
693
694 return true;
695}
696
// Map the memory operand's IR pointer address space to the PTXLdStInstCode
// address-space encoding carried on LD/ST machine instructions.
static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
  // NOTE(review): the `return` for the null-pointer fallback was dropped by
  // the extraction (upstream returns the GENERIC code); restore it.

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    // NOTE(review): the per-address-space `case ...: return ...;` lines were
    // dropped by the extraction; restore them from the original file.
    default: break;
    }
  }
  // NOTE(review): the final fallback `return` was also dropped.
}
716
717static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
718 unsigned CodeAddrSpace, MachineFunction *F) {
719 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
720 // space.
721 //
722 // We have two ways of identifying invariant loads: Loads may be explicitly
723 // marked as invariant, or we may infer them to be invariant.
724 //
725 // We currently infer invariance for loads from
726 // - constant global variables, and
727 // - kernel function pointer params that are noalias (i.e. __restrict) and
728 // never written to.
729 //
730 // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
731 // not during the SelectionDAG phase).
732 //
733 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
734 // explicitly invariant loads because these are how clang tells us to use ldg
735 // when the user uses a builtin.
736 if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
737 return false;
738
739 if (N->isInvariant())
740 return true;
741
742 bool IsKernelFn = isKernelFunction(F->getFunction());
743
744 // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
745 // because the former looks through phi nodes while the latter does not. We
746 // need to look through phi nodes to handle pointer induction variables.
748 getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
749
750 return all_of(Objs, [&](const Value *V) {
751 if (auto *A = dyn_cast<const Argument>(V))
752 return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
753 if (auto *GV = dyn_cast<const GlobalVariable>(V))
754 return GV->isConstant();
755 return false;
756 });
757}
758
759bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
760 unsigned IID = N->getConstantOperandVal(0);
761 switch (IID) {
762 default:
763 return false;
764 case Intrinsic::nvvm_texsurf_handle_internal:
765 SelectTexSurfHandle(N);
766 return true;
767 }
768}
769
770void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
771 // Op 0 is the intrinsic ID
772 SDValue Wrapper = N->getOperand(1);
773 SDValue GlobalVal = Wrapper.getOperand(0);
774 ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
775 MVT::i64, GlobalVal));
776}
777
778void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
779 SDValue Src = N->getOperand(0);
780 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
781 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
782 unsigned DstAddrSpace = CastN->getDestAddressSpace();
783 assert(SrcAddrSpace != DstAddrSpace &&
784 "addrspacecast must be between different address spaces");
785
786 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
787 // Specific to generic
788 unsigned Opc;
789 switch (SrcAddrSpace) {
790 default: report_fatal_error("Bad address space in addrspacecast");
792 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
793 break;
795 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
796 ? NVPTX::cvta_shared_6432
797 : NVPTX::cvta_shared_64)
798 : NVPTX::cvta_shared;
799 break;
801 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
802 ? NVPTX::cvta_const_6432
803 : NVPTX::cvta_const_64)
804 : NVPTX::cvta_const;
805 break;
807 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
808 ? NVPTX::cvta_local_6432
809 : NVPTX::cvta_local_64)
810 : NVPTX::cvta_local;
811 break;
812 }
813 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
814 Src));
815 return;
816 } else {
817 // Generic to specific
818 if (SrcAddrSpace != 0)
819 report_fatal_error("Cannot cast between two non-generic address spaces");
820 unsigned Opc;
821 switch (DstAddrSpace) {
822 default: report_fatal_error("Bad address space in addrspacecast");
824 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
825 break;
827 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
828 ? NVPTX::cvta_to_shared_3264
829 : NVPTX::cvta_to_shared_64)
830 : NVPTX::cvta_to_shared;
831 break;
833 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
834 ? NVPTX::cvta_to_const_3264
835 : NVPTX::cvta_to_const_64)
836 : NVPTX::cvta_to_const;
837 break;
839 Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
840 ? NVPTX::cvta_to_local_3264
841 : NVPTX::cvta_to_local_64)
842 : NVPTX::cvta_to_local;
843 break;
845 Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
846 : NVPTX::nvvm_ptr_gen_to_param;
847 break;
848 }
849 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
850 Src));
851 return;
852 }
853}
854
855// Helper function template to reduce amount of boilerplate code for
856// opcode selection.
857static std::optional<unsigned>
859 unsigned Opcode_i16, unsigned Opcode_i32,
860 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
861 std::optional<unsigned> Opcode_f64) {
862 switch (VT) {
863 case MVT::i1:
864 case MVT::i8:
865 return Opcode_i8;
866 case MVT::i16:
867 return Opcode_i16;
868 case MVT::i32:
869 return Opcode_i32;
870 case MVT::i64:
871 return Opcode_i64;
872 case MVT::f16:
873 case MVT::bf16:
874 return Opcode_i16;
875 case MVT::v2f16:
876 case MVT::v2bf16:
877 case MVT::v2i16:
878 case MVT::v4i8:
879 return Opcode_i32;
880 case MVT::f32:
881 return Opcode_f32;
882 case MVT::f64:
883 return Opcode_f64;
884 default:
885 return std::nullopt;
886 }
887}
888
889static int getLdStRegType(EVT VT) {
890 if (VT.isFloatingPoint())
891 switch (VT.getSimpleVT().SimpleTy) {
892 case MVT::f16:
893 case MVT::bf16:
894 case MVT::v2f16:
895 case MVT::v2bf16:
897 default:
899 }
900 else
902}
903
904bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
905 SDLoc dl(N);
906 MemSDNode *LD = cast<MemSDNode>(N);
907 assert(LD->readMem() && "Expected load");
908 LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
909 EVT LoadedVT = LD->getMemoryVT();
910 SDNode *NVPTXLD = nullptr;
911
912 // do not support pre/post inc/dec
913 if (PlainLoad && PlainLoad->isIndexed())
914 return false;
915
916 if (!LoadedVT.isSimple())
917 return false;
918
919 AtomicOrdering Ordering = LD->getSuccessOrdering();
920 // In order to lower atomic loads with stronger guarantees we would need to
921 // use load.acquire or insert fences. However these features were only added
922 // with PTX ISA 6.0 / sm_70.
923 // TODO: Check if we can actually use the new instructions and implement them.
924 if (isStrongerThanMonotonic(Ordering))
925 return false;
926
927 // Address Space Setting
928 unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
929 if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
930 return tryLDGLDU(N);
931 }
932
933 unsigned int PointerSize =
934 CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
935
936 // Volatile Setting
937 // - .volatile is only available for .global and .shared
938 // - .volatile has the same memory synchronization semantics as .relaxed.sys
939 bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
940 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
941 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
942 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
943 isVolatile = false;
944
945 // Type Setting: fromType + fromTypeWidth
946 //
947 // Sign : ISD::SEXTLOAD
948 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
949 // type is integer
950 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
951 MVT SimpleVT = LoadedVT.getSimpleVT();
952 MVT ScalarVT = SimpleVT.getScalarType();
953 // Read at least 8 bits (predicates are stored as 8-bit values)
954 unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
955 unsigned int fromType;
956
957 // Vector Setting
958 unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
959 if (SimpleVT.isVector()) {
960 assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
961 "Unexpected vector type");
962 // v2f16/v2bf16/v2i16 is loaded using ld.b32
963 fromTypeWidth = 32;
964 }
965
966 if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
968 else
969 fromType = getLdStRegType(ScalarVT);
970
971 // Create the machine instruction DAG
972 SDValue Chain = N->getOperand(0);
973 SDValue N1 = N->getOperand(1);
976 std::optional<unsigned> Opcode;
977 MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
978
979 if (SelectDirectAddr(N1, Addr)) {
980 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
981 NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
982 NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
983 if (!Opcode)
984 return false;
985 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
986 getI32Imm(vecType, dl), getI32Imm(fromType, dl),
987 getI32Imm(fromTypeWidth, dl), Addr, Chain };
988 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
989 } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
990 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
991 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
992 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
993 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
994 if (!Opcode)
995 return false;
996 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
997 getI32Imm(vecType, dl), getI32Imm(fromType, dl),
998 getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
999 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
1000 } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
1001 : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
1002 if (PointerSize == 64)
1003 Opcode =
1004 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
1005 NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
1006 NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
1007 else
1008 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
1009 NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
1010 NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
1011 if (!Opcode)
1012 return false;
1013 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
1014 getI32Imm(vecType, dl), getI32Imm(fromType, dl),
1015 getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
1016 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
1017 } else {
1018 if (PointerSize == 64)
1019 Opcode =
1020 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
1021 NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
1022 NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
1023 else
1024 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
1025 NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
1026 NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
1027 if (!Opcode)
1028 return false;
1029 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
1030 getI32Imm(vecType, dl), getI32Imm(fromType, dl),
1031 getI32Imm(fromTypeWidth, dl), N1, Chain };
1032 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
1033 }
1034
1035 if (!NVPTXLD)
1036 return false;
1037
1038 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1039 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1040
1041 ReplaceNode(N, NVPTXLD);
1042 return true;
1043}
1044
1045bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1046
1047 SDValue Chain = N->getOperand(0);
1048 SDValue Op1 = N->getOperand(1);
1050 std::optional<unsigned> Opcode;
1051 SDLoc DL(N);
1052 SDNode *LD;
1053 MemSDNode *MemSD = cast<MemSDNode>(N);
1054 EVT LoadedVT = MemSD->getMemoryVT();
1055
1056 if (!LoadedVT.isSimple())
1057 return false;
1058
1059 // Address Space Setting
1060 unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
1061 if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1062 return tryLDGLDU(N);
1063 }
1064
1065 unsigned int PointerSize =
1067
1068 // Volatile Setting
1069 // - .volatile is only availalble for .global and .shared
1070 bool IsVolatile = MemSD->isVolatile();
1071 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1072 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1073 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1074 IsVolatile = false;
1075
1076 // Vector Setting
1077 MVT SimpleVT = LoadedVT.getSimpleVT();
1078
1079 // Type Setting: fromType + fromTypeWidth
1080 //
1081 // Sign : ISD::SEXTLOAD
1082 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1083 // type is integer
1084 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1085 MVT ScalarVT = SimpleVT.getScalarType();
1086 // Read at least 8 bits (predicates are stored as 8-bit values)
1087 unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
1088 unsigned int FromType;
1089 // The last operand holds the original LoadSDNode::getExtensionType() value
1090 unsigned ExtensionType = cast<ConstantSDNode>(
1091 N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1092 if (ExtensionType == ISD::SEXTLOAD)
1094 else
1095 FromType = getLdStRegType(ScalarVT);
1096
1097 unsigned VecType;
1098
1099 switch (N->getOpcode()) {
1100 case NVPTXISD::LoadV2:
1102 break;
1103 case NVPTXISD::LoadV4:
1105 break;
1106 default:
1107 return false;
1108 }
1109
1110 EVT EltVT = N->getValueType(0);
1111
1112 // v8x16 is a special case. PTX doesn't have ld.v8.16
1113 // instruction. Instead, we split the vector into v2x16 chunks and
1114 // load them with ld.v4.b32.
1115 if (Isv2x16VT(EltVT)) {
1116 assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
1117 EltVT = MVT::i32;
1119 FromTypeWidth = 32;
1120 }
1121
1122 if (SelectDirectAddr(Op1, Addr)) {
1123 switch (N->getOpcode()) {
1124 default:
1125 return false;
1126 case NVPTXISD::LoadV2:
1127 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1128 NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1129 NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1130 NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1131 break;
1132 case NVPTXISD::LoadV4:
1133 Opcode =
1134 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1135 NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
1136 std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
1137 break;
1138 }
1139 if (!Opcode)
1140 return false;
1141 SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1142 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1143 getI32Imm(FromTypeWidth, DL), Addr, Chain };
1144 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1145 } else if (PointerSize == 64
1146 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
1147 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1148 switch (N->getOpcode()) {
1149 default:
1150 return false;
1151 case NVPTXISD::LoadV2:
1152 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1153 NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1154 NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1155 NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1156 break;
1157 case NVPTXISD::LoadV4:
1158 Opcode =
1159 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1160 NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
1161 std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
1162 break;
1163 }
1164 if (!Opcode)
1165 return false;
1166 SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1167 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1168 getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1169 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1170 } else if (PointerSize == 64
1171 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1172 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1173 if (PointerSize == 64) {
1174 switch (N->getOpcode()) {
1175 default:
1176 return false;
1177 case NVPTXISD::LoadV2:
1178 Opcode =
1180 NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
1181 NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
1182 NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
1183 break;
1184 case NVPTXISD::LoadV4:
1185 Opcode = pickOpcodeForVT(
1186 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1187 NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
1188 NVPTX::LDV_f32_v4_ari_64, std::nullopt);
1189 break;
1190 }
1191 } else {
1192 switch (N->getOpcode()) {
1193 default:
1194 return false;
1195 case NVPTXISD::LoadV2:
1196 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1197 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1198 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1199 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1200 break;
1201 case NVPTXISD::LoadV4:
1202 Opcode =
1203 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1204 NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
1205 std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
1206 break;
1207 }
1208 }
1209 if (!Opcode)
1210 return false;
1211 SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1212 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1213 getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1214
1215 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1216 } else {
1217 if (PointerSize == 64) {
1218 switch (N->getOpcode()) {
1219 default:
1220 return false;
1221 case NVPTXISD::LoadV2:
1222 Opcode = pickOpcodeForVT(
1223 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1224 NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1225 NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1226 NVPTX::LDV_f64_v2_areg_64);
1227 break;
1228 case NVPTXISD::LoadV4:
1229 Opcode = pickOpcodeForVT(
1230 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1231 NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
1232 NVPTX::LDV_f32_v4_areg_64, std::nullopt);
1233 break;
1234 }
1235 } else {
1236 switch (N->getOpcode()) {
1237 default:
1238 return false;
1239 case NVPTXISD::LoadV2:
1240 Opcode =
1241 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1242 NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1243 NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
1244 NVPTX::LDV_f64_v2_areg);
1245 break;
1246 case NVPTXISD::LoadV4:
1247 Opcode =
1248 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1249 NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
1250 std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
1251 break;
1252 }
1253 }
1254 if (!Opcode)
1255 return false;
1256 SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1257 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1258 getI32Imm(FromTypeWidth, DL), Op1, Chain };
1259 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1260 }
1261
1262 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1263 CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1264
1265 ReplaceNode(N, LD);
1266 return true;
1267}
1268
1269bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1270
1271 SDValue Chain = N->getOperand(0);
1272 SDValue Op1;
1273 MemSDNode *Mem;
1274 bool IsLDG = true;
1275
1276 // If this is an LDG intrinsic, the address is the third operand. If its an
1277 // LDG/LDU SD node (from custom vector handling), then its the second operand
1278 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1279 Op1 = N->getOperand(2);
1280 Mem = cast<MemIntrinsicSDNode>(N);
1281 unsigned IID = N->getConstantOperandVal(1);
1282 switch (IID) {
1283 default:
1284 return false;
1285 case Intrinsic::nvvm_ldg_global_f:
1286 case Intrinsic::nvvm_ldg_global_i:
1287 case Intrinsic::nvvm_ldg_global_p:
1288 IsLDG = true;
1289 break;
1290 case Intrinsic::nvvm_ldu_global_f:
1291 case Intrinsic::nvvm_ldu_global_i:
1292 case Intrinsic::nvvm_ldu_global_p:
1293 IsLDG = false;
1294 break;
1295 }
1296 } else {
1297 Op1 = N->getOperand(1);
1298 Mem = cast<MemSDNode>(N);
1299 }
1300
1301 std::optional<unsigned> Opcode;
1302 SDLoc DL(N);
1303 SDNode *LD;
1305 EVT OrigType = N->getValueType(0);
1306
1307 EVT EltVT = Mem->getMemoryVT();
1308 unsigned NumElts = 1;
1309 if (EltVT.isVector()) {
1310 NumElts = EltVT.getVectorNumElements();
1311 EltVT = EltVT.getVectorElementType();
1312 // vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
1313 if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
1314 (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
1315 (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
1316 assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1317 EltVT = OrigType;
1318 NumElts /= 2;
1319 } else if (OrigType == MVT::v4i8) {
1320 EltVT = OrigType;
1321 NumElts = 1;
1322 }
1323 }
1324
1325 // Build the "promoted" result VTList for the load. If we are really loading
1326 // i8s, then the return type will be promoted to i16 since we do not expose
1327 // 8-bit registers in NVPTX.
1328 EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1329 SmallVector<EVT, 5> InstVTs;
1330 for (unsigned i = 0; i != NumElts; ++i) {
1331 InstVTs.push_back(NodeVT);
1332 }
1333 InstVTs.push_back(MVT::Other);
1334 SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1335
1336 if (SelectDirectAddr(Op1, Addr)) {
1337 switch (N->getOpcode()) {
1338 default:
1339 return false;
1340 case ISD::LOAD:
1342 if (IsLDG)
1343 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1344 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1345 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1346 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1347 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1348 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1349 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1350 else
1351 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1352 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1353 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1354 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1355 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1356 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1357 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1358 break;
1359 case NVPTXISD::LoadV2:
1360 case NVPTXISD::LDGV2:
1361 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1362 NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1363 NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1364 NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1365 NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1366 NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1367 NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1368 break;
1369 case NVPTXISD::LDUV2:
1370 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1371 NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1372 NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1373 NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1374 NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1375 NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1376 NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1377 break;
1378 case NVPTXISD::LoadV4:
1379 case NVPTXISD::LDGV4:
1380 Opcode = pickOpcodeForVT(
1381 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1382 NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1383 NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1384 NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1385 break;
1386 case NVPTXISD::LDUV4:
1387 Opcode = pickOpcodeForVT(
1388 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1389 NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1390 NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1391 NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1392 break;
1393 }
1394 if (!Opcode)
1395 return false;
1396 SDValue Ops[] = { Addr, Chain };
1397 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1398 } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1399 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1400 if (TM.is64Bit()) {
1401 switch (N->getOpcode()) {
1402 default:
1403 return false;
1404 case ISD::LOAD:
1406 if (IsLDG)
1407 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1408 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1409 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1410 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1411 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1412 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1413 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1414 else
1415 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1416 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1417 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1418 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1419 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1420 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1421 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1422 break;
1423 case NVPTXISD::LoadV2:
1424 case NVPTXISD::LDGV2:
1425 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1426 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1427 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1428 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1429 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1430 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1431 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1432 break;
1433 case NVPTXISD::LDUV2:
1434 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1435 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1436 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1437 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1438 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1439 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1440 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1441 break;
1442 case NVPTXISD::LoadV4:
1443 case NVPTXISD::LDGV4:
1444 Opcode = pickOpcodeForVT(
1445 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1446 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1447 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1448 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1449 break;
1450 case NVPTXISD::LDUV4:
1451 Opcode = pickOpcodeForVT(
1452 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1453 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1454 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1455 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1456 break;
1457 }
1458 } else {
1459 switch (N->getOpcode()) {
1460 default:
1461 return false;
1462 case ISD::LOAD:
1464 if (IsLDG)
1465 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1466 NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1467 NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1468 NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1469 NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1470 NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1471 NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1472 else
1473 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1474 NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1475 NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1476 NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1477 NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1478 NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1479 NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1480 break;
1481 case NVPTXISD::LoadV2:
1482 case NVPTXISD::LDGV2:
1483 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1484 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1485 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1486 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1487 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1488 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1489 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1490 break;
1491 case NVPTXISD::LDUV2:
1492 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1493 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1494 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1495 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1496 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1497 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1498 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1499 break;
1500 case NVPTXISD::LoadV4:
1501 case NVPTXISD::LDGV4:
1502 Opcode = pickOpcodeForVT(
1503 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1504 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1505 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1506 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1507 break;
1508 case NVPTXISD::LDUV4:
1509 Opcode = pickOpcodeForVT(
1510 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1511 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1512 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1513 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1514 break;
1515 }
1516 }
1517 if (!Opcode)
1518 return false;
1519 SDValue Ops[] = {Base, Offset, Chain};
1520 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1521 } else {
1522 if (TM.is64Bit()) {
1523 switch (N->getOpcode()) {
1524 default:
1525 return false;
1526 case ISD::LOAD:
1528 if (IsLDG)
1529 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1530 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1531 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1532 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1533 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1534 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1535 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1536 else
1537 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1538 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1539 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1540 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1541 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1542 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1543 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1544 break;
1545 case NVPTXISD::LoadV2:
1546 case NVPTXISD::LDGV2:
1547 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1548 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1549 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1550 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1551 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1552 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1553 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1554 break;
1555 case NVPTXISD::LDUV2:
1556 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1557 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1558 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1559 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1560 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1561 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1562 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1563 break;
1564 case NVPTXISD::LoadV4:
1565 case NVPTXISD::LDGV4:
1566 Opcode = pickOpcodeForVT(
1567 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1568 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1569 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1570 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1571 break;
1572 case NVPTXISD::LDUV4:
1573 Opcode = pickOpcodeForVT(
1574 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1575 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1576 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1577 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1578 break;
1579 }
1580 } else {
1581 switch (N->getOpcode()) {
1582 default:
1583 return false;
1584 case ISD::LOAD:
1586 if (IsLDG)
1587 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1588 NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1589 NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1590 NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1591 NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1592 NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1593 NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1594 else
1595 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1596 NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1597 NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1598 NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1599 NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1600 NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1601 NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1602 break;
1603 case NVPTXISD::LoadV2:
1604 case NVPTXISD::LDGV2:
1605 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1606 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1607 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1608 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1609 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1610 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1611 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1612 break;
1613 case NVPTXISD::LDUV2:
1614 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1615 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1616 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1617 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1618 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1619 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1620 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1621 break;
1622 case NVPTXISD::LoadV4:
1623 case NVPTXISD::LDGV4:
1624 Opcode = pickOpcodeForVT(
1625 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1626 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1627 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1628 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1629 break;
1630 case NVPTXISD::LDUV4:
1631 Opcode = pickOpcodeForVT(
1632 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1633 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1634 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1635 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1636 break;
1637 }
1638 }
1639 if (!Opcode)
1640 return false;
1641 SDValue Ops[] = { Op1, Chain };
1642 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1643 }
1644
1645 // For automatic generation of LDG (through SelectLoad[Vector], not the
1646 // intrinsics), we may have an extending load like:
1647 //
1648 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1649 //
1650 // In this case, the matching logic above will select a load for the original
1651 // memory type (in this case, i8) and our types will not match (the node needs
1652 // to return an i32 in this case). Our LDG/LDU nodes do not support the
1653 // concept of sign-/zero-extension, so emulate it here by adding an explicit
1654 // CVT instruction. Ptxas should clean up any redundancies here.
1655
1656 LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1657
1658 if (OrigType != EltVT &&
1659 (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
1660 // We have an extending-load. The instruction we selected operates on the
1661 // smaller type, but the SDNode we are replacing has the larger type. We
1662 // need to emit a CVT to make the types match.
1663 unsigned CvtOpc =
1664 GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
1665
1666 // For each output value, apply the manual sign/zero-extension and make sure
1667 // all users of the load go through that CVT.
1668 for (unsigned i = 0; i != NumElts; ++i) {
1669 SDValue Res(LD, i);
1670 SDValue OrigVal(N, i);
1671
1672 SDNode *CvtNode =
1673 CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1675 DL, MVT::i32));
1676 ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1677 }
1678 }
1679
1680 ReplaceNode(N, LD);
1681 return true;
1682}
1683
1684bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1685 SDLoc dl(N);
1686 MemSDNode *ST = cast<MemSDNode>(N);
1687 assert(ST->writeMem() && "Expected store");
1688 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1689 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1690 assert((PlainStore || AtomicStore) && "Expected store");
1691 EVT StoreVT = ST->getMemoryVT();
1692 SDNode *NVPTXST = nullptr;
1693
1694 // do not support pre/post inc/dec
1695 if (PlainStore && PlainStore->isIndexed())
1696 return false;
1697
1698 if (!StoreVT.isSimple())
1699 return false;
1700
1701 AtomicOrdering Ordering = ST->getSuccessOrdering();
1702 // In order to lower atomic loads with stronger guarantees we would need to
1703 // use store.release or insert fences. However these features were only added
1704 // with PTX ISA 6.0 / sm_70.
1705 // TODO: Check if we can actually use the new instructions and implement them.
1706 if (isStrongerThanMonotonic(Ordering))
1707 return false;
1708
1709 // Address Space Setting
1710 unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1711 unsigned int PointerSize =
1712 CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1713
1714 // Volatile Setting
1715 // - .volatile is only available for .global and .shared
1716 // - .volatile has the same memory synchronization semantics as .relaxed.sys
1717 bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
1718 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1719 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1720 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1721 isVolatile = false;
1722
1723 // Vector Setting
1724 MVT SimpleVT = StoreVT.getSimpleVT();
1725 unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
1726
1727 // Type Setting: toType + toTypeWidth
1728 // - for integer type, always use 'u'
1729 //
1730 MVT ScalarVT = SimpleVT.getScalarType();
1731 unsigned toTypeWidth = ScalarVT.getSizeInBits();
1732 if (SimpleVT.isVector()) {
1733 assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
1734 "Unexpected vector type");
1735 // v2x16 is stored using st.b32
1736 toTypeWidth = 32;
1737 }
1738
1739 unsigned int toType = getLdStRegType(ScalarVT);
1740
1741 // Create the machine instruction DAG
1742 SDValue Chain = ST->getChain();
1743 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1744 SDValue BasePtr = ST->getBasePtr();
1745 SDValue Addr;
1747 std::optional<unsigned> Opcode;
1748 MVT::SimpleValueType SourceVT =
1749 Value.getNode()->getSimpleValueType(0).SimpleTy;
1750
1751 if (SelectDirectAddr(BasePtr, Addr)) {
1752 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1753 NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1754 NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1755 if (!Opcode)
1756 return false;
1757 SDValue Ops[] = {Value,
1758 getI32Imm(isVolatile, dl),
1759 getI32Imm(CodeAddrSpace, dl),
1760 getI32Imm(vecType, dl),
1761 getI32Imm(toType, dl),
1762 getI32Imm(toTypeWidth, dl),
1763 Addr,
1764 Chain};
1765 NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1766 } else if (PointerSize == 64
1767 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1768 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1769 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1770 NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1771 NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1772 if (!Opcode)
1773 return false;
1774 SDValue Ops[] = {Value,
1775 getI32Imm(isVolatile, dl),
1776 getI32Imm(CodeAddrSpace, dl),
1777 getI32Imm(vecType, dl),
1778 getI32Imm(toType, dl),
1779 getI32Imm(toTypeWidth, dl),
1780 Base,
1781 Offset,
1782 Chain};
1783 NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1784 } else if (PointerSize == 64
1785 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1786 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1787 if (PointerSize == 64)
1788 Opcode =
1789 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1790 NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
1791 NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1792 else
1793 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1794 NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1795 NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1796 if (!Opcode)
1797 return false;
1798
1799 SDValue Ops[] = {Value,
1800 getI32Imm(isVolatile, dl),
1801 getI32Imm(CodeAddrSpace, dl),
1802 getI32Imm(vecType, dl),
1803 getI32Imm(toType, dl),
1804 getI32Imm(toTypeWidth, dl),
1805 Base,
1806 Offset,
1807 Chain};
1808 NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1809 } else {
1810 if (PointerSize == 64)
1811 Opcode =
1812 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1813 NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1814 NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1815 else
1816 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1817 NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1818 NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1819 if (!Opcode)
1820 return false;
1821 SDValue Ops[] = {Value,
1822 getI32Imm(isVolatile, dl),
1823 getI32Imm(CodeAddrSpace, dl),
1824 getI32Imm(vecType, dl),
1825 getI32Imm(toType, dl),
1826 getI32Imm(toTypeWidth, dl),
1827 BasePtr,
1828 Chain};
1829 NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
1830 }
1831
1832 if (!NVPTXST)
1833 return false;
1834
1835 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1836 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1837 ReplaceNode(N, NVPTXST);
1838 return true;
1839}
1840
1841bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1842 SDValue Chain = N->getOperand(0);
1843 SDValue Op1 = N->getOperand(1);
1845 std::optional<unsigned> Opcode;
1846 SDLoc DL(N);
1847 SDNode *ST;
1848 EVT EltVT = Op1.getValueType();
1849 MemSDNode *MemSD = cast<MemSDNode>(N);
1850 EVT StoreVT = MemSD->getMemoryVT();
1851
1852 // Address Space Setting
1853 unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1854 if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1855 report_fatal_error("Cannot store to pointer that points to constant "
1856 "memory space");
1857 }
1858 unsigned int PointerSize =
1860
1861 // Volatile Setting
1862 // - .volatile is only availalble for .global and .shared
1863 bool IsVolatile = MemSD->isVolatile();
1864 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1865 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1866 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1867 IsVolatile = false;
1868
1869 // Type Setting: toType + toTypeWidth
1870 // - for integer type, always use 'u'
1871 assert(StoreVT.isSimple() && "Store value is not simple");
1872 MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1873 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1874 unsigned ToType = getLdStRegType(ScalarVT);
1875
1877 SDValue N2;
1878 unsigned VecType;
1879
1880 switch (N->getOpcode()) {
1881 case NVPTXISD::StoreV2:
1883 StOps.push_back(N->getOperand(1));
1884 StOps.push_back(N->getOperand(2));
1885 N2 = N->getOperand(3);
1886 break;
1887 case NVPTXISD::StoreV4:
1889 StOps.push_back(N->getOperand(1));
1890 StOps.push_back(N->getOperand(2));
1891 StOps.push_back(N->getOperand(3));
1892 StOps.push_back(N->getOperand(4));
1893 N2 = N->getOperand(5);
1894 break;
1895 default:
1896 return false;
1897 }
1898
1899 // v8x16 is a special case. PTX doesn't have st.v8.x16
1900 // instruction. Instead, we split the vector into v2x16 chunks and
1901 // store them with st.v4.b32.
1902 if (Isv2x16VT(EltVT)) {
1903 assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1904 EltVT = MVT::i32;
1906 ToTypeWidth = 32;
1907 }
1908
1909 StOps.push_back(getI32Imm(IsVolatile, DL));
1910 StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1911 StOps.push_back(getI32Imm(VecType, DL));
1912 StOps.push_back(getI32Imm(ToType, DL));
1913 StOps.push_back(getI32Imm(ToTypeWidth, DL));
1914
1915 if (SelectDirectAddr(N2, Addr)) {
1916 switch (N->getOpcode()) {
1917 default:
1918 return false;
1919 case NVPTXISD::StoreV2:
1920 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1921 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1922 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1923 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1924 break;
1925 case NVPTXISD::StoreV4:
1926 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1927 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1928 NVPTX::STV_i32_v4_avar, std::nullopt,
1929 NVPTX::STV_f32_v4_avar, std::nullopt);
1930 break;
1931 }
1932 StOps.push_back(Addr);
1933 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1934 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1935 switch (N->getOpcode()) {
1936 default:
1937 return false;
1938 case NVPTXISD::StoreV2:
1939 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1940 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1941 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1942 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1943 break;
1944 case NVPTXISD::StoreV4:
1945 Opcode =
1946 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1947 NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
1948 std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
1949 break;
1950 }
1951 StOps.push_back(Base);
1952 StOps.push_back(Offset);
1953 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1954 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1955 if (PointerSize == 64) {
1956 switch (N->getOpcode()) {
1957 default:
1958 return false;
1959 case NVPTXISD::StoreV2:
1960 Opcode =
1962 NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
1963 NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
1964 NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
1965 break;
1966 case NVPTXISD::StoreV4:
1967 Opcode = pickOpcodeForVT(
1968 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
1969 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
1970 NVPTX::STV_f32_v4_ari_64, std::nullopt);
1971 break;
1972 }
1973 } else {
1974 switch (N->getOpcode()) {
1975 default:
1976 return false;
1977 case NVPTXISD::StoreV2:
1978 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1979 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
1980 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
1981 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
1982 break;
1983 case NVPTXISD::StoreV4:
1984 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1985 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
1986 NVPTX::STV_i32_v4_ari, std::nullopt,
1987 NVPTX::STV_f32_v4_ari, std::nullopt);
1988 break;
1989 }
1990 }
1991 StOps.push_back(Base);
1992 StOps.push_back(Offset);
1993 } else {
1994 if (PointerSize == 64) {
1995 switch (N->getOpcode()) {
1996 default:
1997 return false;
1998 case NVPTXISD::StoreV2:
1999 Opcode = pickOpcodeForVT(
2000 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2001 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2002 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2003 NVPTX::STV_f64_v2_areg_64);
2004 break;
2005 case NVPTXISD::StoreV4:
2006 Opcode = pickOpcodeForVT(
2007 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2008 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
2009 NVPTX::STV_f32_v4_areg_64, std::nullopt);
2010 break;
2011 }
2012 } else {
2013 switch (N->getOpcode()) {
2014 default:
2015 return false;
2016 case NVPTXISD::StoreV2:
2017 Opcode =
2018 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2019 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2020 NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
2021 NVPTX::STV_f64_v2_areg);
2022 break;
2023 case NVPTXISD::StoreV4:
2024 Opcode =
2025 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2026 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
2027 std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
2028 break;
2029 }
2030 }
2031 StOps.push_back(N2);
2032 }
2033
2034 if (!Opcode)
2035 return false;
2036
2037 StOps.push_back(Chain);
2038
2039 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
2040
2041 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2042 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2043
2044 ReplaceNode(N, ST);
2045 return true;
2046}
2047
2048bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2049 SDValue Chain = Node->getOperand(0);
2050 SDValue Offset = Node->getOperand(2);
2051 SDValue Glue = Node->getOperand(3);
2052 SDLoc DL(Node);
2053 MemSDNode *Mem = cast<MemSDNode>(Node);
2054
2055 unsigned VecSize;
2056 switch (Node->getOpcode()) {
2057 default:
2058 return false;
2060 VecSize = 1;
2061 break;
2063 VecSize = 2;
2064 break;
2066 VecSize = 4;
2067 break;
2068 }
2069
2070 EVT EltVT = Node->getValueType(0);
2071 EVT MemVT = Mem->getMemoryVT();
2072
2073 std::optional<unsigned> Opcode;
2074
2075 switch (VecSize) {
2076 default:
2077 return false;
2078 case 1:
2079 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2080 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2081 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2082 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2083 break;
2084 case 2:
2085 Opcode =
2086 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2087 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2088 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
2089 NVPTX::LoadParamMemV2F64);
2090 break;
2091 case 4:
2092 Opcode =
2093 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2094 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
2095 std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
2096 break;
2097 }
2098 if (!Opcode)
2099 return false;
2100
2101 SDVTList VTs;
2102 if (VecSize == 1) {
2103 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2104 } else if (VecSize == 2) {
2105 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2106 } else {
2107 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2108 VTs = CurDAG->getVTList(EVTs);
2109 }
2110
2111 unsigned OffsetVal = Offset->getAsZExtVal();
2112
2114 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2115 Ops.push_back(Chain);
2116 Ops.push_back(Glue);
2117
2118 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2119 return true;
2120}
2121
2122bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2123 SDLoc DL(N);
2124 SDValue Chain = N->getOperand(0);
2125 SDValue Offset = N->getOperand(1);
2126 unsigned OffsetVal = Offset->getAsZExtVal();
2127 MemSDNode *Mem = cast<MemSDNode>(N);
2128
2129 // How many elements do we have?
2130 unsigned NumElts = 1;
2131 switch (N->getOpcode()) {
2132 default:
2133 return false;
2135 NumElts = 1;
2136 break;
2138 NumElts = 2;
2139 break;
2141 NumElts = 4;
2142 break;
2143 }
2144
2145 // Build vector of operands
2147 for (unsigned i = 0; i < NumElts; ++i)
2148 Ops.push_back(N->getOperand(i + 2));
2149 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2150 Ops.push_back(Chain);
2151
2152 // Determine target opcode
2153 // If we have an i1, use an 8-bit store. The lowering code in
2154 // NVPTXISelLowering will have already emitted an upcast.
2155 std::optional<unsigned> Opcode = 0;
2156 switch (NumElts) {
2157 default:
2158 return false;
2159 case 1:
2161 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2162 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2163 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2164 if (Opcode == NVPTX::StoreRetvalI8) {
2165 // Fine tune the opcode depending on the size of the operand.
2166 // This helps to avoid creating redundant COPY instructions in
2167 // InstrEmitter::AddRegisterOperand().
2168 switch (Ops[0].getSimpleValueType().SimpleTy) {
2169 default:
2170 break;
2171 case MVT::i32:
2172 Opcode = NVPTX::StoreRetvalI8TruncI32;
2173 break;
2174 case MVT::i64:
2175 Opcode = NVPTX::StoreRetvalI8TruncI64;
2176 break;
2177 }
2178 }
2179 break;
2180 case 2:
2182 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2183 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2184 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2185 break;
2186 case 4:
2188 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2189 NVPTX::StoreRetvalV4I32, std::nullopt,
2190 NVPTX::StoreRetvalV4F32, std::nullopt);
2191 break;
2192 }
2193 if (!Opcode)
2194 return false;
2195
2196 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2197 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2198 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2199
2200 ReplaceNode(N, Ret);
2201 return true;
2202}
2203
// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri).
// Each StoreParamV{2,4}<ty> opcode has one 'i' (immediate) or 'r' (register)
// suffix letter per operand; these macros select the right suffix combination
// from an array of per-operand immediate flags.
#define getOpcV2H(ty, opKind0, opKind1)                                        \
  NVPTX::StoreParamV2##ty##_##opKind0##opKind1

#define getOpcV2H1(ty, opKind0, isImm1)                                        \
  (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)

#define getOpcodeForVectorStParamV2(ty, isimm)                                 \
  (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])

#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3)                      \
  NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3

#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3)                      \
  (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i)                       \
           : getOpcV4H(ty, opKind0, opKind1, opKind2, r)

#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3)                       \
  (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3)                       \
           : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)

#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3)                        \
  (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3)                        \
           : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)

#define getOpcodeForVectorStParamV4(ty, isimm)                                 \
  (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3])                 \
             : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])

#define getOpcodeForVectorStParam(n, ty, isimm)                                \
  (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm)                            \
           : getOpcodeForVectorStParamV4(ty, isimm)
2236
2238 unsigned NumElts,
2240 SelectionDAG *CurDAG, SDLoc DL) {
2241 // Determine which inputs are registers and immediates make new operators
2242 // with constant values
2243 SmallVector<bool, 4> IsImm(NumElts, false);
2244 for (unsigned i = 0; i < NumElts; i++) {
2245 IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
2246 if (IsImm[i]) {
2247 SDValue Imm = Ops[i];
2248 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2249 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2250 const ConstantFP *CF = ConstImm->getConstantFPValue();
2251 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2252 } else {
2253 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2254 const ConstantInt *CI = ConstImm->getConstantIntValue();
2255 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2256 }
2257 Ops[i] = Imm;
2258 }
2259 }
2260
2261 // Get opcode for MemTy, size, and register/immediate operand ordering
2262 switch (MemTy) {
2263 case MVT::i8:
2264 return getOpcodeForVectorStParam(NumElts, I8, IsImm);
2265 case MVT::i16:
2266 return getOpcodeForVectorStParam(NumElts, I16, IsImm);
2267 case MVT::i32:
2268 return getOpcodeForVectorStParam(NumElts, I32, IsImm);
2269 case MVT::i64:
2270 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2271 return getOpcodeForVectorStParamV2(I64, IsImm);
2272 case MVT::f32:
2273 return getOpcodeForVectorStParam(NumElts, F32, IsImm);
2274 case MVT::f64:
2275 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2276 return getOpcodeForVectorStParamV2(F64, IsImm);
2277
2278 // These cases don't support immediates, just use the all register version
2279 // and generate moves.
2280 case MVT::i1:
2281 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
2282 : NVPTX::StoreParamV4I8_rrrr;
2283 case MVT::f16:
2284 case MVT::bf16:
2285 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
2286 : NVPTX::StoreParamV4I16_rrrr;
2287 case MVT::v2f16:
2288 case MVT::v2bf16:
2289 case MVT::v2i16:
2290 case MVT::v4i8:
2291 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
2292 : NVPTX::StoreParamV4I32_rrrr;
2293 default:
2294 llvm_unreachable("Cannot select st.param for unknown MemTy");
2295 }
2296}
2297
// tryStoreParam - Select a StoreParam machine instruction for an
// NVPTXISD::StoreParam-style node: collect the value operands plus the
// param-index and byte-offset target constants, pick the machine opcode from
// the stored memory VT, attach the memory operand, and replace N.
// Returns true when a machine node was emitted.
//
// NOTE(review): this listing lost several source lines during HTML
// extraction (gaps in the embedded numbering): the `case NVPTXISD::...`
// labels before 2316/2319/2322 and around 2399/2408, the Ops SmallVector
// declaration at 2327, the MemTy declarations at 2345/2390, the
// pickOpcodeForVT callee line at 2367, and the CvtNone initializers at
// 2401/2410. Restore them from the upstream file before compiling.
2298 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2299 SDLoc DL(N);
2300 SDValue Chain = N->getOperand(0);
2301 SDValue Param = N->getOperand(1);
2302 unsigned ParamVal = Param->getAsZExtVal();
2303 SDValue Offset = N->getOperand(2);
2304 unsigned OffsetVal = Offset->getAsZExtVal();
2305 MemSDNode *Mem = cast<MemSDNode>(N);
// The last operand of the node carries the inbound glue value.
2306 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2307
2308 // How many elements do we have?
2309 unsigned NumElts;
2310 switch (N->getOpcode()) {
2311 default:
2312 llvm_unreachable("Unexpected opcode");
// NOTE(review): case labels missing here — presumably the scalar/V2/V4
// StoreParam opcodes. TODO: confirm against upstream.
2316 NumElts = 1;
2317 break;
2319 NumElts = 2;
2320 break;
2322 NumElts = 4;
2323 break;
2324 }
2325
2326 // Build vector of operands
// NOTE(review): the declaration of `Ops` (line 2327) is missing here.
2328 for (unsigned i = 0; i < NumElts; ++i)
2329 Ops.push_back(N->getOperand(i + 3));
2330 Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2331 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2332 Ops.push_back(Chain);
2333 Ops.push_back(Glue);
2334
2335 // Determine target opcode
2336 // If we have an i1, use an 8-bit store. The lowering code in
2337 // NVPTXISelLowering will have already emitted an upcast.
2338 std::optional<unsigned> Opcode;
2339 switch (N->getOpcode()) {
2340 default:
2341 switch (NumElts) {
2342 default:
2343 llvm_unreachable("Unexpected NumElts");
2344 case 1: {
// NOTE(review): the declaration of `MemTy` (line 2345) is missing here.
2346 SDValue Imm = Ops[0];
2347 if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
2348 (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
2349 // Convert immediate to target constant
2350 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2351 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2352 const ConstantFP *CF = ConstImm->getConstantFPValue();
2353 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2354 } else {
2355 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2356 const ConstantInt *CI = ConstImm->getConstantIntValue();
2357 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2358 }
2359 Ops[0] = Imm;
2360 // Use immediate version of store param
2361 Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
2362 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
2363 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
2364 NVPTX::StoreParamF64_i);
2365 } else
2366 Opcode =
// NOTE(review): the `pickOpcodeForVT(MemTy,` callee line (2367) is missing.
2368 NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
2369 NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
2370 NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
2371 if (Opcode == NVPTX::StoreParamI8_r) {
2372 // Fine tune the opcode depending on the size of the operand.
2373 // This helps to avoid creating redundant COPY instructions in
2374 // InstrEmitter::AddRegisterOperand().
2375 switch (Ops[0].getSimpleValueType().SimpleTy) {
2376 default:
2377 break;
2378 case MVT::i32:
2379 Opcode = NVPTX::StoreParamI8TruncI32_r;
2380 break;
2381 case MVT::i64:
2382 Opcode = NVPTX::StoreParamI8TruncI64_r;
2383 break;
2384 }
2385 }
2386 break;
2387 }
2388 case 2:
2389 case 4: {
// NOTE(review): the declaration of `MemTy` (line 2390) is missing here.
2391 Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
2392 break;
2393 }
2394 }
2395 break;
2396 // Special case: if we have a sign-extend/zero-extend node, insert the
2397 // conversion instruction first, and use that as the value operand to
2398 // the selected StoreParam node.
// NOTE(review): a case label (2399) and the CvtNone initializer (2401) are
// missing in this zero-extend arm.
2400 Opcode = NVPTX::StoreParamI32_r;
2402 MVT::i32);
2403 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2404 MVT::i32, Ops[0], CvtNone);
2405 Ops[0] = SDValue(Cvt, 0);
2406 break;
2407 }
// NOTE(review): a case label (2408) and the CvtNone initializer (2410) are
// missing in this sign-extend arm.
2409 Opcode = NVPTX::StoreParamI32_r;
2411 MVT::i32);
2412 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2413 MVT::i32, Ops[0], CvtNone);
2414 Ops[0] = SDValue(Cvt, 0);
2415 break;
2416 }
2417 }
2418
// The machine node produces a chain and an outbound glue result.
2419 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2420 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2421 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2422 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2423
2424 ReplaceNode(N, Ret);
2425 return true;
2426}
2427
// tryTextureIntrinsic - Map an NVPTXISD texture node (tex / tld4, both the
// standalone-sampler *_RR forms and the unified *_R forms) to its machine
// opcode, then replace N with that machine node, forwarding the operands and
// moving the chain to the back. Returns false when the opcode is not in the
// table, leaving the node for other selection paths.
//
// NOTE(review): every `case NVPTXISD::...:` label line of this switch (the
// odd embedded line numbers, e.g. 2433, 2436, ...) and the declaration of
// `Ops` at line 2958 were dropped by the page extraction; the `Opc = ...;
// break;` pairs below are otherwise intact. Restore the labels from the
// upstream file before compiling.
2428 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2429 unsigned Opc = 0;
2430
2431 switch (N->getOpcode()) {
2432 default: return false;
2434 Opc = NVPTX::TEX_1D_F32_S32_RR;
2435 break;
2437 Opc = NVPTX::TEX_1D_F32_F32_RR;
2438 break;
2440 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2441 break;
2443 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2444 break;
2446 Opc = NVPTX::TEX_1D_S32_S32_RR;
2447 break;
2449 Opc = NVPTX::TEX_1D_S32_F32_RR;
2450 break;
2452 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2453 break;
2455 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2456 break;
2458 Opc = NVPTX::TEX_1D_U32_S32_RR;
2459 break;
2461 Opc = NVPTX::TEX_1D_U32_F32_RR;
2462 break;
2464 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2465 break;
2467 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2468 break;
2470 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2471 break;
2473 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2474 break;
2476 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2477 break;
2479 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2480 break;
2482 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2483 break;
2485 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2486 break;
2488 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2489 break;
2491 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2492 break;
2494 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2495 break;
2497 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2498 break;
2500 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2501 break;
2503 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2504 break;
2506 Opc = NVPTX::TEX_2D_F32_S32_RR;
2507 break;
2509 Opc = NVPTX::TEX_2D_F32_F32_RR;
2510 break;
2512 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2513 break;
2515 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2516 break;
2518 Opc = NVPTX::TEX_2D_S32_S32_RR;
2519 break;
2521 Opc = NVPTX::TEX_2D_S32_F32_RR;
2522 break;
2524 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2525 break;
2527 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2528 break;
2530 Opc = NVPTX::TEX_2D_U32_S32_RR;
2531 break;
2533 Opc = NVPTX::TEX_2D_U32_F32_RR;
2534 break;
2536 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2537 break;
2539 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2540 break;
2542 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2543 break;
2545 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2546 break;
2548 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2549 break;
2551 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2552 break;
2554 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2555 break;
2557 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2558 break;
2560 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2561 break;
2563 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2564 break;
2566 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2567 break;
2569 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2570 break;
2572 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2573 break;
2575 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2576 break;
2578 Opc = NVPTX::TEX_3D_F32_S32_RR;
2579 break;
2581 Opc = NVPTX::TEX_3D_F32_F32_RR;
2582 break;
2584 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2585 break;
2587 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2588 break;
2590 Opc = NVPTX::TEX_3D_S32_S32_RR;
2591 break;
2593 Opc = NVPTX::TEX_3D_S32_F32_RR;
2594 break;
2596 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2597 break;
2599 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2600 break;
2602 Opc = NVPTX::TEX_3D_U32_S32_RR;
2603 break;
2605 Opc = NVPTX::TEX_3D_U32_F32_RR;
2606 break;
2608 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2609 break;
2611 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2612 break;
2614 Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2615 break;
2617 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2618 break;
2620 Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2621 break;
2623 Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2624 break;
2626 Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2627 break;
2629 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2630 break;
2632 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2633 break;
2635 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2636 break;
2638 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2639 break;
2641 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2642 break;
2644 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2645 break;
2647 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2648 break;
2650 Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2651 break;
2653 Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2654 break;
2656 Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2657 break;
2659 Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2660 break;
2662 Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2663 break;
2665 Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2666 break;
2668 Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2669 break;
2671 Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2672 break;
2674 Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2675 break;
2677 Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2678 break;
2680 Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2681 break;
2683 Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2684 break;
2686 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2687 break;
2689 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2690 break;
2692 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2693 break;
2695 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2696 break;
2698 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2699 break;
2701 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2702 break;
2704 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2705 break;
2707 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2708 break;
2710 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2711 break;
2713 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2714 break;
2716 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2717 break;
2719 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2720 break;
2722 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2723 break;
2725 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2726 break;
2728 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2729 break;
2731 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2732 break;
2734 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2735 break;
2737 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2738 break;
2740 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2741 break;
2743 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2744 break;
2746 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2747 break;
2749 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2750 break;
2752 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2753 break;
2755 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2756 break;
2758 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2759 break;
2761 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2762 break;
2764 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2765 break;
2767 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2768 break;
2770 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2771 break;
2773 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2774 break;
2776 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2777 break;
2779 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2780 break;
2782 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2783 break;
2785 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2786 break;
2788 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2789 break;
2791 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2792 break;
2794 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2795 break;
2797 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2798 break;
2800 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2801 break;
2803 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2804 break;
2806 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2807 break;
2809 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2810 break;
2812 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2813 break;
2815 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2816 break;
2818 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2819 break;
2821 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2822 break;
2824 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2825 break;
2827 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2828 break;
2830 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2831 break;
2833 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
2834 break;
2836 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
2837 break;
2839 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
2840 break;
2842 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
2843 break;
2845 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
2846 break;
2848 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
2849 break;
2851 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
2852 break;
2854 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
2855 break;
2857 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
2858 break;
2860 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
2861 break;
2863 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
2864 break;
2866 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
2867 break;
2869 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
2870 break;
2872 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
2873 break;
2875 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
2876 break;
2878 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
2879 break;
2881 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
2882 break;
2884 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
2885 break;
2887 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
2888 break;
2890 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
2891 break;
2893 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
2894 break;
2896 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
2897 break;
2899 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
2900 break;
2902 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
2903 break;
2905 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
2906 break;
2908 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
2909 break;
2911 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
2912 break;
2914 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
2915 break;
2917 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
2918 break;
2920 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
2921 break;
2923 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
2924 break;
2926 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
2927 break;
2929 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
2930 break;
2932 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
2933 break;
2935 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
2936 break;
2938 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R;
2939 break;
2941 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R;
2942 break;
2944 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R;
2945 break;
2947 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R;
2948 break;
2950 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R;
2951 break;
2953 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R;
2954 break;
2955 }
2956
2957 // Copy over operands
// NOTE(review): the declaration of `Ops` (line 2958) is missing here.
2959 Ops.push_back(N->getOperand(0)); // Move chain to the back.
2960
2961 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2962 return true;
2963}
2964
// trySurfaceIntrinsic - Map an NVPTXISD surface-load node to its SULD machine
// opcode (1D/2D/3D, array and vector variants, for the CLAMP/TRAP/ZERO
// out-of-range modes), then replace N with the machine node, forwarding the
// operands and moving the chain to the back. Returns false when the opcode
// is not in the table.
//
// NOTE(review): every `case NVPTXISD::Suld...:` label line of this switch
// (the missing embedded line numbers, e.g. 2969, 2972, ...) and the
// declaration of `Ops` at line 3467 were dropped by the page extraction;
// restore them from the upstream file before compiling.
2965 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2966 unsigned Opc = 0;
2967 switch (N->getOpcode()) {
2968 default: return false;
2970 Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2971 break;
2973 Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2974 break;
2976 Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2977 break;
2979 Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2980 break;
2982 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2983 break;
2985 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2986 break;
2988 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2989 break;
2991 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2992 break;
2994 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2995 break;
2997 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2998 break;
3000 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
3001 break;
3003 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
3004 break;
3006 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
3007 break;
3009 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
3010 break;
3012 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
3013 break;
3015 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
3016 break;
3018 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
3019 break;
3021 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
3022 break;
3024 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
3025 break;
3027 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
3028 break;
3030 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
3031 break;
3033 Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
3034 break;
3036 Opc = NVPTX::SULD_2D_I8_CLAMP_R;
3037 break;
3039 Opc = NVPTX::SULD_2D_I16_CLAMP_R;
3040 break;
3042 Opc = NVPTX::SULD_2D_I32_CLAMP_R;
3043 break;
3045 Opc = NVPTX::SULD_2D_I64_CLAMP_R;
3046 break;
3048 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
3049 break;
3051 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
3052 break;
3054 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
3055 break;
3057 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
3058 break;
3060 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
3061 break;
3063 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
3064 break;
3066 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
3067 break;
3069 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
3070 break;
3072 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
3073 break;
3075 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
3076 break;
3078 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
3079 break;
3081 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
3082 break;
3084 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
3085 break;
3087 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
3088 break;
3090 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
3091 break;
3093 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
3094 break;
3096 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
3097 break;
3099 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
3100 break;
3102 Opc = NVPTX::SULD_3D_I8_CLAMP_R;
3103 break;
3105 Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3106 break;
3108 Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3109 break;
3111 Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3112 break;
3114 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3115 break;
3117 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3118 break;
3120 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3121 break;
3123 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3124 break;
3126 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3127 break;
3129 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3130 break;
3132 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3133 break;
3135 Opc = NVPTX::SULD_1D_I8_TRAP_R;
3136 break;
3138 Opc = NVPTX::SULD_1D_I16_TRAP_R;
3139 break;
3141 Opc = NVPTX::SULD_1D_I32_TRAP_R;
3142 break;
3144 Opc = NVPTX::SULD_1D_I64_TRAP_R;
3145 break;
3147 Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3148 break;
3150 Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3151 break;
3153 Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3154 break;
3156 Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3157 break;
3159 Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3160 break;
3162 Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3163 break;
3165 Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3166 break;
3168 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3169 break;
3171 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3172 break;
3174 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3175 break;
3177 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3178 break;
3180 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3181 break;
3183 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3184 break;
3186 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3187 break;
3189 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3190 break;
3192 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3193 break;
3195 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3196 break;
3198 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3199 break;
3201 Opc = NVPTX::SULD_2D_I8_TRAP_R;
3202 break;
3204 Opc = NVPTX::SULD_2D_I16_TRAP_R;
3205 break;
3207 Opc = NVPTX::SULD_2D_I32_TRAP_R;
3208 break;
3210 Opc = NVPTX::SULD_2D_I64_TRAP_R;
3211 break;
3213 Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3214 break;
3216 Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3217 break;
3219 Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3220 break;
3222 Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3223 break;
3225 Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3226 break;
3228 Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3229 break;
3231 Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3232 break;
3234 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3235 break;
3237 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3238 break;
3240 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3241 break;
3243 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3244 break;
3246 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3247 break;
3249 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3250 break;
3252 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3253 break;
3255 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3256 break;
3258 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3259 break;
3261 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3262 break;
3264 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3265 break;
3267 Opc = NVPTX::SULD_3D_I8_TRAP_R;
3268 break;
3270 Opc = NVPTX::SULD_3D_I16_TRAP_R;
3271 break;
3273 Opc = NVPTX::SULD_3D_I32_TRAP_R;
3274 break;
3276 Opc = NVPTX::SULD_3D_I64_TRAP_R;
3277 break;
3279 Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3280 break;
3282 Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3283 break;
3285 Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3286 break;
3288 Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3289 break;
3291 Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3292 break;
3294 Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3295 break;
3297 Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3298 break;
3300 Opc = NVPTX::SULD_1D_I8_ZERO_R;
3301 break;
3303 Opc = NVPTX::SULD_1D_I16_ZERO_R;
3304 break;
3306 Opc = NVPTX::SULD_1D_I32_ZERO_R;
3307 break;
3309 Opc = NVPTX::SULD_1D_I64_ZERO_R;
3310 break;
3312 Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3313 break;
3315 Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3316 break;
3318 Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3319 break;
3321 Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3322 break;
3324 Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3325 break;
3327 Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3328 break;
3330 Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3331 break;
3333 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3334 break;
3336 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3337 break;
3339 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3340 break;
3342 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3343 break;
3345 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3346 break;
3348 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3349 break;
3351 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3352 break;
3354 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3355 break;
3357 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3358 break;
3360 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3361 break;
3363 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3364 break;
3366 Opc = NVPTX::SULD_2D_I8_ZERO_R;
3367 break;
3369 Opc = NVPTX::SULD_2D_I16_ZERO_R;
3370 break;
3372 Opc = NVPTX::SULD_2D_I32_ZERO_R;
3373 break;
3375 Opc = NVPTX::SULD_2D_I64_ZERO_R;
3376 break;
3378 Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3379 break;
3381 Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3382 break;
3384 Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3385 break;
3387 Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3388 break;
3390 Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3391 break;
3393 Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3394 break;
3396 Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3397 break;
3399 Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3400 break;
3402 Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3403 break;
3405 Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3406 break;
3408 Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3409 break;
3411 Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3412 break;
3414 Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3415 break;
3417 Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3418 break;
3420 Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3421 break;
3423 Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3424 break;
3426 Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3427 break;
3429 Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3430 break;
3432 Opc = NVPTX::SULD_3D_I8_ZERO_R;
3433 break;
3435 Opc = NVPTX::SULD_3D_I16_ZERO_R;
3436 break;
3438 Opc = NVPTX::SULD_3D_I32_ZERO_R;
3439 break;
3441 Opc = NVPTX::SULD_3D_I64_ZERO_R;
3442 break;
3444 Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3445 break;
3447 Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3448 break;
3450 Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3451 break;
3453 Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3454 break;
3456 Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3457 break;
3459 Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3460 break;
3462 Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3463 break;
3464 }
3465
3466 // Copy over operands
// NOTE(review): the declaration of `Ops` (line 3467) is missing here.
3468 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3469
3470 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3471 return true;
3472}
3473
3474
3475/// SelectBFE - Look for instruction sequences that can be made more efficient
3476/// by using the 'bfe' (bit-field extract) PTX instruction
3477bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3478 SDLoc DL(N);
3479 SDValue LHS = N->getOperand(0);
3480 SDValue RHS = N->getOperand(1);
3481 SDValue Len;
3482 SDValue Start;
3483 SDValue Val;
3484 bool IsSigned = false;
3485
3486 if (N->getOpcode() == ISD::AND) {
3487 // Canonicalize the operands
3488 // We want 'and %val, %mask'
3489 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3490 std::swap(LHS, RHS);
3491 }
3492
3493 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3494 if (!Mask) {
3495 // We need a constant mask on the RHS of the AND
3496 return false;
3497 }
3498
3499 // Extract the mask bits
3500 uint64_t MaskVal = Mask->getZExtValue();
3501 if (!isMask_64(MaskVal)) {
3502 // We *could* handle shifted masks here, but doing so would require an
3503 // 'and' operation to fix up the low-order bits so we would trade
3504 // shr+and for bfe+and, which has the same throughput
3505 return false;
3506 }
3507
3508 // How many bits are in our mask?
3509 int64_t NumBits = countr_one(MaskVal);
3510 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3511
3512 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3513 // We have a 'srl/and' pair, extract the effective start bit and length
3514 Val = LHS.getNode()->getOperand(0);
3515 Start = LHS.getNode()->getOperand(1);
3516 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3517 if (StartConst) {
3518 uint64_t StartVal = StartConst->getZExtValue();
3519 // How many "good" bits do we have left? "good" is defined here as bits
3520 // that exist in the original value, not shifted in.
3521 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3522 if (NumBits > GoodBits) {
3523 // Do not handle the case where bits have been shifted in. In theory
3524 // we could handle this, but the cost is likely higher than just
3525 // emitting the srl/and pair.
3526 return false;
3527 }
3528 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3529 } else {
3530 // Do not handle the case where the shift amount (can be zero if no srl
3531 // was found) is not constant. We could handle this case, but it would
3532 // require run-time logic that would be more expensive than just
3533 // emitting the srl/and pair.
3534 return false;
3535 }
3536 } else {
3537 // Do not handle the case where the LHS of the and is not a shift. While
3538 // it would be trivial to handle this case, it would just transform
3539 // 'and' -> 'bfe', but 'and' has higher-throughput.
3540 return false;
3541 }
3542 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3543 if (LHS->getOpcode() == ISD::AND) {
3544 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3545 if (!ShiftCnst) {
3546 // Shift amount must be constant
3547 return false;
3548 }
3549
3550 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3551
3552 SDValue AndLHS = LHS->getOperand(0);
3553 SDValue AndRHS = LHS->getOperand(1);
3554
3555 // Canonicalize the AND to have the mask on the RHS
3556 if (isa<ConstantSDNode>(AndLHS)) {
3557 std::swap(AndLHS, AndRHS);
3558 }
3559
3560 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3561 if (!MaskCnst) {
3562 // Mask must be constant
3563 return false;
3564 }
3565
3566 uint64_t MaskVal = MaskCnst->getZExtValue();
3567 uint64_t NumZeros;
3568 uint64_t NumBits;
3569 if (isMask_64(MaskVal)) {
3570 NumZeros = 0;
3571 // The number of bits in the result bitfield will be the number of
3572 // trailing ones (the AND) minus the number of bits we shift off
3573 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3574 } else if (isShiftedMask_64(MaskVal)) {
3575 NumZeros = llvm::countr_zero(MaskVal);
3576 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3577 // The number of bits in the result bitfield will be the number of
3578 // trailing zeros plus the number of set bits in the mask minus the
3579 // number of bits we shift off
3580 NumBits = NumZeros + NumOnes - ShiftAmt;
3581 } else {
3582 // This is not a mask we can handle
3583 return false;
3584 }
3585
3586 if (ShiftAmt < NumZeros) {
3587 // Handling this case would require extra logic that would make this
3588 // transformation non-profitable
3589 return false;
3590 }
3591
3592 Val = AndLHS;
3593 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3594 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3595 } else if (LHS->getOpcode() == ISD::SHL) {
3596 // Here, we have a pattern like:
3597 //
3598 // (sra (shl val, NN), MM)
3599 // or
3600 // (srl (shl val, NN), MM)
3601 //
3602 // If MM >= NN, we can efficiently optimize this with bfe
3603 Val = LHS->getOperand(0);
3604
3605 SDValue ShlRHS = LHS->getOperand(1);
3606 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3607 if (!ShlCnst) {
3608 // Shift amount must be constant
3609 return false;
3610 }
3611 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3612
3613 SDValue ShrRHS = RHS;
3614 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3615 if (!ShrCnst) {
3616 // Shift amount must be constant
3617 return false;
3618 }
3619 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3620
3621 // To avoid extra codegen and be profitable, we need Outer >= Inner
3622 if (OuterShiftAmt < InnerShiftAmt) {
3623 return false;
3624 }
3625
3626 // If the outer shift is more than the type size, we have no bitfield to
3627 // extract (since we also check that the inner shift is <= the outer shift
3628 // then this also implies that the inner shift is < the type size)
3629 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3630 return false;
3631 }
3632
3633 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3634 MVT::i32);
3635 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3636 DL, MVT::i32);
3637
3638 if (N->getOpcode() == ISD::SRA) {
3639 // If we have a arithmetic right shift, we need to use the signed bfe
3640 // variant
3641 IsSigned = true;
3642 }
3643 } else {
3644 // No can do...
3645 return false;
3646 }
3647 } else {
3648 // No can do...
3649 return false;
3650 }
3651
3652
3653 unsigned Opc;
3654 // For the BFE operations we form here from "and" and "srl", always use the
3655 // unsigned variants.
3656 if (Val.getValueType() == MVT::i32) {
3657 if (IsSigned) {
3658 Opc = NVPTX::BFE_S32rii;
3659 } else {
3660 Opc = NVPTX::BFE_U32rii;
3661 }
3662 } else if (Val.getValueType() == MVT::i64) {
3663 if (IsSigned) {
3664 Opc = NVPTX::BFE_S64rii;
3665 } else {
3666 Opc = NVPTX::BFE_U64rii;
3667 }
3668 } else {
3669 // We cannot handle this type
3670 return false;
3671 }
3672
3673 SDValue Ops[] = {
3674 Val, Start, Len
3675 };
3676
3677 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3678 return true;
3679}
3680
3681// SelectDirectAddr - Match a direct address for DAG.
3682// A direct address could be a globaladdress or externalsymbol.
// SelectDirectAddr - Match N as a direct address (target global address or
// target external symbol, possibly via an NVPTXISD::Wrapper or a
// generic-to-param addrspacecast of a MoveParam'd argument symbol). On
// success, Address receives the matched symbol value.
3683 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3684 // Return true if TGA or ES.
3685 if (N.getOpcode() == ISD::TargetGlobalAddress ||
3686 N.getOpcode() == ISD::TargetExternalSymbol) {
3687 Address = N;
3688 return true;
3689 }
3690 if (N.getOpcode() == NVPTXISD::Wrapper) {
3691 Address = N.getOperand(0);
3692 return true;
3693 }
3694 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3695 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3696 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
// NOTE(review): the rest of this condition (original lines 3697-3698 —
// presumably checking the destination address space and that the cast
// operand is a MoveParam node) was dropped by the extraction; restore it
// from the upstream file before compiling.
3699 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3700 }
3701 return false;
3702}
3703
3704// symbol+offset
3705bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3706 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3707 if (Addr.getOpcode() == ISD::ADD) {
3708 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3709 SDValue base = Addr.getOperand(0);
3710 if (SelectDirectAddr(base, Base)) {
3711 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3712 mvt);
3713 return true;
3714 }
3715 }
3716 }
3717 return false;
3718}
3719
3720// symbol+offset
// Match a symbol+offset address with a 32-bit offset operand; thin wrapper
// over SelectADDRsi_imp with MVT::i32.
// NOTE(review): the second signature line (original 3722, presumably
// `SDValue &Base, SDValue &Offset) {`) was dropped by the extraction.
3721 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3723 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3724 }
3725
3726// symbol+offset
// Match a symbol+offset address with a 64-bit offset operand; thin wrapper
// over SelectADDRsi_imp with MVT::i64.
// NOTE(review): the second signature line (original 3728, presumably
// `SDValue &Base, SDValue &Offset) {`) was dropped by the extraction.
3727 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3729 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3730 }
3731
3732// register+offset
3733bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3734 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3735 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3736 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3737 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3738 return true;
3739 }
3740 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3741 Addr.getOpcode() == ISD::TargetGlobalAddress)
3742 return false; // direct calls.
3743
3744 if (Addr.getOpcode() == ISD::ADD) {
3745 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3746 return false;
3747 }
3748 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3749 if (FrameIndexSDNode *FIN =
3750 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3751 // Constant offset from frame ref.
3752 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3753 else
3754 Base = Addr.getOperand(0);
3755
3756 // Offset must fit in a 32-bit signed int in PTX [register+offset] address
3757 // mode
3758 if (!CN->getAPIntValue().isSignedIntN(32))
3759 return false;
3760
3761 Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(OpNode),
3762 MVT::i32);
3763 return true;
3764 }
3765 }
3766 return false;
3767}
3768
3769// register+offset
3770bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3772 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3773}
3774
3775// register+offset
3776bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3778 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3779}
3780
3781bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3782 unsigned int spN) const {
3783 const Value *Src = nullptr;
3784 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3785 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3786 return true;
3787 Src = mN->getMemOperand()->getValue();
3788 }
3789 if (!Src)
3790 return false;
3791 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3792 return (PT->getAddressSpace() == spN);
3793 return false;
3794}
3795
3796/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3797/// inline asm expressions.
3799 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
3800 std::vector<SDValue> &OutOps) {
3801 SDValue Op0, Op1;
3802 switch (ConstraintID) {
3803 default:
3804 return true;
3805 case InlineAsm::ConstraintCode::m: // memory
3806 if (SelectDirectAddr(Op, Op0)) {
3807 OutOps.push_back(Op0);
3808 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3809 return false;
3810 }
3811 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3812 OutOps.push_back(Op0);
3813 OutOps.push_back(Op1);
3814 return false;
3815 }
3816 break;
3817 }
3818 return true;
3819}
3820
3821void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
3822 // Lower a CopyToReg with two 64-bit inputs
3823 // Dst:i128, lo:i64, hi:i64
3824 //
3825 // CopyToReg Dst, lo, hi;
3826 //
3827 // ==>
3828 //
3829 // tmp = V2I64toI128 {lo, hi};
3830 // CopyToReg Dst, tmp;
3831 SDValue Dst = N->getOperand(1);
3832 SDValue Lo = N->getOperand(2);
3833 SDValue Hi = N->getOperand(3);
3834
3835 SDLoc DL(N);
3836 SDNode *Mov =
3837 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
3838
3839 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
3840 NewOps[0] = N->getOperand(0);
3841 NewOps[1] = Dst;
3842 NewOps[2] = SDValue(Mov, 0);
3843 if (N->getNumOperands() == 5)
3844 NewOps[3] = N->getOperand(4);
3845 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
3846
3847 ReplaceNode(N, NewValue.getNode());
3848}
3849
3850void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
3851 // Lower CopyFromReg from a 128-bit regs to two 64-bit regs
3852 // Dst:i128, Src:i128
3853 //
3854 // {lo, hi} = CopyFromReg Src
3855 //
3856 // ==>
3857 //
3858 // {lo, hi} = I128toV2I64 Src
3859 //
3860 SDValue Ch = N->getOperand(0);
3861 SDValue Src = N->getOperand(1);
3862 SDValue Glue = N->getOperand(2);
3863 SDLoc DL(N);
3864
3865 // Add Glue and Ch to the operands and results to avoid break the execution
3866 // order
3868 NVPTX::I128toV2I64, DL,
3869 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
3870 {Src, Ch, Glue});
3871
3872 ReplaceNode(N, Mov);
3873}
3874
3875/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3876/// conversion from \p SrcTy to \p DestTy.
3877unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3878 LoadSDNode *LdNode) {
3879 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
3880 switch (SrcTy.SimpleTy) {
3881 default:
3882 llvm_unreachable("Unhandled source type");
3883 case MVT::i8:
3884 switch (DestTy.SimpleTy) {
3885 default:
3886 llvm_unreachable("Unhandled dest type");
3887 case MVT::i16:
3888 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3889 case MVT::i32:
3890 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3891 case MVT::i64:
3892 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3893 }
3894 case MVT::i16:
3895 switch (DestTy.SimpleTy) {
3896 default:
3897 llvm_unreachable("Unhandled dest type");
3898 case MVT::i8:
3899 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3900 case MVT::i32:
3901 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3902 case MVT::i64:
3903 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3904 }
3905 case MVT::i32:
3906 switch (DestTy.SimpleTy) {
3907 default:
3908 llvm_unreachable("Unhandled dest type");
3909 case MVT::i8:
3910 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3911 case MVT::i16:
3912 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3913 case MVT::i64:
3914 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3915 }
3916 case MVT::i64:
3917 switch (DestTy.SimpleTy) {
3918 default:
3919 llvm_unreachable("Unhandled dest type");
3920 case MVT::i8:
3921 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3922 case MVT::i16:
3923 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3924 case MVT::i32:
3925 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3926 }
3927 case MVT::f16:
3928 switch (DestTy.SimpleTy) {
3929 default:
3930 llvm_unreachable("Unhandled dest type");
3931 case MVT::f32:
3932 return NVPTX::CVT_f32_f16;
3933 case MVT::f64:
3934 return NVPTX::CVT_f64_f16;
3935 }
3936 }
3937}
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static const LLT F64
static const LLT F32
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
uint64_t Addr
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define getOpcodeForVectorStParam(n, ty, isimm)
static unsigned int getCodeAddrSpace(MemSDNode *N)
static int getLdStRegType(EVT VT)
static unsigned pickOpcodeForVectorStParam(SmallVector< SDValue, 8 > &Ops, unsigned NumElts, MVT::SimpleValueType MemTy, SelectionDAG *CurDAG, SDLoc DL)
#define getOpcodeForVectorStParamV2(ty, isimm)
static cl::opt< bool > EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden, cl::desc("Enable reciprocal sqrt optimization"))
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ)
static std::optional< unsigned > pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16, unsigned Opcode_i32, std::optional< unsigned > Opcode_i64, unsigned Opcode_f32, std::optional< unsigned > Opcode_f64)
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F)
if(VerifyEach)
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
#define PASS_NAME
Value * RHS
Value * LHS
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
This is an SDNode representing atomic operations.
const SDValue & getVal() const
const ConstantFP * getConstantFPValue() const
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:269
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
const ConstantInt * getConstantIntValue() const
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:410
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
This class is used to represent ISD::LOAD nodes.
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Machine Value Type.
SimpleValueType SimpleTy
bool isVector() const
Return true if this is a vector value type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
A description of a memory reference used in the backend.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
bool isVolatile() const
EVT getMemoryVT() const
Return the type of the in-memory value.
NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
bool SelectInlineAsmMemoryOperand(const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector< SDValue > &OutOps) override
SelectInlineAsmMemoryOperand - Implement addressing mode selection for inline asm expressions.
const NVPTXSubtarget * Subtarget
const NVPTXTargetLowering * getTargetLowering() const override
bool useF32FTZ(const MachineFunction &MF) const
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool allowUnsafeFPMath(MachineFunction &MF) const
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const SDValue & getOperand(unsigned Num) const
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
MachineFunction * MF
CodeGenOptLevel OptLevel
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
virtual bool runOnMachineFunction(MachineFunction &mf)
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:739
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT)
Definition: SelectionDAG.h:720
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:688
bool empty() const
Definition: SmallVector.h:94
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getValue() const
unsigned getPointerSizeInBits(unsigned AS) const
LLVM Value Representation.
Definition: Value.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1284
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1074
@ TargetExternalSymbol
Definition: ISDOpcodes.h:175
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1280
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ TargetGlobalAddress
TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or anything else with this node...
Definition: ISDOpcodes.h:170
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:937
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1578