//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

/// createNVPTXISelDag - This pass converts a legalized DAG into an
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOpt::Level OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}

char NVPTXDAGToDAGISel::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOpt::Level OptLevel)
    : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > 0);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;

  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  // NOTE: the long run of remaining NVPTXISD texture opcode cases is elided
  // from this listing; they all dispatch to the same handler.
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  // NOTE: the long run of remaining NVPTXISD surface opcode cases is likewise
  // elided from this listing.
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 immediates in .f16 ops, so we have to
// load them into an .f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
  if (N->getValueType(0) != MVT::f16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
  SDNode *LoadConstF16 =
      CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

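// As a hedged illustration (not taken from this file): selecting the f16
// constant 1.0 goes through LOAD_CONST_F16 and materializes the bit pattern
// into a register first, roughly
//   mov.b16 %h1, 0x3C00;   // 0x3C00 is 1.0 in IEEE half precision
// after which %h1 can feed ordinary .f16 arithmetic.
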
// Map an ISD::CondCode value to the appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode().
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}

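// Worked example (illustrative): getPTXCmpMode(ISD::SETOLT, /*FTZ=*/true)
// returns CmpMode::LT | FTZ_FLAG, which the instruction printer renders as
// the ".lt.ftz" suffix of the resulting setp instruction.
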
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about f16x2 as it's the only real vector type we
  // need to deal with.
  if (Vector.getSimpleValueType() != MVT::v2f16)
    return false;

  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  unsigned Op = NVPTX::SplitF16x2;
  // If the vector has been BITCAST'ed from i32, we can use the original
  // value directly and avoid a register-to-register move.
  SDValue Source = Vector;
  if (Vector->getOpcode() == ISD::BITCAST) {
    Op = NVPTX::SplitI32toF16x2;
    Source = Vector->getOperand(0);
  }
  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
  // into f16,f16 SplitF16x2(V)
  SDNode *ScatterOp =
      CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}

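// Conceptually (an illustrative sketch, not literal DAG syntax), the rewrite
// turns
//   %e0 = extract_vector_elt v2f16 %V, 0
//   %e1 = extract_vector_elt v2f16 %V, 1
// into one two-result machine node
//   (%e0, %e1) = SplitF16x2 %V
// so both halves are produced by a single mov.b32 {%h0, %h1}, ...; in PTX.
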
static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}

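// A CUDA-level sketch of the second inference case (hypothetical kernel, for
// illustration only):
//   __global__ void copy(const float *__restrict__ in, float *out) {
//     out[threadIdx.x] = in[threadIdx.x];
//   }
// Here 'in' is a kernel parameter that is noalias and never written through,
// so (when those attributes make it onto the IR argument) loads from it
// qualify for ld.global.nc even without explicit !invariant.load metadata.
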
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}

// Helper function template to reduce the amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f16,
                unsigned Opcode_f16x2, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
  case MVT::bf16:
    return Opcode_f16;
  case MVT::v2f16:
  case MVT::v2bf16:
    return Opcode_f16x2;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}

static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}

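// For example (descriptive only): getLdStRegType(MVT::f32) yields Float,
// getLdStRegType(MVT::v2f16) yields Untyped (f16 pairs travel in .b32
// registers), and all integer types yield Unsigned; signedness for
// sign-extending loads is applied separately by the callers below.
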
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16/v2bf16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}

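// Sketch of the end result (illustrative PTX, assuming a plain f32 load from
// the global space selected through the _avar form): the operand bundle
// built above - volatile flag, address space, vector arity, type and width -
// is folded into the mnemonic as
//   ld.global.f32 %f1, [gvar];
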
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have a ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }
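
  // Illustrative outcome of this special case: a v8f16 load is selected as a
  // single v4 load of i32 chunks, e.g.
  //   ld.global.v4.b32 {%r0, %r1, %r2, %r3}, [%rd1];
  // where each %rN carries one v2f16 pair.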

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v4_avar, NVPTX::LDV_i16_v4_avar,
                               NVPTX::LDV_i32_v4_avar, std::nullopt,
                               NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                               NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v4_asi, NVPTX::LDV_i16_v4_asi,
                               NVPTX::LDV_i32_v4_asi, std::nullopt,
                               NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                               NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v4_ari, NVPTX::LDV_i16_v4_ari,
                                 NVPTX::LDV_i32_v4_ari, std::nullopt,
                                 NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                                 NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, std::nullopt,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // vectors of f16 are loaded/stored as multiples of v2f16 elements.
    if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = MVT::v2f16;
      NumElts /= 2;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
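
  // Worked example (for illustration): an LDG of <4 x i8> builds the VT list
  // {i16, i16, i16, i16, Other} here, because each i8 result is promoted to
  // i16; NVPTX exposes no 8-bit registers.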
1304
1305 if (SelectDirectAddr(Op1, Addr)) {
1306 switch (N->getOpcode()) {
1307 default:
1308 return false;
1309 case ISD::LOAD:
1311 if (IsLDG)
1312 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1313 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1314 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1315 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1316 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1317 NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1318 NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1319 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1320 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1321 else
1322 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1323 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1324 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1325 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1326 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1327 NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1328 NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1329 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1330 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1331 break;
1332 case NVPTXISD::LoadV2:
1333 case NVPTXISD::LDGV2:
1334 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1335 NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1336 NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1337 NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1338 NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1339 NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1340 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1341 NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1342 NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1343 break;
1344 case NVPTXISD::LDUV2:
1345 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1346 NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1347 NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1348 NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1349 NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1350 NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1351 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1352 NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1353 NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1354 break;
1355 case NVPTXISD::LoadV4:
1356 case NVPTXISD::LDGV4:
1357 Opcode = pickOpcodeForVT(
1358 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1359 NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1360 NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1361 NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1362 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1363 NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1364 break;
1365 case NVPTXISD::LDUV4:
1366 Opcode = pickOpcodeForVT(
1367 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1368 NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1369 NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1370 NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1371 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1372 NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1373 break;
1374 }
1375 if (!Opcode)
1376 return false;
1377 SDValue Ops[] = { Addr, Chain };
1378 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1379 } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1380 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1381 if (TM.is64Bit()) {
1382 switch (N->getOpcode()) {
1383 default:
1384 return false;
1385 case ISD::LOAD:
1387 if (IsLDG)
1388 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1389 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1390 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1391 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1392 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1393 NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1394 NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1395 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1396 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1397 else
1398 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1399 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1400 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1401 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1402 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1403 NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1404 NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1405 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1406 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1407 break;
1408 case NVPTXISD::LoadV2:
1409 case NVPTXISD::LDGV2:
1410 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1411 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1412 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1413 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1414 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1415 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1416 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1417 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1418 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1419 break;
1420 case NVPTXISD::LDUV2:
1421 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1422 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1423 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1424 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1425 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1426 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1427 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1428 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1429 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1430 break;
1431 case NVPTXISD::LoadV4:
1432 case NVPTXISD::LDGV4:
1433 Opcode = pickOpcodeForVT(
1434 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1435 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1436 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1437 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1438 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1439 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1440 break;
1441 case NVPTXISD::LDUV4:
1442 Opcode = pickOpcodeForVT(
1443 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1444 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1445 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1446 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1447 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1448 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1449 break;
1450 }
1451 } else {
1452 switch (N->getOpcode()) {
1453 default:
1454 return false;
1455 case ISD::LOAD:
1457 if (IsLDG)
1458 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1459 NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1460 NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1461 NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1462 NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1463 NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1464 NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1465 NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1466 NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1467 else
1468 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1469 NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1470 NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1471 NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1472 NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1473 NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1474 NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1475 NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1476 NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1477 break;
1478 case NVPTXISD::LoadV2:
1479 case NVPTXISD::LDGV2:
1480 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1481 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1482 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1483 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1484 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1485 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1486 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1487 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1488 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1489 break;
1490 case NVPTXISD::LDUV2:
1491 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1492 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1493 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1494 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1495 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1496 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1497 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1498 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1499 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1500 break;
1501 case NVPTXISD::LoadV4:
1502 case NVPTXISD::LDGV4:
1503 Opcode = pickOpcodeForVT(
1504 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1505 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1506 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1507 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1508 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1509 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1510 break;
1511 case NVPTXISD::LDUV4:
1512 Opcode = pickOpcodeForVT(
1513 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1514 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1515 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1516 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1517 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1518 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1519 break;
1520 }
1521 }
1522 if (!Opcode)
1523 return false;
1524 SDValue Ops[] = {Base, Offset, Chain};
1525 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1526 } else {
1527 if (TM.is64Bit()) {
1528 switch (N->getOpcode()) {
1529 default:
1530 return false;
1531 case ISD::LOAD:
1533 if (IsLDG)
1534 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1535 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1536 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1537 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1538 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1539 NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1540 NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1541 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1542 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1543 else
1544 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1545 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1546 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1547 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1548 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1549 NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1550 NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1551 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1552 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1553 break;
1554 case NVPTXISD::LoadV2:
1555 case NVPTXISD::LDGV2:
1556 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1557 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1558 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1559 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1560 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1561 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1562 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1563 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1564 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1565 break;
1566 case NVPTXISD::LDUV2:
1567 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1568 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1569 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1570 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1571 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1572 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1573 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1574 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1575 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1576 break;
1577 case NVPTXISD::LoadV4:
1578 case NVPTXISD::LDGV4:
1579 Opcode = pickOpcodeForVT(
1580 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1581 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1582 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1583 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1584 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1585 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1586 break;
1587 case NVPTXISD::LDUV4:
1588 Opcode = pickOpcodeForVT(
1589 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1590 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1591 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1592 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1593 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1594 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1595 break;
1596 }
1597 } else {
1598 switch (N->getOpcode()) {
1599 default:
1600 return false;
1601 case ISD::LOAD:
1603 if (IsLDG)
1604 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1605 NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1606 NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1607 NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1608 NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1609 NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1610 NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1611 NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1612 NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1613 else
1614 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1615 NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1616 NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1617 NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1618 NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1619 NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1620 NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1621 NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1622 NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1623 break;
1624 case NVPTXISD::LoadV2:
1625 case NVPTXISD::LDGV2:
1626 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1627 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1628 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1629 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1630 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1631 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1632 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1633 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1634 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1635 break;
1636 case NVPTXISD::LDUV2:
1637 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1638 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1639 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1640 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1641 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1642 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1643 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1644 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1645 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1646 break;
1647 case NVPTXISD::LoadV4:
1648 case NVPTXISD::LDGV4:
1649 Opcode = pickOpcodeForVT(
1650 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1651 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1652 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1653 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1654 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1655 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1656 break;
1657 case NVPTXISD::LDUV4:
1658 Opcode = pickOpcodeForVT(
1659 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1660 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1661 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1662 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1663 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1664 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1665 break;
1666 }
1667 }
1668 if (!Opcode)
1669 return false;
1670 SDValue Ops[] = { Op1, Chain };
1671 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1672 }
1673
1674 // For automatic generation of LDG (through SelectLoad[Vector], not the
1675 // intrinsics), we may have an extending load like:
1676 //
1677 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1678 //
1679 // In this case, the matching logic above will select a load for the original
1680 // memory type (in this case, i8) and our types will not match (the node needs
1681 // to return an i32 in this case). Our LDG/LDU nodes do not support the
1682 // concept of sign-/zero-extension, so emulate it here by adding an explicit
1683 // CVT instruction. Ptxas should clean up any redundancies here.
1684
1685 EVT OrigType = N->getValueType(0);
1686 LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1687
1688 if (OrigType != EltVT && LdNode) {
1689 // We have an extending-load. The instruction we selected operates on the
1690 // smaller type, but the SDNode we are replacing has the larger type. We
1691 // need to emit a CVT to make the types match.
1692 bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
1693 unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
1694 EltVT.getSimpleVT(), IsSigned);
1695
1696 // For each output value, apply the manual sign/zero-extension and make sure
1697 // all users of the load go through that CVT.
1698 for (unsigned i = 0; i != NumElts; ++i) {
1699 SDValue Res(LD, i);
1700 SDValue OrigVal(N, i);
1701
1702 SDNode *CvtNode =
1703 CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1705 DL, MVT::i32));
1706 ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1707 }
1708 }
1709
1710 ReplaceNode(N, LD);
1711 return true;
1712}
1713
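// For the zext-from-i8 example in the comment above, the selected sequence
// would look roughly like (illustrative PTX; ptxas may simplify it):
//   ld.global.nc.u8 %rs1, [%rd1];
//   cvt.u32.u8      %r1, %rs1;   // the explicit CVT emitted by the loop above
// giving the i32 value the rest of the DAG expects.
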
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64, NVPTX::ST_f16_areg_64,
          NVPTX::ST_f16x2_areg_64, NVPTX::ST_f32_areg_64,
          NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType = getLdStRegType(ScalarVT);

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8f16 is a special case. PTX doesn't have a st.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // store them with st.v4.b32.
  if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }
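
  // Illustrative outcome, mirroring the load case: a v8f16 store is emitted
  // as one v4 store of i32 chunks, e.g.
  //   st.global.v4.b32 [%rd1], {%r0, %r1, %r2, %r3};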
1943
1944 StOps.push_back(getI32Imm(IsVolatile, DL));
1945 StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1946 StOps.push_back(getI32Imm(VecType, DL));
1947 StOps.push_back(getI32Imm(ToType, DL));
1948 StOps.push_back(getI32Imm(ToTypeWidth, DL));
1949
1950 if (SelectDirectAddr(N2, Addr)) {
1951 switch (N->getOpcode()) {
1952 default:
1953 return false;
1954 case NVPTXISD::StoreV2:
1955 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1956 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1957 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1958 NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
1959 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1960 break;
1961 case NVPTXISD::StoreV4:
1962 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1963 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1964 NVPTX::STV_i32_v4_avar, std::nullopt,
1965 NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
1966 NVPTX::STV_f32_v4_avar, std::nullopt);
1967 break;
1968 }
1969 StOps.push_back(Addr);
1970 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1971 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1972 switch (N->getOpcode()) {
1973 default:
1974 return false;
1975 case NVPTXISD::StoreV2:
1976 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1977 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1978 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1979 NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
1980 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1981 break;
1982 case NVPTXISD::StoreV4:
1983 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1984 NVPTX::STV_i8_v4_asi, NVPTX::STV_i16_v4_asi,
1985 NVPTX::STV_i32_v4_asi, std::nullopt,
1986 NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
1987 NVPTX::STV_f32_v4_asi, std::nullopt);
1988 break;
1989 }
1990 StOps.push_back(Base);
1991 StOps.push_back(Offset);
1992 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1993 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1994 if (PointerSize == 64) {
1995 switch (N->getOpcode()) {
1996 default:
1997 return false;
1998 case NVPTXISD::StoreV2:
1999 Opcode = pickOpcodeForVT(
2000 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
2001 NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
2002 NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
2003 NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
2004 NVPTX::STV_f64_v2_ari_64);
2005 break;
2006 case NVPTXISD::StoreV4:
2007 Opcode = pickOpcodeForVT(
2008 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2009 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
2010 NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
2011 NVPTX::STV_f32_v4_ari_64, std::nullopt);
2012 break;
2013 }
2014 } else {
2015 switch (N->getOpcode()) {
2016 default:
2017 return false;
2018 case NVPTXISD::StoreV2:
2019 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2020 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2021 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2022 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
2023 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2024 break;
2025 case NVPTXISD::StoreV4:
2026 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2027 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
2028 NVPTX::STV_i32_v4_ari, std::nullopt,
2029 NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
2030 NVPTX::STV_f32_v4_ari, std::nullopt);
2031 break;
2032 }
2033 }
2034 StOps.push_back(Base);
2035 StOps.push_back(Offset);
2036 } else {
2037 if (PointerSize == 64) {
2038 switch (N->getOpcode()) {
2039 default:
2040 return false;
2041 case NVPTXISD::StoreV2:
2042 Opcode = pickOpcodeForVT(
2043 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2044 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2045 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
2046 NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2047 NVPTX::STV_f64_v2_areg_64);
2048 break;
2049 case NVPTXISD::StoreV4:
2050 Opcode = pickOpcodeForVT(
2051 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2052 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
2053 NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
2054 NVPTX::STV_f32_v4_areg_64, std::nullopt);
2055 break;
2056 }
2057 } else {
2058 switch (N->getOpcode()) {
2059 default:
2060 return false;
2061 case NVPTXISD::StoreV2:
2062 Opcode =
2063 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2064 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2065 NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
2066 NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
2067 NVPTX::STV_f64_v2_areg);
2068 break;
2069 case NVPTXISD::StoreV4:
2070 Opcode = pickOpcodeForVT(
2071 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2072 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, std::nullopt,
2073 NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
2074 NVPTX::STV_f32_v4_areg, std::nullopt);
2075 break;
2076 }
2077 }
2078 StOps.push_back(N2);
2079 }
2080
2081 if (!Opcode)
2082 return false;
2083
2084 StOps.push_back(Chain);
2085
2086 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
2087
2088 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2089 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2090
2091 ReplaceNode(N, ST);
2092 return true;
2093}
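// Illustrative sketch (not part of the source): a v2 store of two f32 values
// whose address matched the register+immediate form on a 64-bit target would
// select NVPTX::STV_f32_v2_ari_64, which prints roughly as
//   st.global.v2.f32 [%rd1+8], {%f1, %f2};
// with the exact state-space suffix depending on the code address space.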
2094
2095bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2096 SDValue Chain = Node->getOperand(0);
2097 SDValue Offset = Node->getOperand(2);
2098 SDValue Glue = Node->getOperand(3);
2099 SDLoc DL(Node);
2100 MemSDNode *Mem = cast<MemSDNode>(Node);
2101
2102 unsigned VecSize;
2103 switch (Node->getOpcode()) {
2104 default:
2105 return false;
2106 case NVPTXISD::LoadParam:
2107 VecSize = 1;
2108 break;
2109 case NVPTXISD::LoadParamV2:
2110 VecSize = 2;
2111 break;
2112 case NVPTXISD::LoadParamV4:
2113 VecSize = 4;
2114 break;
2115 }
2116
2117 EVT EltVT = Node->getValueType(0);
2118 EVT MemVT = Mem->getMemoryVT();
2119
2120 std::optional<unsigned> Opcode;
2121
2122 switch (VecSize) {
2123 default:
2124 return false;
2125 case 1:
2126 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2127 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2128 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2129 NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2130 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2131 break;
2132 case 2:
2133 Opcode =
2134 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2135 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2136 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2137 NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2138 NVPTX::LoadParamMemV2F64);
2139 break;
2140 case 4:
2141 Opcode = pickOpcodeForVT(
2142 MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2143 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, std::nullopt,
2144 NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2145 NVPTX::LoadParamMemV4F32, std::nullopt);
2146 break;
2147 }
2148 if (!Opcode)
2149 return false;
2150
2151 SDVTList VTs;
2152 if (VecSize == 1) {
2153 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2154 } else if (VecSize == 2) {
2155 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2156 } else {
2157 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2158 VTs = CurDAG->getVTList(EVTs);
2159 }
2160
2161 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2162
2163 SmallVector<SDValue, 2> Ops;
2164 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2165 Ops.push_back(Chain);
2166 Ops.push_back(Glue);
2167
2168 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2169 return true;
2170}
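// Illustrative sketch: reading back a scalar i32 call result selects
// NVPTX::LoadParamMemI32, which corresponds approximately to
//   ld.param.b32 %r1, [retval0+0];
// where the selected offset operand supplies the "+0" displacement.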
2171
2172bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2173 SDLoc DL(N);
2174 SDValue Chain = N->getOperand(0);
2175 SDValue Offset = N->getOperand(1);
2176 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2177 MemSDNode *Mem = cast<MemSDNode>(N);
2178
2179 // How many elements do we have?
2180 unsigned NumElts = 1;
2181 switch (N->getOpcode()) {
2182 default:
2183 return false;
2184 case NVPTXISD::StoreRetval:
2185 NumElts = 1;
2186 break;
2187 case NVPTXISD::StoreRetvalV2:
2188 NumElts = 2;
2189 break;
2190 case NVPTXISD::StoreRetvalV4:
2191 NumElts = 4;
2192 break;
2193 }
2194
2195 // Build vector of operands
2196 SmallVector<SDValue, 6> Ops;
2197 for (unsigned i = 0; i < NumElts; ++i)
2198 Ops.push_back(N->getOperand(i + 2));
2199 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2200 Ops.push_back(Chain);
2201
2202 // Determine target opcode
2203 // If we have an i1, use an 8-bit store. The lowering code in
2204 // NVPTXISelLowering will have already emitted an upcast.
2205 std::optional<unsigned> Opcode = 0;
2206 switch (NumElts) {
2207 default:
2208 return false;
2209 case 1:
2210 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2211 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2212 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2213 NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2214 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2215 break;
2216 case 2:
2217 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2218 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2219 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2220 NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2221 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2222 break;
2223 case 4:
2224 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2225 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2226 NVPTX::StoreRetvalV4I32, std::nullopt,
2227 NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2228 NVPTX::StoreRetvalV4F32, std::nullopt);
2229 break;
2230 }
2231 if (!Opcode)
2232 return false;
2233
2234 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2235 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2236 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2237
2238 ReplaceNode(N, Ret);
2239 return true;
2240}
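// Illustrative sketch: returning a single f32 selects NVPTX::StoreRetvalF32,
// printed along the lines of
//   st.param.f32 [func_retval0+0], %f1;
// while NumElts == 2/4 pick the V2/V4 variants with extra value operands.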
2241
2242bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2243 SDLoc DL(N);
2244 SDValue Chain = N->getOperand(0);
2245 SDValue Param = N->getOperand(1);
2246 unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2247 SDValue Offset = N->getOperand(2);
2248 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2249 MemSDNode *Mem = cast<MemSDNode>(N);
2250 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2251
2252 // How many elements do we have?
2253 unsigned NumElts = 1;
2254 switch (N->getOpcode()) {
2255 default:
2256 return false;
2257 case NVPTXISD::StoreParamU32:
2258 case NVPTXISD::StoreParamS32:
2259 case NVPTXISD::StoreParam:
2260 NumElts = 1;
2261 break;
2262 case NVPTXISD::StoreParamV2:
2263 NumElts = 2;
2264 break;
2265 case NVPTXISD::StoreParamV4:
2266 NumElts = 4;
2267 break;
2268 }
2269
2270 // Build vector of operands
2271 SmallVector<SDValue, 8> Ops;
2272 for (unsigned i = 0; i < NumElts; ++i)
2273 Ops.push_back(N->getOperand(i + 3));
2274 Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2275 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2276 Ops.push_back(Chain);
2277 Ops.push_back(Glue);
2278
2279 // Determine target opcode
2280 // If we have an i1, use an 8-bit store. The lowering code in
2281 // NVPTXISelLowering will have already emitted an upcast.
2282 std::optional<unsigned> Opcode = 0;
2283 switch (N->getOpcode()) {
2284 default:
2285 switch (NumElts) {
2286 default:
2287 return false;
2288 case 1:
2289 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2290 NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2291 NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2292 NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2293 NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2294 break;
2295 case 2:
2296 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2297 NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2298 NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2299 NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2300 NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2301 break;
2302 case 4:
2303 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2304 NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2305 NVPTX::StoreParamV4I32, std::nullopt,
2306 NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2307 NVPTX::StoreParamV4F32, std::nullopt);
2308 break;
2309 }
2310 if (!Opcode)
2311 return false;
2312 break;
2313 // Special case: if we have a sign-extend/zero-extend node, insert the
2314 // conversion instruction first, and use that as the value operand to
2315 // the selected StoreParam node.
2316 case NVPTXISD::StoreParamU32: {
2317 Opcode = NVPTX::StoreParamI32;
2318 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2319 MVT::i32);
2320 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2321 MVT::i32, Ops[0], CvtNone);
2322 Ops[0] = SDValue(Cvt, 0);
2323 break;
2324 }
2325 case NVPTXISD::StoreParamS32: {
2326 Opcode = NVPTX::StoreParamI32;
2327 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2328 MVT::i32);
2329 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2330 MVT::i32, Ops[0], CvtNone);
2331 Ops[0] = SDValue(Cvt, 0);
2332 break;
2333 }
2334 }
2335
2336 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2337 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2338 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2339 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2340
2341 ReplaceNode(N, Ret);
2342 return true;
2343}
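// Illustrative sketch: passing a scalar i32 argument selects
// NVPTX::StoreParamI32, roughly
//   st.param.b32 [param0+0], %r1;
// and the StoreParamU32/StoreParamS32 cases above first widen an i16 value
// with cvt.u32.u16 / cvt.s32.s16 before the 32-bit store.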
2344
2345bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2346 unsigned Opc = 0;
2347
2348 switch (N->getOpcode()) {
2349 default: return false;
2351 Opc = NVPTX::TEX_1D_F32_S32_RR;
2352 break;
2354 Opc = NVPTX::TEX_1D_F32_F32_RR;
2355 break;
2357 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2358 break;
2360 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2361 break;
2363 Opc = NVPTX::TEX_1D_S32_S32_RR;
2364 break;
2366 Opc = NVPTX::TEX_1D_S32_F32_RR;
2367 break;
2369 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2370 break;
2372 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2373 break;
2375 Opc = NVPTX::TEX_1D_U32_S32_RR;
2376 break;
2378 Opc = NVPTX::TEX_1D_U32_F32_RR;
2379 break;
2381 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2382 break;
2384 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2385 break;
2387 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2388 break;
2390 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2391 break;
2393 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2394 break;
2396 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2397 break;
2399 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2400 break;
2402 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2403 break;
2405 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2406 break;
2408 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2409 break;
2411 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2412 break;
2414 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2415 break;
2417 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2418 break;
2420 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2421 break;
2423 Opc = NVPTX::TEX_2D_F32_S32_RR;
2424 break;
2426 Opc = NVPTX::TEX_2D_F32_F32_RR;
2427 break;
2429 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2430 break;
2432 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2433 break;
2435 Opc = NVPTX::TEX_2D_S32_S32_RR;
2436 break;
2438 Opc = NVPTX::TEX_2D_S32_F32_RR;
2439 break;
2441 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2442 break;
2444 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2445 break;
2447 Opc = NVPTX::TEX_2D_U32_S32_RR;
2448 break;
2450 Opc = NVPTX::TEX_2D_U32_F32_RR;
2451 break;
2453 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2454 break;
2456 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2457 break;
2459 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2460 break;
2462 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2463 break;
2465 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2466 break;
2468 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2469 break;
2471 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2472 break;
2474 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2475 break;
2477 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2478 break;
2480 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2481 break;
2483 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2484 break;
2486 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2487 break;
2489 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2490 break;
2492 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2493 break;
2495 Opc = NVPTX::TEX_3D_F32_S32_RR;
2496 break;
2498 Opc = NVPTX::TEX_3D_F32_F32_RR;
2499 break;
2501 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2502 break;
2504 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2505 break;
2507 Opc = NVPTX::TEX_3D_S32_S32_RR;
2508 break;
2510 Opc = NVPTX::TEX_3D_S32_F32_RR;
2511 break;
2513 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2514 break;
2516 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2517 break;
2519 Opc = NVPTX::TEX_3D_U32_S32_RR;
2520 break;
2522 Opc = NVPTX::TEX_3D_U32_F32_RR;
2523 break;
2525 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2526 break;
2528 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2529 break;
2531 Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2532 break;
2534 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2535 break;
2537 Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2538 break;
2540 Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2541 break;
2543 Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2544 break;
2546 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2547 break;
2549 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2550 break;
2552 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2553 break;
2555 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2556 break;
2558 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2559 break;
2561 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2562 break;
2564 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2565 break;
2567 Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2568 break;
2570 Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2571 break;
2573 Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2574 break;
2576 Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2577 break;
2579 Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2580 break;
2582 Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2583 break;
2585 Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2586 break;
2588 Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2589 break;
2591 Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2592 break;
2594 Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2595 break;
2597 Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2598 break;
2600 Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2601 break;
2603 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2604 break;
2606 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2607 break;
2609 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2610 break;
2612 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2613 break;
2615 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2616 break;
2618 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2619 break;
2621 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2622 break;
2624 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2625 break;
2627 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2628 break;
2630 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2631 break;
2633 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2634 break;
2636 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2637 break;
2639 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2640 break;
2642 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2643 break;
2645 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2646 break;
2648 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2649 break;
2651 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2652 break;
2654 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2655 break;
2657 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2658 break;
2660 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2661 break;
2663 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2664 break;
2666 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2667 break;
2669 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2670 break;
2672 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2673 break;
2675 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2676 break;
2678 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2679 break;
2681 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2682 break;
2684 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2685 break;
2687 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2688 break;
2690 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2691 break;
2693 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2694 break;
2696 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2697 break;
2699 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2700 break;
2702 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2703 break;
2705 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2706 break;
2708 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2709 break;
2711 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2712 break;
2714 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2715 break;
2717 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2718 break;
2720 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2721 break;
2723 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2724 break;
2726 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2727 break;
2729 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2730 break;
2732 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2733 break;
2735 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2736 break;
2738 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2739 break;
2741 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2742 break;
2744 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2745 break;
2747 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2748 break;
2750 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
2751 break;
2753 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
2754 break;
2756 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
2757 break;
2759 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
2760 break;
2762 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
2763 break;
2765 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
2766 break;
2768 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
2769 break;
2771 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
2772 break;
2774 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
2775 break;
2777 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
2778 break;
2780 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
2781 break;
2783 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
2784 break;
2786 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
2787 break;
2789 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
2790 break;
2792 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
2793 break;
2795 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
2796 break;
2798 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
2799 break;
2801 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
2802 break;
2804 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
2805 break;
2807 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
2808 break;
2810 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
2811 break;
2813 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
2814 break;
2816 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
2817 break;
2819 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
2820 break;
2822 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
2823 break;
2825 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
2826 break;
2828 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
2829 break;
2831 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
2832 break;
2834 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
2835 break;
2837 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
2838 break;
2840 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
2841 break;
2843 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
2844 break;
2846 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
2847 break;
2849 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
2850 break;
2852 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
2853 break;
2854 }
2855
2856 // Copy over operands
2857 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
2858 Ops.push_back(N->getOperand(0)); // Move chain to the back.
2859
2860 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2861 return true;
2862}
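// Illustrative sketch: an opcode such as NVPTX::TEX_2D_F32_F32_RR corresponds
// approximately to
//   tex.2d.v4.f32.f32 {%f1, %f2, %f3, %f4}, [%rd_tex, %rd_smp, {%f5, %f6}];
// where the _RR suffix indicates the texture and sampler arrive in registers.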
2863
2864bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2865 unsigned Opc = 0;
2866 switch (N->getOpcode()) {
2867 default: return false;
2869 Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2870 break;
2872 Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2873 break;
2875 Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2876 break;
2878 Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2879 break;
2881 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2882 break;
2884 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2885 break;
2887 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2888 break;
2890 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2891 break;
2893 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2894 break;
2896 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2897 break;
2899 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2900 break;
2902 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2903 break;
2905 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2906 break;
2908 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2909 break;
2911 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2912 break;
2914 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
2915 break;
2917 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
2918 break;
2920 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
2921 break;
2923 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
2924 break;
2926 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
2927 break;
2929 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
2930 break;
2932 Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
2933 break;
2935 Opc = NVPTX::SULD_2D_I8_CLAMP_R;
2936 break;
2938 Opc = NVPTX::SULD_2D_I16_CLAMP_R;
2939 break;
2941 Opc = NVPTX::SULD_2D_I32_CLAMP_R;
2942 break;
2944 Opc = NVPTX::SULD_2D_I64_CLAMP_R;
2945 break;
2947 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
2948 break;
2950 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
2951 break;
2953 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
2954 break;
2956 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
2957 break;
2959 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
2960 break;
2962 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
2963 break;
2965 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
2966 break;
2968 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
2969 break;
2971 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
2972 break;
2974 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
2975 break;
2977 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
2978 break;
2980 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
2981 break;
2983 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
2984 break;
2986 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
2987 break;
2989 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
2990 break;
2992 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
2993 break;
2995 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
2996 break;
2998 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
2999 break;
3001 Opc = NVPTX::SULD_3D_I8_CLAMP_R;
3002 break;
3004 Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3005 break;
3007 Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3008 break;
3010 Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3011 break;
3013 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3014 break;
3016 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3017 break;
3019 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3020 break;
3022 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3023 break;
3025 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3026 break;
3028 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3029 break;
3031 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3032 break;
3034 Opc = NVPTX::SULD_1D_I8_TRAP_R;
3035 break;
3037 Opc = NVPTX::SULD_1D_I16_TRAP_R;
3038 break;
3040 Opc = NVPTX::SULD_1D_I32_TRAP_R;
3041 break;
3043 Opc = NVPTX::SULD_1D_I64_TRAP_R;
3044 break;
3046 Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3047 break;
3049 Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3050 break;
3052 Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3053 break;
3055 Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3056 break;
3058 Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3059 break;
3061 Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3062 break;
3064 Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3065 break;
3067 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3068 break;
3070 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3071 break;
3073 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3074 break;
3076 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3077 break;
3079 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3080 break;
3082 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3083 break;
3085 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3086 break;
3088 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3089 break;
3091 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3092 break;
3094 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3095 break;
3097 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3098 break;
3100 Opc = NVPTX::SULD_2D_I8_TRAP_R;
3101 break;
3103 Opc = NVPTX::SULD_2D_I16_TRAP_R;
3104 break;
3106 Opc = NVPTX::SULD_2D_I32_TRAP_R;
3107 break;
3109 Opc = NVPTX::SULD_2D_I64_TRAP_R;
3110 break;
3112 Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3113 break;
3115 Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3116 break;
3118 Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3119 break;
3121 Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3122 break;
3124 Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3125 break;
3127 Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3128 break;
3130 Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3131 break;
3133 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3134 break;
3136 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3137 break;
3139 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3140 break;
3142 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3143 break;
3145 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3146 break;
3148 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3149 break;
3151 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3152 break;
3154 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3155 break;
3157 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3158 break;
3160 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3161 break;
3163 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3164 break;
3166 Opc = NVPTX::SULD_3D_I8_TRAP_R;
3167 break;
3169 Opc = NVPTX::SULD_3D_I16_TRAP_R;
3170 break;
3172 Opc = NVPTX::SULD_3D_I32_TRAP_R;
3173 break;
3175 Opc = NVPTX::SULD_3D_I64_TRAP_R;
3176 break;
3178 Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3179 break;
3181 Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3182 break;
3184 Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3185 break;
3187 Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3188 break;
3190 Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3191 break;
3193 Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3194 break;
3196 Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3197 break;
3199 Opc = NVPTX::SULD_1D_I8_ZERO_R;
3200 break;
3202 Opc = NVPTX::SULD_1D_I16_ZERO_R;
3203 break;
3205 Opc = NVPTX::SULD_1D_I32_ZERO_R;
3206 break;
3208 Opc = NVPTX::SULD_1D_I64_ZERO_R;
3209 break;
3211 Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3212 break;
3214 Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3215 break;
3217 Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3218 break;
3220 Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3221 break;
3223 Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3224 break;
3226 Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3227 break;
3229 Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3230 break;
3232 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3233 break;
3235 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3236 break;
3238 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3239 break;
3241 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3242 break;
3244 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3245 break;
3247 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3248 break;
3250 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3251 break;
3253 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3254 break;
3256 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3257 break;
3259 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3260 break;
3262 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3263 break;
3265 Opc = NVPTX::SULD_2D_I8_ZERO_R;
3266 break;
3268 Opc = NVPTX::SULD_2D_I16_ZERO_R;
3269 break;
3271 Opc = NVPTX::SULD_2D_I32_ZERO_R;
3272 break;
3274 Opc = NVPTX::SULD_2D_I64_ZERO_R;
3275 break;
3277 Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3278 break;
3280 Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3281 break;
3283 Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3284 break;
3286 Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3287 break;
3289 Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3290 break;
3292 Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3293 break;
3295 Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3296 break;
3298 Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3299 break;
3301 Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3302 break;
3304 Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3305 break;
3307 Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3308 break;
3310 Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3311 break;
3313 Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3314 break;
3316 Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3317 break;
3319 Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3320 break;
3322 Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3323 break;
3325 Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3326 break;
3328 Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3329 break;
3331 Opc = NVPTX::SULD_3D_I8_ZERO_R;
3332 break;
3334 Opc = NVPTX::SULD_3D_I16_ZERO_R;
3335 break;
3337 Opc = NVPTX::SULD_3D_I32_ZERO_R;
3338 break;
3340 Opc = NVPTX::SULD_3D_I64_ZERO_R;
3341 break;
3343 Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3344 break;
3346 Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3347 break;
3349 Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3350 break;
3352 Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3353 break;
3355 Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3356 break;
3358 Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3359 break;
3361 Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3362 break;
3363 }
3364
3365 // Copy over operands
3366 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3367 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3368
3369 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3370 return true;
3371}
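// Illustrative sketch: an opcode such as NVPTX::SULD_2D_I32_TRAP_R corresponds
// approximately to
//   suld.b.2d.b32.trap {%r1}, [%rd_surf, {%r2, %r3}];
// the _CLAMP/_TRAP/_ZERO families differ only in the out-of-range handling.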
3372
3373
3374/// tryBFE - Look for instruction sequences that can be made more efficient
3375/// by using the 'bfe' (bit-field extract) PTX instruction
3376bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3377 SDLoc DL(N);
3378 SDValue LHS = N->getOperand(0);
3379 SDValue RHS = N->getOperand(1);
3380 SDValue Len;
3381 SDValue Start;
3382 SDValue Val;
3383 bool IsSigned = false;
3384
3385 if (N->getOpcode() == ISD::AND) {
3386 // Canonicalize the operands
3387 // We want 'and %val, %mask'
3388 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3389 std::swap(LHS, RHS);
3390 }
3391
3392 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3393 if (!Mask) {
3394 // We need a constant mask on the RHS of the AND
3395 return false;
3396 }
3397
3398 // Extract the mask bits
3399 uint64_t MaskVal = Mask->getZExtValue();
3400 if (!isMask_64(MaskVal)) {
3401 // We *could* handle shifted masks here, but doing so would require an
3402 // 'and' operation to fix up the low-order bits so we would trade
3403 // shr+and for bfe+and, which has the same throughput
3404 return false;
3405 }
3406
3407 // How many bits are in our mask?
3408 uint64_t NumBits = llvm::countr_one(MaskVal);
3409 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3410
3411 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3412 // We have a 'srl/and' pair, extract the effective start bit and length
3413 Val = LHS.getNode()->getOperand(0);
3414 Start = LHS.getNode()->getOperand(1);
3415 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3416 if (StartConst) {
3417 uint64_t StartVal = StartConst->getZExtValue();
3418 // How many "good" bits do we have left? "good" is defined here as bits
3419 // that exist in the original value, not shifted in.
3420 uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3421 if (NumBits > GoodBits) {
3422 // Do not handle the case where bits have been shifted in. In theory
3423 // we could handle this, but the cost is likely higher than just
3424 // emitting the srl/and pair.
3425 return false;
3426 }
3427 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3428 } else {
3429 // Do not handle the case where the shift amount (can be zero if no srl
3430 // was found) is not constant. We could handle this case, but it would
3431 // require run-time logic that would be more expensive than just
3432 // emitting the srl/and pair.
3433 return false;
3434 }
3435 } else {
3436 // Do not handle the case where the LHS of the and is not a shift. While
3437 // it would be trivial to handle this case, it would just transform
3438 // 'and' -> 'bfe', but 'and' has higher throughput.
3439 return false;
3440 }
3441 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3442 if (LHS->getOpcode() == ISD::AND) {
3443 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3444 if (!ShiftCnst) {
3445 // Shift amount must be constant
3446 return false;
3447 }
3448
3449 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3450
3451 SDValue AndLHS = LHS->getOperand(0);
3452 SDValue AndRHS = LHS->getOperand(1);
3453
3454 // Canonicalize the AND to have the mask on the RHS
3455 if (isa<ConstantSDNode>(AndLHS)) {
3456 std::swap(AndLHS, AndRHS);
3457 }
3458
3459 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3460 if (!MaskCnst) {
3461 // Mask must be constant
3462 return false;
3463 }
3464
3465 uint64_t MaskVal = MaskCnst->getZExtValue();
3466 uint64_t NumZeros;
3467 uint64_t NumBits;
3468 if (isMask_64(MaskVal)) {
3469 NumZeros = 0;
3470 // The number of bits in the result bitfield will be the number of
3471 // trailing ones (the AND) minus the number of bits we shift off
3472 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3473 } else if (isShiftedMask_64(MaskVal)) {
3474 NumZeros = llvm::countr_zero(MaskVal);
3475 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3476 // The number of bits in the result bitfield will be the number of
3477 // trailing zeros plus the number of set bits in the mask minus the
3478 // number of bits we shift off
3479 NumBits = NumZeros + NumOnes - ShiftAmt;
3480 } else {
3481 // This is not a mask we can handle
3482 return false;
3483 }
3484
3485 if (ShiftAmt < NumZeros) {
3486 // Handling this case would require extra logic that would make this
3487 // transformation non-profitable
3488 return false;
3489 }
3490
3491 Val = AndLHS;
3492 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3493 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3494 } else if (LHS->getOpcode() == ISD::SHL) {
3495 // Here, we have a pattern like:
3496 //
3497 // (sra (shl val, NN), MM)
3498 // or
3499 // (srl (shl val, NN), MM)
3500 //
3501 // If MM >= NN, we can efficiently optimize this with bfe
3502 Val = LHS->getOperand(0);
3503
3504 SDValue ShlRHS = LHS->getOperand(1);
3505 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3506 if (!ShlCnst) {
3507 // Shift amount must be constant
3508 return false;
3509 }
3510 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3511
3512 SDValue ShrRHS = RHS;
3513 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3514 if (!ShrCnst) {
3515 // Shift amount must be constant
3516 return false;
3517 }
3518 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3519
3520 // To avoid extra codegen and be profitable, we need Outer >= Inner
3521 if (OuterShiftAmt < InnerShiftAmt) {
3522 return false;
3523 }
3524
3525 // If the outer shift is more than the type size, we have no bitfield to
3526 // extract (since we also check that the inner shift is <= the outer
3527 // shift, this also implies that the inner shift is < the type size).
3528 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3529 return false;
3530 }
3531
3532 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3533 MVT::i32);
3534 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3535 DL, MVT::i32);
3536
3537 if (N->getOpcode() == ISD::SRA) {
3538 // If we have an arithmetic right shift, we need to use the signed bfe
3539 // variant
3540 IsSigned = true;
3541 }
3542 } else {
3543 // No can do...
3544 return false;
3545 }
3546 } else {
3547 // No can do...
3548 return false;
3549 }
3550
3551
3552 unsigned Opc;
3553 // For the BFE operations we form here from "and" and "srl", always use the
3554 // unsigned variants.
3555 if (Val.getValueType() == MVT::i32) {
3556 if (IsSigned) {
3557 Opc = NVPTX::BFE_S32rii;
3558 } else {
3559 Opc = NVPTX::BFE_U32rii;
3560 }
3561 } else if (Val.getValueType() == MVT::i64) {
3562 if (IsSigned) {
3563 Opc = NVPTX::BFE_S64rii;
3564 } else {
3565 Opc = NVPTX::BFE_U64rii;
3566 }
3567 } else {
3568 // We cannot handle this type
3569 return false;
3570 }
3571
3572 SDValue Ops[] = {
3573 Val, Start, Len
3574 };
3575
3576 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3577 return true;
3578}
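// Illustrative sketch of the first pattern above: for an i32 value %v,
//   %t = srl %v, 8
//   %m = and %t, 255    // isMask_64(255), countr_one(255) == 8
// yields Start = 8, Len = 8 and selects NVPTX::BFE_U32rii, i.e. roughly
//   bfe.u32 %r, %v, 8, 8;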
3579
3580// SelectDirectAddr - Match a direct address for DAG.
3581// A direct address could be a globaladdress or externalsymbol.
3582bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3583 // Return true if TGA or ES.
3584 if (N.getOpcode() == ISD::TargetGlobalAddress ||
3585 N.getOpcode() == ISD::TargetExternalSymbol) {
3586 Address = N;
3587 return true;
3588 }
3589 if (N.getOpcode() == NVPTXISD::Wrapper) {
3590 Address = N.getOperand(0);
3591 return true;
3592 }
3593 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3594 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3595 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3596 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3597 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3598 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3599 }
3600 return false;
3601}
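// Illustrative sketch: both a bare TargetGlobalAddress and
// NVPTXISD::Wrapper(TargetGlobalAddress) match here, so a global @g can be
// used directly as a symbolic address operand (e.g. "[g]").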
3602
3603// symbol+offset
3604bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3605 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3606 if (Addr.getOpcode() == ISD::ADD) {
3607 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3608 SDValue base = Addr.getOperand(0);
3609 if (SelectDirectAddr(base, Base)) {
3610 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3611 mvt);
3612 return true;
3613 }
3614 }
3615 }
3616 return false;
3617}
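// Illustrative sketch: an address of the form (add Wrapper(@g), 16) matches
// here with Base = @g and Offset = 16, enabling "[g+16]" style operands.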
3618
3619// symbol+offset
3620bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3621 SDValue &Base, SDValue &Offset) {
3622 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3623}
3624
3625// symbol+offset
3626bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3627 SDValue &Base, SDValue &Offset) {
3628 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3629}
3630
3631// register+offset
3632bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3633 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3634 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3635 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3636 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3637 return true;
3638 }
3639 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3640 Addr.getOpcode() == ISD::TargetGlobalAddress)
3641 return false; // direct calls.
3642
3643 if (Addr.getOpcode() == ISD::ADD) {
3644 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3645 return false;
3646 }
3647 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3648 if (FrameIndexSDNode *FIN =
3649 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3650 // Constant offset from frame ref.
3651 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3652 else
3653 Base = Addr.getOperand(0);
3654 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3655 mvt);
3656 return true;
3657 }
3658 }
3659 return false;
3660}
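// Illustrative sketch: (add %ptr, 4) with a constant right operand yields
// Base = %ptr and Offset = 4 (an "[%rd1+4]" operand), while a frame index
// becomes a stack-slot base with an offset of 0.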
3661
3662// register+offset
3663bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3664 SDValue &Base, SDValue &Offset) {
3665 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3666}
3667
3668// register+offset
3669bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3670 SDValue &Base, SDValue &Offset) {
3671 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3672}
3673
3674bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3675 unsigned int spN) const {
3676 const Value *Src = nullptr;
3677 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3678 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3679 return true;
3680 Src = mN->getMemOperand()->getValue();
3681 }
3682 if (!Src)
3683 return false;
3684 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3685 return (PT->getAddressSpace() == spN);
3686 return false;
3687}
3688
3689/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3690/// inline asm expressions.
3691bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3692 const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3693 SDValue Op0, Op1;
3694 switch (ConstraintID) {
3695 default:
3696 return true;
3697 case InlineAsm::Constraint_m: // memory
3698 if (SelectDirectAddr(Op, Op0)) {
3699 OutOps.push_back(Op0);
3700 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3701 return false;
3702 }
3703 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3704 OutOps.push_back(Op0);
3705 OutOps.push_back(Op1);
3706 return false;
3707 }
3708 break;
3709 }
3710 return true;
3711}
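// Illustrative sketch: for an "m"-constrained inline asm operand, a direct
// symbol is returned as the symbol plus a zero offset, and anything else
// falls back to the register+immediate form selected by SelectADDRri above.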
3712
3713/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3714/// conversion from \p SrcTy to \p DestTy.
3715unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3716 bool IsSigned) {
3717 switch (SrcTy.SimpleTy) {
3718 default:
3719 llvm_unreachable("Unhandled source type");
3720 case MVT::i8:
3721 switch (DestTy.SimpleTy) {
3722 default:
3723 llvm_unreachable("Unhandled dest type");
3724 case MVT::i16:
3725 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3726 case MVT::i32:
3727 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3728 case MVT::i64:
3729 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3730 }
3731 case MVT::i16:
3732 switch (DestTy.SimpleTy) {
3733 default:
3734 llvm_unreachable("Unhandled dest type");
3735 case MVT::i8:
3736 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3737 case MVT::i32:
3738 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3739 case MVT::i64:
3740 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3741 }
3742 case MVT::i32:
3743 switch (DestTy.SimpleTy) {
3744 default:
3745 llvm_unreachable("Unhandled dest type");
3746 case MVT::i8:
3747 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3748 case MVT::i16:
3749 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3750 case MVT::i64:
3751 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3752 }
3753 case MVT::i64:
3754 switch (DestTy.SimpleTy) {
3755 default:
3756 llvm_unreachable("Unhandled dest type");
3757 case MVT::i8:
3758 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3759 case MVT::i16:
3760 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3761 case MVT::i32:
3762 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3763 }
3764 }
3765}
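// Illustrative sketch: GetConvertOpcode(MVT::i32, MVT::i16, /*IsSigned=*/true)
// returns NVPTX::CVT_s32_s16, i.e. a "cvt.s32.s16" PTX conversion.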