1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXISelDAGToDAG.h"
14 #include "NVPTXUtilities.h"
15 #include "llvm/Analysis/ValueTracking.h"
16 #include "llvm/IR/GlobalValue.h"
17 #include "llvm/IR/Instructions.h"
18 #include "llvm/IR/IntrinsicInst.h"
19 #include "llvm/Support/AtomicOrdering.h"
20 #include "llvm/Support/Debug.h"
21 #include "llvm/Support/ErrorHandling.h"
22 #include "llvm/Support/raw_ostream.h"
23 #include "llvm/Target/TargetIntrinsicInfo.h"
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "nvptx-isel"
28 
29 /// createNVPTXISelDag - This pass converts a legalized DAG into a
30 /// NVPTX-specific DAG, ready for instruction scheduling.
31 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
32 llvm::CodeGenOpt::Level OptLevel) {
33  return new NVPTXDAGToDAGISel(TM, OptLevel);
34 }
35 
36 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
37 CodeGenOpt::Level OptLevel)
38  : SelectionDAGISel(tm, OptLevel), TM(tm) {
39  doMulWide = (OptLevel > 0);
40 }
41 
42 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
43 Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
44 return SelectionDAGISel::runOnMachineFunction(MF);
45 }
46 
47 int NVPTXDAGToDAGISel::getDivF32Level() const {
48 return Subtarget->getTargetLowering()->getDivF32Level();
49 }
50 
51 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
52 return Subtarget->getTargetLowering()->usePrecSqrtF32();
53 }
54 
55 bool NVPTXDAGToDAGISel::useF32FTZ() const {
56 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
57 }
58 
59 bool NVPTXDAGToDAGISel::allowFMA() const {
60 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
61 return TL->allowFMA(*MF, OptLevel);
62 }
63 
64 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
65 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
66 return TL->allowUnsafeFPMath(*MF);
67 }
68 
69 bool NVPTXDAGToDAGISel::useShortPointers() const {
70  return TM.useShortPointers();
71 }
72 
73 /// Select - Select instructions not customized! Used for
74 /// expanded, promoted and normal instructions.
75 void NVPTXDAGToDAGISel::Select(SDNode *N) {
76 
77  if (N->isMachineOpcode()) {
78  N->setNodeId(-1);
79  return; // Already selected.
80  }
81 
82  switch (N->getOpcode()) {
83  case ISD::LOAD:
84  case ISD::ATOMIC_LOAD:
85  if (tryLoad(N))
86  return;
87  break;
88  case ISD::STORE:
89  case ISD::ATOMIC_STORE:
90  if (tryStore(N))
91  return;
92  break;
93 case ISD::EXTRACT_VECTOR_ELT:
94 if (tryEXTRACT_VECTOR_ELEMENT(N))
95  return;
96  break;
97 case NVPTXISD::SETP_F16X2:
98 SelectSETP_F16X2(N);
99  return;
100 
101  case NVPTXISD::LoadV2:
102  case NVPTXISD::LoadV4:
103  if (tryLoadVector(N))
104  return;
105  break;
106  case NVPTXISD::LDGV2:
107  case NVPTXISD::LDGV4:
108  case NVPTXISD::LDUV2:
109  case NVPTXISD::LDUV4:
110  if (tryLDGLDU(N))
111  return;
112  break;
113  case NVPTXISD::StoreV2:
114  case NVPTXISD::StoreV4:
115  if (tryStoreVector(N))
116  return;
117  break;
118  case NVPTXISD::LoadParam:
119 case NVPTXISD::LoadParamV2:
120 case NVPTXISD::LoadParamV4:
121 if (tryLoadParam(N))
122  return;
123  break;
124 case NVPTXISD::StoreRetval:
125 case NVPTXISD::StoreRetvalV2:
126 case NVPTXISD::StoreRetvalV4:
127 if (tryStoreRetval(N))
128  return;
129  break;
130 case NVPTXISD::StoreParam:
131 case NVPTXISD::StoreParamV2:
132 case NVPTXISD::StoreParamV4:
133 case NVPTXISD::StoreParamS32:
134 case NVPTXISD::StoreParamU32:
135 if (tryStoreParam(N))
136  return;
137  break;
138 case ISD::INTRINSIC_WO_CHAIN:
139 if (tryIntrinsicNoChain(N))
140  return;
141  break;
142 case ISD::INTRINSIC_W_CHAIN:
143 if (tryIntrinsicChain(N))
144  return;
145  break;
314  if (tryTextureIntrinsic(N))
315  return;
316  break;
482  if (trySurfaceIntrinsic(N))
483  return;
484  break;
485  case ISD::AND:
486  case ISD::SRA:
487  case ISD::SRL:
488  // Try to select BFE
489  if (tryBFE(N))
490  return;
491  break;
492  case ISD::ADDRSPACECAST:
493  SelectAddrSpaceCast(N);
494  return;
495  case ISD::ConstantFP:
496  if (tryConstantFP16(N))
497  return;
498  break;
499  default:
500  break;
501  }
502  SelectCode(N);
503 }
504 
505 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
506  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
507  switch (IID) {
508  default:
509  return false;
510  case Intrinsic::nvvm_ldg_global_f:
511  case Intrinsic::nvvm_ldg_global_i:
512  case Intrinsic::nvvm_ldg_global_p:
513  case Intrinsic::nvvm_ldu_global_f:
514  case Intrinsic::nvvm_ldu_global_i:
515  case Intrinsic::nvvm_ldu_global_p:
516  return tryLDGLDU(N);
517  }
518 }
519 
520 // There's no way to specify FP16 immediates in .f16 ops, so we have to
521 // load them into an .f16 register first.
522 bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
523  if (N->getValueType(0) != MVT::f16)
524  return false;
525 SDValue Val = CurDAG->getTargetConstantFP(
526 cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
527  SDNode *LoadConstF16 =
528  CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
529  ReplaceNode(N, LoadConstF16);
530  return true;
531 }
532 
533 // Map an ISD::CondCode value to the corresponding CmpMode expected by
534 // NVPTXInstPrinter::printCmpMode().
535 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
536 using NVPTX::PTXCmpMode::CmpMode;
537 unsigned PTXCmpMode = [](ISD::CondCode CC) {
538  switch (CC) {
539  default:
540  llvm_unreachable("Unexpected condition code.");
541  case ISD::SETOEQ:
542  return CmpMode::EQ;
543  case ISD::SETOGT:
544  return CmpMode::GT;
545  case ISD::SETOGE:
546  return CmpMode::GE;
547  case ISD::SETOLT:
548  return CmpMode::LT;
549  case ISD::SETOLE:
550  return CmpMode::LE;
551  case ISD::SETONE:
552  return CmpMode::NE;
553  case ISD::SETO:
554  return CmpMode::NUM;
555  case ISD::SETUO:
556  return CmpMode::NotANumber;
557  case ISD::SETUEQ:
558  return CmpMode::EQU;
559  case ISD::SETUGT:
560  return CmpMode::GTU;
561  case ISD::SETUGE:
562  return CmpMode::GEU;
563  case ISD::SETULT:
564  return CmpMode::LTU;
565  case ISD::SETULE:
566  return CmpMode::LEU;
567  case ISD::SETUNE:
568  return CmpMode::NEU;
569  case ISD::SETEQ:
570  return CmpMode::EQ;
571  case ISD::SETGT:
572  return CmpMode::GT;
573  case ISD::SETGE:
574  return CmpMode::GE;
575  case ISD::SETLT:
576  return CmpMode::LT;
577  case ISD::SETLE:
578  return CmpMode::LE;
579  case ISD::SETNE:
580  return CmpMode::NE;
581  }
582  }(CondCode.get());
583 
584  if (FTZ)
585  PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
586 
587  return PTXCmpMode;
588 }
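// Example of the mapping (assumed, for illustration only): SETOGE compiled
// with FTZ enabled yields CmpMode::GE | FTZ_FLAG, which the instruction
// printer renders as the "ge.ftz" predicate in something like
//   setp.ge.ftz.f32 %p1, %f1, %f2;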
589 
590 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
591  unsigned PTXCmpMode =
592  getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
593  SDLoc DL(N);
594  SDNode *SetP = CurDAG->getMachineNode(
595  NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
596  N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
597  ReplaceNode(N, SetP);
598  return true;
599 }
600 
601 // Find all instances of extract_vector_elt that use this v2f16 vector
602 // and coalesce them into a scattering move instruction.
603 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
604  SDValue Vector = N->getOperand(0);
605 
606  // We only care about f16x2 as it's the only real vector type we
607  // need to deal with.
608  if (Vector.getSimpleValueType() != MVT::v2f16)
609  return false;
610 
611  // Find and record all uses of this vector that extract element 0 or 1.
612 SmallVector<SDNode *, 4> E0, E1;
613 for (const auto &U : Vector.getNode()->uses()) {
614  if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
615  continue;
616  if (U->getOperand(0) != Vector)
617  continue;
618  if (const ConstantSDNode *IdxConst =
619  dyn_cast<ConstantSDNode>(U->getOperand(1))) {
620  if (IdxConst->getZExtValue() == 0)
621  E0.push_back(U);
622  else if (IdxConst->getZExtValue() == 1)
623  E1.push_back(U);
624  else
625  llvm_unreachable("Invalid vector index.");
626  }
627  }
628 
629  // There's no point scattering f16x2 if we only ever access one
630  // element of it.
631  if (E0.empty() || E1.empty())
632  return false;
633 
634  unsigned Op = NVPTX::SplitF16x2;
635  // If the vector has been BITCAST'ed from i32, we can use original
636  // value directly and avoid register-to-register move.
637  SDValue Source = Vector;
638  if (Vector->getOpcode() == ISD::BITCAST) {
639  Op = NVPTX::SplitI32toF16x2;
640  Source = Vector->getOperand(0);
641  }
642  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
643  // into f16,f16 SplitF16x2(V)
644  SDNode *ScatterOp =
645  CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
646  for (auto *Node : E0)
647  ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
648  for (auto *Node : E1)
649  ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
650 
651  return true;
652 }
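// Sketch of the rewrite (register names assumed): the two extracts
//   %lo = extractelement <2 x half> %v, i32 0
//   %hi = extractelement <2 x half> %v, i32 1
// collapse into one machine node with two f16 results, printed roughly as
//   mov.b32 {%h1, %h2}, %hh1;
// rather than as two separate moves out of the packed register.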
653 
654 static unsigned int getCodeAddrSpace(MemSDNode *N) {
655  const Value *Src = N->getMemOperand()->getValue();
656 
657 if (!Src)
658 return NVPTX::PTXLdStInstCode::GENERIC;
659 
660  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
661  switch (PT->getAddressSpace()) {
662 case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
663 case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
664 case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
665 case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
666 case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
667 case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
668 default: break;
669  }
670  }
671 return NVPTX::PTXLdStInstCode::GENERIC;
672 }
673 
674 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
675 unsigned CodeAddrSpace, MachineFunction *F) {
676  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
677  // space.
678  //
679  // We have two ways of identifying invariant loads: Loads may be explicitly
680  // marked as invariant, or we may infer them to be invariant.
681  //
682  // We currently infer invariance for loads from
683  // - constant global variables, and
684  // - kernel function pointer params that are noalias (i.e. __restrict) and
685  // never written to.
686  //
687  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
688  // not during the SelectionDAG phase).
689  //
690  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
691  // explicitly invariant loads because these are how clang tells us to use ldg
692  // when the user uses a builtin.
693  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
694  return false;
695 
696  if (N->isInvariant())
697  return true;
698 
699  bool IsKernelFn = isKernelFunction(F->getFunction());
700 
701  // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
702  // because the former looks through phi nodes while the latter does not. We
703  // need to look through phi nodes to handle pointer induction variables.
704 SmallVector<Value *, 8> Objs;
705 GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
706  Objs, F->getDataLayout());
707 
708  return all_of(Objs, [&](Value *V) {
709  if (auto *A = dyn_cast<const Argument>(V))
710  return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
711  if (auto *GV = dyn_cast<const GlobalVariable>(V))
712  return GV->isConstant();
713  return false;
714  });
715 }
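// Illustrative case (assumed source, not from this file): in a kernel like
//   __global__ void k(const float * __restrict__ in, float *out) { ... }
// a read of in[i] meets the conditions above (kernel param, noalias, never
// written), so it can be emitted as the cached, non-coherent form
//   ld.global.nc.f32 %f1, [%rd1];
// instead of a plain ld.global.f32.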
716 
717 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
718  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
719  switch (IID) {
720  default:
721  return false;
722  case Intrinsic::nvvm_texsurf_handle_internal:
723  SelectTexSurfHandle(N);
724  return true;
725  }
726 }
727 
728 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
729  // Op 0 is the intrinsic ID
730  SDValue Wrapper = N->getOperand(1);
731  SDValue GlobalVal = Wrapper.getOperand(0);
732  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
733  MVT::i64, GlobalVal));
734 }
735 
736 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
737  SDValue Src = N->getOperand(0);
738  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
739  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
740  unsigned DstAddrSpace = CastN->getDestAddressSpace();
741  assert(SrcAddrSpace != DstAddrSpace &&
742  "addrspacecast must be between different address spaces");
743 
744  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
745  // Specific to generic
746  unsigned Opc;
747  switch (SrcAddrSpace) {
748  default: report_fatal_error("Bad address space in addrspacecast");
749 case ADDRESS_SPACE_GLOBAL:
750 Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
751  break;
752 case ADDRESS_SPACE_SHARED:
753 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
754  : NVPTX::cvta_shared_yes_64)
755  : NVPTX::cvta_shared_yes;
756  break;
757  case ADDRESS_SPACE_CONST:
758  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
759  : NVPTX::cvta_const_yes_64)
760  : NVPTX::cvta_const_yes;
761  break;
762  case ADDRESS_SPACE_LOCAL:
763  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
764  : NVPTX::cvta_local_yes_64)
765  : NVPTX::cvta_local_yes;
766  break;
767  }
768 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
769 Src));
770  return;
771  } else {
772  // Generic to specific
773  if (SrcAddrSpace != 0)
774  report_fatal_error("Cannot cast between two non-generic address spaces");
775  unsigned Opc;
776  switch (DstAddrSpace) {
777  default: report_fatal_error("Bad address space in addrspacecast");
778 case ADDRESS_SPACE_GLOBAL:
779 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
780  : NVPTX::cvta_to_global_yes;
781  break;
782 case ADDRESS_SPACE_SHARED:
783 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
784  : NVPTX::cvta_to_shared_yes_64)
785  : NVPTX::cvta_to_shared_yes;
786  break;
787  case ADDRESS_SPACE_CONST:
788  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
789  : NVPTX::cvta_to_const_yes_64)
790  : NVPTX::cvta_to_const_yes;
791  break;
792  case ADDRESS_SPACE_LOCAL:
793  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
794  : NVPTX::cvta_to_local_yes_64)
795  : NVPTX::cvta_to_local_yes;
796  break;
797  case ADDRESS_SPACE_PARAM:
798  Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
799  : NVPTX::nvvm_ptr_gen_to_param;
800  break;
801  }
802 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
803 Src));
804  return;
805  }
806 }
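// For example (opcode choice illustrated under assumed conditions): on a
// 64-bit target without short pointers, a shared->generic addrspacecast
// selects cvta_shared_yes_64 and prints as
//   cvta.shared.u64 %rd2, %rd1;
// while the generic->shared direction selects cvta_to_shared_yes_64, i.e.
//   cvta.to.shared.u64 %rd2, %rd1;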
807 
808 // Helper function template to reduce amount of boilerplate code for
809 // opcode selection.
810 static Optional<unsigned> pickOpcodeForVT(
811 MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
812  unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
813  unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
814  switch (VT) {
815  case MVT::i1:
816  case MVT::i8:
817  return Opcode_i8;
818  case MVT::i16:
819  return Opcode_i16;
820  case MVT::i32:
821  return Opcode_i32;
822  case MVT::i64:
823  return Opcode_i64;
824  case MVT::f16:
825  return Opcode_f16;
826  case MVT::v2f16:
827  return Opcode_f16x2;
828  case MVT::f32:
829  return Opcode_f32;
830  case MVT::f64:
831  return Opcode_f64;
832  default:
833  return None;
834  }
835 }
836 
837 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
838  SDLoc dl(N);
839  MemSDNode *LD = cast<MemSDNode>(N);
840  assert(LD->readMem() && "Expected load");
841  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
842  EVT LoadedVT = LD->getMemoryVT();
843  SDNode *NVPTXLD = nullptr;
844 
845  // do not support pre/post inc/dec
846  if (PlainLoad && PlainLoad->isIndexed())
847  return false;
848 
849  if (!LoadedVT.isSimple())
850  return false;
851 
852  AtomicOrdering Ordering = LD->getOrdering();
853  // In order to lower atomic loads with stronger guarantees we would need to
854  // use load.acquire or insert fences. However these features were only added
855  // with PTX ISA 6.0 / sm_70.
856  // TODO: Check if we can actually use the new instructions and implement them.
857  if (isStrongerThanMonotonic(Ordering))
858  return false;
859 
860  // Address Space Setting
861  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
862  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
863  return tryLDGLDU(N);
864  }
865 
866 unsigned int PointerSize =
867 CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
868 
869  // Volatile Setting
870  // - .volatile is only available for .global and .shared
871  // - .volatile has the same memory synchronization semantics as .relaxed.sys
872  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
873  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
874  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
875  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
876  isVolatile = false;
877 
878  // Type Setting: fromType + fromTypeWidth
879  //
880  // Sign : ISD::SEXTLOAD
881  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
882  // type is integer
883  // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
884  MVT SimpleVT = LoadedVT.getSimpleVT();
885  MVT ScalarVT = SimpleVT.getScalarType();
886  // Read at least 8 bits (predicates are stored as 8-bit values)
887  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
888  unsigned int fromType;
889 
890  // Vector Setting
891  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
892  if (SimpleVT.isVector()) {
893  assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
894  // v2f16 is loaded using ld.b32
895  fromTypeWidth = 32;
896  }
897 
898  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
899 fromType = NVPTX::PTXLdStInstCode::Signed;
900 else if (ScalarVT.isFloatingPoint())
901 // f16 uses .b16 as its storage type.
902 fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
903 : NVPTX::PTXLdStInstCode::Float;
904 else
905 fromType = NVPTX::PTXLdStInstCode::Unsigned;
906 
907  // Create the machine instruction DAG
908  SDValue Chain = N->getOperand(0);
909  SDValue N1 = N->getOperand(1);
910 SDValue Addr;
911 SDValue Offset, Base;
912 Optional<unsigned> Opcode;
913 MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
914 
915  if (SelectDirectAddr(N1, Addr)) {
916  Opcode = pickOpcodeForVT(
917  TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
918  NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
919  NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
920  if (!Opcode)
921  return false;
922  SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
923  getI32Imm(vecType, dl), getI32Imm(fromType, dl),
924  getI32Imm(fromTypeWidth, dl), Addr, Chain };
925  NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
926  MVT::Other, Ops);
927  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
928  : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
929  Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
930  NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
931  NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
932  NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
933  if (!Opcode)
934  return false;
935  SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
936  getI32Imm(vecType, dl), getI32Imm(fromType, dl),
937  getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
938  NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
939  MVT::Other, Ops);
940  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
941  : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
942  if (PointerSize == 64)
943  Opcode = pickOpcodeForVT(
944  TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
945  NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
946  NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
947  else
948  Opcode = pickOpcodeForVT(
949  TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
950  NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
951  NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
952  if (!Opcode)
953  return false;
954  SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
955  getI32Imm(vecType, dl), getI32Imm(fromType, dl),
956  getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
957  NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
958  MVT::Other, Ops);
959  } else {
960  if (PointerSize == 64)
961  Opcode = pickOpcodeForVT(
962  TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
963  NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
964  NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
965  NVPTX::LD_f64_areg_64);
966  else
967  Opcode = pickOpcodeForVT(
968  TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
969  NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
970  NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
971  if (!Opcode)
972  return false;
973  SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
974  getI32Imm(vecType, dl), getI32Imm(fromType, dl),
975  getI32Imm(fromTypeWidth, dl), N1, Chain };
976  NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
977  MVT::Other, Ops);
978  }
979 
980  if (!NVPTXLD)
981  return false;
982 
983  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
984  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
985 
986  ReplaceNode(N, NVPTXLD);
987  return true;
988 }
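// End-to-end sketch (assumed example): a volatile i32 load from a global
// pointer takes one of the _avar/_asi/_ari/_areg forms above; the five
// immediate operands (volatile, address space, vector arity, type, width)
// are what the printer expands into
//   ld.volatile.global.u32 %r1, [%rd1];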
989 
990 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
991 
992  SDValue Chain = N->getOperand(0);
993  SDValue Op1 = N->getOperand(1);
994  SDValue Addr, Offset, Base;
995  Optional<unsigned> Opcode;
996  SDLoc DL(N);
997  SDNode *LD;
998  MemSDNode *MemSD = cast<MemSDNode>(N);
999  EVT LoadedVT = MemSD->getMemoryVT();
1000 
1001  if (!LoadedVT.isSimple())
1002  return false;
1003 
1004  // Address Space Setting
1005  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
1006  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1007  return tryLDGLDU(N);
1008  }
1009 
1010 unsigned int PointerSize =
1011 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1012 
1013  // Volatile Setting
1014 // - .volatile is only available for .global and .shared
1015  bool IsVolatile = MemSD->isVolatile();
1016  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1017  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1018  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1019  IsVolatile = false;
1020 
1021  // Vector Setting
1022  MVT SimpleVT = LoadedVT.getSimpleVT();
1023 
1024  // Type Setting: fromType + fromTypeWidth
1025  //
1026  // Sign : ISD::SEXTLOAD
1027  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1028  // type is integer
1029  // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1030  MVT ScalarVT = SimpleVT.getScalarType();
1031  // Read at least 8 bits (predicates are stored as 8-bit values)
1032  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
1033  unsigned int FromType;
1034  // The last operand holds the original LoadSDNode::getExtensionType() value
1035  unsigned ExtensionType = cast<ConstantSDNode>(
1036  N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1037  if (ExtensionType == ISD::SEXTLOAD)
1038  FromType = NVPTX::PTXLdStInstCode::Signed;
1039  else if (ScalarVT.isFloatingPoint())
1040 FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1041 : NVPTX::PTXLdStInstCode::Float;
1042 else
1043 FromType = NVPTX::PTXLdStInstCode::Unsigned;
1044 
1045  unsigned VecType;
1046 
1047  switch (N->getOpcode()) {
1048  case NVPTXISD::LoadV2:
1049 VecType = NVPTX::PTXLdStInstCode::V2;
1050 break;
1051  case NVPTXISD::LoadV4:
1052 VecType = NVPTX::PTXLdStInstCode::V4;
1053 break;
1054  default:
1055  return false;
1056  }
1057 
1058  EVT EltVT = N->getValueType(0);
1059 
1060  // v8f16 is a special case. PTX doesn't have ld.v8.f16
1061  // instruction. Instead, we split the vector into v2f16 chunks and
1062  // load them with ld.v4.b32.
1063  if (EltVT == MVT::v2f16) {
1064  assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
1065  EltVT = MVT::i32;
1066 FromType = NVPTX::PTXLdStInstCode::Untyped;
1067 FromTypeWidth = 32;
1068  }
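// Illustrative effect of this rewrite (values assumed): a v8f16 load that
// was legalized to (LoadV4 v2f16, v2f16, v2f16, v2f16, ...) is selected as
// a single
//   ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
// with each 32-bit lane holding one packed f16 pair.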
1069 
1070  if (SelectDirectAddr(Op1, Addr)) {
1071  switch (N->getOpcode()) {
1072  default:
1073  return false;
1074  case NVPTXISD::LoadV2:
1075  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1076  NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1077  NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1078  NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
1079  NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1080  break;
1081  case NVPTXISD::LoadV4:
1082  Opcode =
1083  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1084  NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
1085  NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
1086  NVPTX::LDV_f32_v4_avar, None);
1087  break;
1088  }
1089  if (!Opcode)
1090  return false;
1091  SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1092  getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1093  getI32Imm(FromTypeWidth, DL), Addr, Chain };
1094  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1095  } else if (PointerSize == 64
1096  ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
1097  : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1098  switch (N->getOpcode()) {
1099  default:
1100  return false;
1101  case NVPTXISD::LoadV2:
1102  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1103  NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1104  NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1105  NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
1106  NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1107  break;
1108  case NVPTXISD::LoadV4:
1109  Opcode =
1110  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1111  NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
1112  NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
1113  NVPTX::LDV_f32_v4_asi, None);
1114  break;
1115  }
1116  if (!Opcode)
1117  return false;
1118  SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1119  getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1120  getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1121  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1122  } else if (PointerSize == 64
1123  ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1124  : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1125  if (PointerSize == 64) {
1126  switch (N->getOpcode()) {
1127  default:
1128  return false;
1129  case NVPTXISD::LoadV2:
1130  Opcode = pickOpcodeForVT(
1131  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
1132  NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
1133  NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
1134  NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
1135  NVPTX::LDV_f64_v2_ari_64);
1136  break;
1137  case NVPTXISD::LoadV4:
1138  Opcode = pickOpcodeForVT(
1139  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1140  NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
1141  NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
1142  NVPTX::LDV_f32_v4_ari_64, None);
1143  break;
1144  }
1145  } else {
1146  switch (N->getOpcode()) {
1147  default:
1148  return false;
1149  case NVPTXISD::LoadV2:
1150  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1151  NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1152  NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1153  NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
1154  NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1155  break;
1156  case NVPTXISD::LoadV4:
1157  Opcode =
1158  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1159  NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
1160  NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
1161  NVPTX::LDV_f32_v4_ari, None);
1162  break;
1163  }
1164  }
1165  if (!Opcode)
1166  return false;
1167  SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1168  getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1169  getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1170 
1171  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1172  } else {
1173  if (PointerSize == 64) {
1174  switch (N->getOpcode()) {
1175  default:
1176  return false;
1177  case NVPTXISD::LoadV2:
1178  Opcode = pickOpcodeForVT(
1179  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1180  NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1181  NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
1182  NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1183  NVPTX::LDV_f64_v2_areg_64);
1184  break;
1185  case NVPTXISD::LoadV4:
1186  Opcode = pickOpcodeForVT(
1187  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1188  NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
1189  NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
1190  NVPTX::LDV_f32_v4_areg_64, None);
1191  break;
1192  }
1193  } else {
1194  switch (N->getOpcode()) {
1195  default:
1196  return false;
1197  case NVPTXISD::LoadV2:
1198  Opcode =
1199  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1200  NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1201  NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
1202  NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
1203  NVPTX::LDV_f64_v2_areg);
1204  break;
1205  case NVPTXISD::LoadV4:
1206  Opcode = pickOpcodeForVT(
1207  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1208  NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
1209  NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
1210  NVPTX::LDV_f32_v4_areg, None);
1211  break;
1212  }
1213  }
1214  if (!Opcode)
1215  return false;
1216  SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1217  getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1218  getI32Imm(FromTypeWidth, DL), Op1, Chain };
1219  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1220  }
1221 
1222  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1223  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1224 
1225  ReplaceNode(N, LD);
1226  return true;
1227 }
1228 
1229 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1230 
1231  SDValue Chain = N->getOperand(0);
1232  SDValue Op1;
1233  MemSDNode *Mem;
1234  bool IsLDG = true;
1235 
1236 // If this is an LDG intrinsic, the address is the third operand. If it's an
1237 // LDG/LDU SD node (from custom vector handling), then it's the second operand.
1238  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1239  Op1 = N->getOperand(2);
1240  Mem = cast<MemIntrinsicSDNode>(N);
1241  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
1242  switch (IID) {
1243  default:
1244  return false;
1245  case Intrinsic::nvvm_ldg_global_f:
1246  case Intrinsic::nvvm_ldg_global_i:
1247  case Intrinsic::nvvm_ldg_global_p:
1248  IsLDG = true;
1249  break;
1250  case Intrinsic::nvvm_ldu_global_f:
1251  case Intrinsic::nvvm_ldu_global_i:
1252  case Intrinsic::nvvm_ldu_global_p:
1253  IsLDG = false;
1254  break;
1255  }
1256  } else {
1257  Op1 = N->getOperand(1);
1258  Mem = cast<MemSDNode>(N);
1259  }
1260 
1261  Optional<unsigned> Opcode;
1262  SDLoc DL(N);
1263  SDNode *LD;
1264  SDValue Base, Offset, Addr;
1265 
1266  EVT EltVT = Mem->getMemoryVT();
1267  unsigned NumElts = 1;
1268  if (EltVT.isVector()) {
1269  NumElts = EltVT.getVectorNumElements();
1270  EltVT = EltVT.getVectorElementType();
1271  // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1272  if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
1273  assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1274  EltVT = MVT::v2f16;
1275  NumElts /= 2;
1276  }
1277  }
1278 
1279  // Build the "promoted" result VTList for the load. If we are really loading
1280  // i8s, then the return type will be promoted to i16 since we do not expose
1281  // 8-bit registers in NVPTX.
1282  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1283  SmallVector<EVT, 5> InstVTs;
1284  for (unsigned i = 0; i != NumElts; ++i) {
1285  InstVTs.push_back(NodeVT);
1286  }
1287  InstVTs.push_back(MVT::Other);
1288  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
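// For instance (illustration only): an LDGV2 of <2 x i8> builds the VT list
// { i16, i16, Other } here -- each i8 result is widened to i16 because the
// backend exposes no 8-bit registers; the extra width is benign.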
1289 
1290  if (SelectDirectAddr(Op1, Addr)) {
1291  switch (N->getOpcode()) {
1292  default:
1293  return false;
1294  case ISD::LOAD:
1295 case ISD::ATOMIC_LOAD:
1296 if (IsLDG)
1297  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1298  NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1299  NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1300  NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1301  NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1302  NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1303  NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1304  NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1305  NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1306  else
1307  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1308  NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1309  NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1310  NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1311  NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1312  NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1313  NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1314  NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1315  NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1316  break;
1317  case NVPTXISD::LoadV2:
1318  case NVPTXISD::LDGV2:
1319  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1320  NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1321  NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1322  NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1323  NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1324  NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1325  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1326  NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1327  NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1328  break;
1329  case NVPTXISD::LDUV2:
1330  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1331  NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1332  NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1333  NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1334  NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1335  NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1336  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1337  NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1338  NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1339  break;
1340  case NVPTXISD::LoadV4:
1341  case NVPTXISD::LDGV4:
1342  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1343  NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1344  NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1345  NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
1346  NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1347  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1348  NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
1349  break;
1350  case NVPTXISD::LDUV4:
1351  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1352  NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1353  NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1354  NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
1355  NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1356  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1357  NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
1358  break;
1359  }
1360  if (!Opcode)
1361  return false;
1362  SDValue Ops[] = { Addr, Chain };
1363  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1364  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1365  : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1366  if (TM.is64Bit()) {
1367  switch (N->getOpcode()) {
1368  default:
1369  return false;
1370  case ISD::LOAD:
1371 case ISD::ATOMIC_LOAD:
1372 if (IsLDG)
1373  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1374  NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1375  NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1376  NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1377  NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1378  NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1379  NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1380  NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1381  NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1382  else
1383  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1384  NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1385  NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1386  NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1387  NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1388  NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1389  NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1390  NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1391  NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1392  break;
1393  case NVPTXISD::LoadV2:
1394  case NVPTXISD::LDGV2:
1395  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1396  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1397  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1398  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1399  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1400  NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1401  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1402  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1403  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1404  break;
1405  case NVPTXISD::LDUV2:
1406  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1407  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1408  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1409  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1410  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1411  NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1412  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1413  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1414  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1415  break;
1416  case NVPTXISD::LoadV4:
1417  case NVPTXISD::LDGV4:
1418  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1419  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1420  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1421  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
1422  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1423  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1424  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
1425  break;
1426  case NVPTXISD::LDUV4:
1427  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1428  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1429  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1430  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
1431  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1432  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1433  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
1434  break;
1435  }
1436  } else {
1437  switch (N->getOpcode()) {
1438  default:
1439  return false;
1440  case ISD::LOAD:
1441 case ISD::ATOMIC_LOAD:
1442 if (IsLDG)
1443  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1444  NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1445  NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1446  NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1447  NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1448  NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1449  NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1450  NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1451  NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1452  else
1453  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1454  NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1455  NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1456  NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1457  NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1458  NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1459  NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1460  NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1461  NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1462  break;
1463  case NVPTXISD::LoadV2:
1464  case NVPTXISD::LDGV2:
1465  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1466  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1467  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1468  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1469  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1470  NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1471  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1472  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1473  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1474  break;
1475  case NVPTXISD::LDUV2:
1476  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1477  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1478  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1479  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1480  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1481  NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1482  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1483  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1484  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1485  break;
1486  case NVPTXISD::LoadV4:
1487  case NVPTXISD::LDGV4:
1488  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1489  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1490  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1491  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
1492  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1493  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1494  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
1495  break;
1496  case NVPTXISD::LDUV4:
1497  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1498  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1499  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1500  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
1501  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1502  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1503  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
1504  break;
1505  }
1506  }
1507  if (!Opcode)
1508  return false;
1509  SDValue Ops[] = {Base, Offset, Chain};
1510  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1511  } else {
1512  if (TM.is64Bit()) {
1513  switch (N->getOpcode()) {
1514  default:
1515  return false;
1516  case ISD::LOAD:
1517 case ISD::ATOMIC_LOAD:
1518 if (IsLDG)
1519  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1520  NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1521  NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1522  NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1523  NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1524  NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1525  NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1526  NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1527  NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1528  else
1529  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1530  NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1531  NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1532  NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1533  NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1534  NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1535  NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1536  NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1537  NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1538  break;
1539  case NVPTXISD::LoadV2:
1540  case NVPTXISD::LDGV2:
1541  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1542  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1543  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1544  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1545  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1546  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1547  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1548  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1549  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1550  break;
1551  case NVPTXISD::LDUV2:
1552  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1553  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1554  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1555  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1556  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1557  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1558  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1559  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1560  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1561  break;
1562  case NVPTXISD::LoadV4:
1563  case NVPTXISD::LDGV4:
1564  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1565  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1566  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1567  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
1568  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1569  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1570  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
1571  break;
1572  case NVPTXISD::LDUV4:
1573  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1574  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1575  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1576  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
1577  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1578  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1579  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
1580  break;
1581  }
1582  } else {
1583  switch (N->getOpcode()) {
1584  default:
1585  return false;
1586  case ISD::LOAD:
1587 case ISD::ATOMIC_LOAD:
1588 if (IsLDG)
1589  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1590  NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1591  NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1592  NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1593  NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1594  NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1595  NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1596  NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1597  NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1598  else
1599  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1600  NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1601  NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1602  NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1603  NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1604  NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1605  NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1606  NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1607  NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1608  break;
1609  case NVPTXISD::LoadV2:
1610  case NVPTXISD::LDGV2:
1611  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1612  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1613  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1614  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1615  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1616  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1617  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1618  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1619  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1620  break;
1621  case NVPTXISD::LDUV2:
1622  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1623  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1624  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1625  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1626  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1627  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1628  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1629  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1630  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1631  break;
1632  case NVPTXISD::LoadV4:
1633  case NVPTXISD::LDGV4:
1634  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1635  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1636  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1637  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
1638  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1639  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1640  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
1641  break;
1642  case NVPTXISD::LDUV4:
1643  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1644  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1645  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1646  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
1647  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1648  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1649  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
1650  break;
1651  }
1652  }
1653  if (!Opcode)
1654  return false;
1655  SDValue Ops[] = { Op1, Chain };
1656  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1657  }
1658 
1659 MachineMemOperand *MemRef = Mem->getMemOperand();
1660 CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1661 
1662  // For automatic generation of LDG (through SelectLoad[Vector], not the
1663  // intrinsics), we may have an extending load like:
1664  //
1665  // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1666  //
1667  // In this case, the matching logic above will select a load for the original
1668  // memory type (in this case, i8) and our types will not match (the node needs
1669  // to return an i32 in this case). Our LDG/LDU nodes do not support the
1670  // concept of sign-/zero-extension, so emulate it here by adding an explicit
1671  // CVT instruction. Ptxas should clean up any redundancies here.
1672 
1673  EVT OrigType = N->getValueType(0);
1674  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1675 
1676  if (OrigType != EltVT && LdNode) {
1677  // We have an extending-load. The instruction we selected operates on the
1678  // smaller type, but the SDNode we are replacing has the larger type. We
1679  // need to emit a CVT to make the types match.
1680  bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
1681  unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
1682  EltVT.getSimpleVT(), IsSigned);
1683 
1684  // For each output value, apply the manual sign/zero-extension and make sure
1685  // all users of the load go through that CVT.
1686  for (unsigned i = 0; i != NumElts; ++i) {
1687  SDValue Res(LD, i);
1688  SDValue OrigVal(N, i);
1689 
1690  SDNode *CvtNode =
1691  CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1692 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1693 DL, MVT::i32));
1694  ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1695  }
1696  }
1697 
1698  ReplaceNode(N, LD);
1699  return true;
1700 }
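// Sketch of the extending-load fixup above (assumed example): for
//   i32,ch = load<zext from i8> ...
// the selected LDG produces a (promoted) 16-bit value, so a convert such as
//   cvt.u32.u16 %r1, %rs1;
// is emitted and all users of the original i32 result are rewired to it.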
1701 
1702 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1703  SDLoc dl(N);
1704  MemSDNode *ST = cast<MemSDNode>(N);
1705  assert(ST->writeMem() && "Expected store");
1706  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1707  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1708  assert((PlainStore || AtomicStore) && "Expected store");
1709  EVT StoreVT = ST->getMemoryVT();
1710  SDNode *NVPTXST = nullptr;
1711 
1712  // do not support pre/post inc/dec
1713  if (PlainStore && PlainStore->isIndexed())
1714  return false;
1715 
1716  if (!StoreVT.isSimple())
1717  return false;
1718 
1719  AtomicOrdering Ordering = ST->getOrdering();
1720 // In order to lower atomic stores with stronger guarantees we would need to
1721  // use store.release or insert fences. However these features were only added
1722  // with PTX ISA 6.0 / sm_70.
1723  // TODO: Check if we can actually use the new instructions and implement them.
1724  if (isStrongerThanMonotonic(Ordering))
1725  return false;
1726 
1727  // Address Space Setting
1728  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1729 unsigned int PointerSize =
1730 CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1731 
1732  // Volatile Setting
1733  // - .volatile is only available for .global and .shared
1734  // - .volatile has the same memory synchronization semantics as .relaxed.sys
1735  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
1736  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1737  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1738  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1739  isVolatile = false;
1740 
1741  // Vector Setting
1742  MVT SimpleVT = StoreVT.getSimpleVT();
1743  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
1744 
1745  // Type Setting: toType + toTypeWidth
1746  // - for integer type, always use 'u'
1747  //
1748  MVT ScalarVT = SimpleVT.getScalarType();
1749  unsigned toTypeWidth = ScalarVT.getSizeInBits();
1750  if (SimpleVT.isVector()) {
1751  assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
1752  // v2f16 is stored using st.b32
1753  toTypeWidth = 32;
1754  }
1755 
1756  unsigned int toType;
1757  if (ScalarVT.isFloatingPoint())
1758  // f16 uses .b16 as its storage type.
1759 toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1760 : NVPTX::PTXLdStInstCode::Float;
1761 else
1762 toType = NVPTX::PTXLdStInstCode::Unsigned;
1763 
1764  // Create the machine instruction DAG
1765  SDValue Chain = ST->getChain();
1766  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1767  SDValue BasePtr = ST->getBasePtr();
1768  SDValue Addr;
1769  SDValue Offset, Base;
1770  Optional<unsigned> Opcode;
1771  MVT::SimpleValueType SourceVT =
1772  Value.getNode()->getSimpleValueType(0).SimpleTy;
1773 
1774  if (SelectDirectAddr(BasePtr, Addr)) {
1775  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1776  NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1777  NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
1778  NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1779  if (!Opcode)
1780  return false;
1781  SDValue Ops[] = {Value,
1782  getI32Imm(isVolatile, dl),
1783  getI32Imm(CodeAddrSpace, dl),
1784  getI32Imm(vecType, dl),
1785  getI32Imm(toType, dl),
1786  getI32Imm(toTypeWidth, dl),
1787  Addr,
1788  Chain};
1789  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1790  } else if (PointerSize == 64
1791  ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1792  : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1793  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1794  NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1795  NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
1796  NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1797  if (!Opcode)
1798  return false;
1799  SDValue Ops[] = {Value,
1800  getI32Imm(isVolatile, dl),
1801  getI32Imm(CodeAddrSpace, dl),
1802  getI32Imm(vecType, dl),
1803  getI32Imm(toType, dl),
1804  getI32Imm(toTypeWidth, dl),
1805  Base,
1806  Offset,
1807  Chain};
1808  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1809  } else if (PointerSize == 64
1810  ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1811  : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1812  if (PointerSize == 64)
1813  Opcode = pickOpcodeForVT(
1814  SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1815  NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
1816  NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1817  else
1818  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1819  NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1820  NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
1821  NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1822  if (!Opcode)
1823  return false;
1824 
1825  SDValue Ops[] = {Value,
1826  getI32Imm(isVolatile, dl),
1827  getI32Imm(CodeAddrSpace, dl),
1828  getI32Imm(vecType, dl),
1829  getI32Imm(toType, dl),
1830  getI32Imm(toTypeWidth, dl),
1831  Base,
1832  Offset,
1833  Chain};
1834  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1835  } else {
1836  if (PointerSize == 64)
1837  Opcode =
1838  pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1839  NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1840  NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
1841  NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1842  else
1843  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1844  NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1845  NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
1846  NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1847  if (!Opcode)
1848  return false;
1849  SDValue Ops[] = {Value,
1850  getI32Imm(isVolatile, dl),
1851  getI32Imm(CodeAddrSpace, dl),
1852  getI32Imm(vecType, dl),
1853  getI32Imm(toType, dl),
1854  getI32Imm(toTypeWidth, dl),
1855  BasePtr,
1856  Chain};
1857  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1858  }
1859 
1860  if (!NVPTXST)
1861  return false;
1862 
1863  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1864  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1865  ReplaceNode(N, NVPTXST);
1866  return true;
1867 }
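// Mirror of the load path (illustrative example): IR such as
//   store volatile float %f, float addrspace(3)* %p
// selects one of the ST_f32_* forms above and prints as
//   st.volatile.shared.f32 [%rd1], %f1;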
1868 
1869 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1870  SDValue Chain = N->getOperand(0);
1871  SDValue Op1 = N->getOperand(1);
1872  SDValue Addr, Offset, Base;
1873  Optional<unsigned> Opcode;
1874  SDLoc DL(N);
1875  SDNode *ST;
1876  EVT EltVT = Op1.getValueType();
1877  MemSDNode *MemSD = cast<MemSDNode>(N);
1878  EVT StoreVT = MemSD->getMemoryVT();
1879 
1880  // Address Space Setting
1881  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1882  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1883  report_fatal_error("Cannot store to pointer that points to constant "
1884  "memory space");
1885  }
1886 unsigned int PointerSize =
1887 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1888 
1889  // Volatile Setting
1890 // - .volatile is only available for .global and .shared
1891  bool IsVolatile = MemSD->isVolatile();
1892  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1893  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1894  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1895  IsVolatile = false;
1896 
1897  // Type Setting: toType + toTypeWidth
1898  // - for integer type, always use 'u'
1899  assert(StoreVT.isSimple() && "Store value is not simple");
1900  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1901  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1902  unsigned ToType;
1903  if (ScalarVT.isFloatingPoint())
1904 ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1905 : NVPTX::PTXLdStInstCode::Float;
1906 else
1907 ToType = NVPTX::PTXLdStInstCode::Unsigned;
1908 
1909 SmallVector<SDValue, 12> StOps;
1910 SDValue N2;
1911  unsigned VecType;
1912 
1913  switch (N->getOpcode()) {
1914  case NVPTXISD::StoreV2:
1915 VecType = NVPTX::PTXLdStInstCode::V2;
1916 StOps.push_back(N->getOperand(1));
1917  StOps.push_back(N->getOperand(2));
1918  N2 = N->getOperand(3);
1919  break;
1920  case NVPTXISD::StoreV4:
1921 VecType = NVPTX::PTXLdStInstCode::V4;
1922 StOps.push_back(N->getOperand(1));
1923  StOps.push_back(N->getOperand(2));
1924  StOps.push_back(N->getOperand(3));
1925  StOps.push_back(N->getOperand(4));
1926  N2 = N->getOperand(5);
1927  break;
1928  default:
1929  return false;
1930  }
1931 
1932  // v8f16 is a special case. PTX doesn't have st.v8.f16
1933  // instruction. Instead, we split the vector into v2f16 chunks and
1934  // store them with st.v4.b32.
1935  if (EltVT == MVT::v2f16) {
1936 assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
1937 EltVT = MVT::i32;
1938 ToType = NVPTX::PTXLdStInstCode::Untyped;
1939 ToTypeWidth = 32;
1940  }
1941 
1942  StOps.push_back(getI32Imm(IsVolatile, DL));
1943  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1944  StOps.push_back(getI32Imm(VecType, DL));
1945  StOps.push_back(getI32Imm(ToType, DL));
1946  StOps.push_back(getI32Imm(ToTypeWidth, DL));
1947 
1948  if (SelectDirectAddr(N2, Addr)) {
1949  switch (N->getOpcode()) {
1950  default:
1951  return false;
1952  case NVPTXISD::StoreV2:
1953  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1954  NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1955  NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1956  NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
1957  NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1958  break;
1959  case NVPTXISD::StoreV4:
1960  Opcode =
1961  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
1962  NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
1963  NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
1964  NVPTX::STV_f32_v4_avar, None);
1965  break;
1966  }
1967  StOps.push_back(Addr);
1968  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1969  : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1970  switch (N->getOpcode()) {
1971  default:
1972  return false;
1973  case NVPTXISD::StoreV2:
1974  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1975  NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1976  NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1977  NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
1978  NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1979  break;
1980  case NVPTXISD::StoreV4:
1981  Opcode =
1982  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1983  NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
1984  NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
1985  NVPTX::STV_f32_v4_asi, None);
1986  break;
1987  }
1988  StOps.push_back(Base);
1989  StOps.push_back(Offset);
1990  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1991  : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1992  if (PointerSize == 64) {
1993  switch (N->getOpcode()) {
1994  default:
1995  return false;
1996  case NVPTXISD::StoreV2:
1997  Opcode = pickOpcodeForVT(
1998  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
1999  NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
2000  NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
2001  NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
2002  NVPTX::STV_f64_v2_ari_64);
2003  break;
2004  case NVPTXISD::StoreV4:
2005  Opcode = pickOpcodeForVT(
2006  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2007  NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
2008  NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
2009  NVPTX::STV_f32_v4_ari_64, None);
2010  break;
2011  }
2012  } else {
2013  switch (N->getOpcode()) {
2014  default:
2015  return false;
2016  case NVPTXISD::StoreV2:
2017  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2018  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2019  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2020  NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
2021  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2022  break;
2023  case NVPTXISD::StoreV4:
2024  Opcode =
2025  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
2026  NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
2027  NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
2028  NVPTX::STV_f32_v4_ari, None);
2029  break;
2030  }
2031  }
2032  StOps.push_back(Base);
2033  StOps.push_back(Offset);
2034  } else {
2035  if (PointerSize == 64) {
2036  switch (N->getOpcode()) {
2037  default:
2038  return false;
2039  case NVPTXISD::StoreV2:
2040  Opcode = pickOpcodeForVT(
2041  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2042  NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2043  NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
2044  NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2045  NVPTX::STV_f64_v2_areg_64);
2046  break;
2047  case NVPTXISD::StoreV4:
2048  Opcode = pickOpcodeForVT(
2049  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2050  NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
2051  NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
2052  NVPTX::STV_f32_v4_areg_64, None);
2053  break;
2054  }
2055  } else {
2056  switch (N->getOpcode()) {
2057  default:
2058  return false;
2059  case NVPTXISD::StoreV2:
2060  Opcode =
2061  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2062  NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2063  NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
2064  NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
2065  NVPTX::STV_f64_v2_areg);
2066  break;
2067  case NVPTXISD::StoreV4:
2068  Opcode =
2069  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2070  NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
2071  NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
2072  NVPTX::STV_f32_v4_areg, None);
2073  break;
2074  }
2075  }
2076  StOps.push_back(N2);
2077  }
2078 
2079  if (!Opcode)
2080  return false;
2081 
2082  StOps.push_back(Chain);
2083 
2084  ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);
2085 
2086  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2087  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2088 
2089  ReplaceNode(N, ST);
2090  return true;
2091 }
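// Illustrative operand layout for the machine node built above (a sketch, not
// the output of any specific compile): a StoreV2 of two f32 values with
// reg+imm addressing selects
//   STV_f32_v2_ari val0, val1, isVolatile, codeAddrSpace, VecType=V2,
//                  ToType=Float, ToTypeWidth=32, base, offset, chain
// which prints as, e.g., st.global.v2.f32 [%rd1+8], {%f1, %f2};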
2092 
2093 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2094  SDValue Chain = Node->getOperand(0);
2095  SDValue Offset = Node->getOperand(2);
2096  SDValue Flag = Node->getOperand(3);
2097  SDLoc DL(Node);
2098  MemSDNode *Mem = cast<MemSDNode>(Node);
2099 
2100  unsigned VecSize;
2101  switch (Node->getOpcode()) {
2102  default:
2103  return false;
2104  case NVPTXISD::LoadParam:
2105  VecSize = 1;
2106  break;
2107  case NVPTXISD::LoadParamV2:
2108  VecSize = 2;
2109  break;
2110  case NVPTXISD::LoadParamV4:
2111  VecSize = 4;
2112  break;
2113  }
2114 
2115  EVT EltVT = Node->getValueType(0);
2116  EVT MemVT = Mem->getMemoryVT();
2117 
2118  Optional<unsigned> Opcode;
2119 
2120  switch (VecSize) {
2121  default:
2122  return false;
2123  case 1:
2124  Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2125  NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2126  NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2127  NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2128  NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2129  break;
2130  case 2:
2131  Opcode =
2132  pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2133  NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2134  NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2135  NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2136  NVPTX::LoadParamMemV2F64);
2137  break;
2138  case 4:
2139  Opcode = pickOpcodeForVT(
2140  MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2141  NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
2142  NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2143  NVPTX::LoadParamMemV4F32, None);
2144  break;
2145  }
2146  if (!Opcode)
2147  return false;
2148 
2149  SDVTList VTs;
2150  if (VecSize == 1) {
2151  VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2152  } else if (VecSize == 2) {
2153  VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2154  } else {
2155  EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2156  VTs = CurDAG->getVTList(EVTs);
2157  }
2158 
2159  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2160 
2161  SmallVector<SDValue, 2> Ops;
2162  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2163  Ops.push_back(Chain);
2164  Ops.push_back(Flag);
2165 
2166  ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
2167  return true;
2168 }
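// For instance, a LoadParamV2 of two f32 results selects LoadParamMemV2F32
// with value types {f32, f32, Other, Glue} and operands {offset, chain, glue},
// matching the VT list and operand vector built above.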
2169 
2170 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2171  SDLoc DL(N);
2172  SDValue Chain = N->getOperand(0);
2173  SDValue Offset = N->getOperand(1);
2174  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2175  MemSDNode *Mem = cast<MemSDNode>(N);
2176 
2177  // How many elements do we have?
2178  unsigned NumElts = 1;
2179  switch (N->getOpcode()) {
2180  default:
2181  return false;
2182  case NVPTXISD::StoreRetval:
2183  NumElts = 1;
2184  break;
2185  case NVPTXISD::StoreRetvalV2:
2186  NumElts = 2;
2187  break;
2188  case NVPTXISD::StoreRetvalV4:
2189  NumElts = 4;
2190  break;
2191  }
2192 
2193  // Build vector of operands
2194  SmallVector<SDValue, 6> Ops;
2195  for (unsigned i = 0; i < NumElts; ++i)
2196  Ops.push_back(N->getOperand(i + 2));
2197  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2198  Ops.push_back(Chain);
2199 
2200  // Determine target opcode
2201  // If we have an i1, use an 8-bit store. The lowering code in
2202  // NVPTXISelLowering will have already emitted an upcast.
2203  Optional<unsigned> Opcode = 0;
2204  switch (NumElts) {
2205  default:
2206  return false;
2207  case 1:
2208  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2209  NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2210  NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2211  NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2212  NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2213  break;
2214  case 2:
2215  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2216  NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2217  NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2218  NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2219  NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2220  break;
2221  case 4:
2222  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2223  NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2224  NVPTX::StoreRetvalV4I32, None,
2225  NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2226  NVPTX::StoreRetvalV4F32, None);
2227  break;
2228  }
2229  if (!Opcode)
2230  return false;
2231 
2232  SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
2233  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2234  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2235 
2236  ReplaceNode(N, Ret);
2237  return true;
2238 }
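// For instance, returning a single f32 produces StoreRetval with offset 0,
// which selects StoreRetvalF32 with operands {val, 0, chain}; in the emitted
// PTX this appears as a st.param store to func_retval0 (shown for
// illustration only).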
2239 
2240 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2241  SDLoc DL(N);
2242  SDValue Chain = N->getOperand(0);
2243  SDValue Param = N->getOperand(1);
2244  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2245  SDValue Offset = N->getOperand(2);
2246  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2247  MemSDNode *Mem = cast<MemSDNode>(N);
2248  SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2249 
2250  // How many elements do we have?
2251  unsigned NumElts = 1;
2252  switch (N->getOpcode()) {
2253  default:
2254  return false;
2255  case NVPTXISD::StoreParamU32:
2256  case NVPTXISD::StoreParamS32:
2257  case NVPTXISD::StoreParam:
2258  NumElts = 1;
2259  break;
2260  case NVPTXISD::StoreParamV2:
2261  NumElts = 2;
2262  break;
2263  case NVPTXISD::StoreParamV4:
2264  NumElts = 4;
2265  break;
2266  }
2267 
2268  // Build vector of operands
2269  SmallVector<SDValue, 8> Ops;
2270  for (unsigned i = 0; i < NumElts; ++i)
2271  Ops.push_back(N->getOperand(i + 3));
2272  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2273  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2274  Ops.push_back(Chain);
2275  Ops.push_back(Flag);
2276 
2277  // Determine target opcode
2278  // If we have an i1, use an 8-bit store. The lowering code in
2279  // NVPTXISelLowering will have already emitted an upcast.
2280  Optional<unsigned> Opcode = 0;
2281  switch (N->getOpcode()) {
2282  default:
2283  switch (NumElts) {
2284  default:
2285  return false;
2286  case 1:
2287  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2288  NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2289  NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2290  NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2291  NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2292  break;
2293  case 2:
2294  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2295  NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2296  NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2297  NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2298  NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2299  break;
2300  case 4:
2301  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2302  NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2303  NVPTX::StoreParamV4I32, None,
2304  NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2305  NVPTX::StoreParamV4F32, None);
2306  break;
2307  }
2308  if (!Opcode)
2309  return false;
2310  break;
2311  // Special case: if we have a sign-extend/zero-extend node, insert the
2312  // conversion instruction first, and use that as the value operand to
2313  // the selected StoreParam node.
2314  case NVPTXISD::StoreParamU32: {
2315  Opcode = NVPTX::StoreParamI32;
2316  SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2317  MVT::i32);
2318  SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2319  MVT::i32, Ops[0], CvtNone);
2320  Ops[0] = SDValue(Cvt, 0);
2321  break;
2322  }
2323  case NVPTXISD::StoreParamS32: {
2324  Opcode = NVPTX::StoreParamI32;
2325  SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2326  MVT::i32);
2327  SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2328  MVT::i32, Ops[0], CvtNone);
2329  Ops[0] = SDValue(Cvt, 0);
2330  break;
2331  }
2332  }
2333 
2334  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2335  SDNode *Ret =
2336  CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
2337  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2338  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2339 
2340  ReplaceNode(N, Ret);
2341  return true;
2342 }
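// For instance, an i16 argument widened to 32 bits arrives as StoreParamU32;
// the special case above first emits CVT_u32_u16 (cvt.u32.u16) on the value
// and then stores the result with StoreParamI32.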
2343 
2344 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2345  unsigned Opc = 0;
2346 
2347  switch (N->getOpcode()) {
2348  default: return false;
2349  case NVPTXISD::Tex1DFloatS32:
2350  Opc = NVPTX::TEX_1D_F32_S32;
2351  break;
2352  case NVPTXISD::Tex1DFloatFloat:
2353  Opc = NVPTX::TEX_1D_F32_F32;
2354  break;
2355  case NVPTXISD::Tex1DFloatFloatLevel:
2356  Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
2357  break;
2358  case NVPTXISD::Tex1DFloatFloatGrad:
2359  Opc = NVPTX::TEX_1D_F32_F32_GRAD;
2360  break;
2361  case NVPTXISD::Tex1DS32S32:
2362  Opc = NVPTX::TEX_1D_S32_S32;
2363  break;
2364  case NVPTXISD::Tex1DS32Float:
2365  Opc = NVPTX::TEX_1D_S32_F32;
2366  break;
2367  case NVPTXISD::Tex1DS32FloatLevel:
2368  Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
2369  break;
2370  case NVPTXISD::Tex1DS32FloatGrad:
2371  Opc = NVPTX::TEX_1D_S32_F32_GRAD;
2372  break;
2373  case NVPTXISD::Tex1DU32S32:
2374  Opc = NVPTX::TEX_1D_U32_S32;
2375  break;
2376  case NVPTXISD::Tex1DU32Float:
2377  Opc = NVPTX::TEX_1D_U32_F32;
2378  break;
2379  case NVPTXISD::Tex1DU32FloatLevel:
2380  Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
2381  break;
2382  case NVPTXISD::Tex1DU32FloatGrad:
2383  Opc = NVPTX::TEX_1D_U32_F32_GRAD;
2384  break;
2385  case NVPTXISD::Tex1DArrayFloatS32:
2386  Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
2387  break;
2388  case NVPTXISD::Tex1DArrayFloatFloat:
2389  Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
2390  break;
2391  case NVPTXISD::Tex1DArrayFloatFloatLevel:
2392  Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
2393  break;
2394  case NVPTXISD::Tex1DArrayFloatFloatGrad:
2395  Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
2396  break;
2397  case NVPTXISD::Tex1DArrayS32S32:
2398  Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
2399  break;
2400  case NVPTXISD::Tex1DArrayS32Float:
2401  Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
2402  break;
2403  case NVPTXISD::Tex1DArrayS32FloatLevel:
2404  Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
2405  break;
2406  case NVPTXISD::Tex1DArrayS32FloatGrad:
2407  Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
2408  break;
2409  case NVPTXISD::Tex1DArrayU32S32:
2410  Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
2411  break;
2412  case NVPTXISD::Tex1DArrayU32Float:
2413  Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
2414  break;
2415  case NVPTXISD::Tex1DArrayU32FloatLevel:
2416  Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
2417  break;
2418  case NVPTXISD::Tex1DArrayU32FloatGrad:
2419  Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
2420  break;
2421  case NVPTXISD::Tex2DFloatS32:
2422  Opc = NVPTX::TEX_2D_F32_S32;
2423  break;
2424  case NVPTXISD::Tex2DFloatFloat:
2425  Opc = NVPTX::TEX_2D_F32_F32;
2426  break;
2427  case NVPTXISD::Tex2DFloatFloatLevel:
2428  Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
2429  break;
2430  case NVPTXISD::Tex2DFloatFloatGrad:
2431  Opc = NVPTX::TEX_2D_F32_F32_GRAD;
2432  break;
2433  case NVPTXISD::Tex2DS32S32:
2434  Opc = NVPTX::TEX_2D_S32_S32;
2435  break;
2436  case NVPTXISD::Tex2DS32Float:
2437  Opc = NVPTX::TEX_2D_S32_F32;
2438  break;
2439  case NVPTXISD::Tex2DS32FloatLevel:
2440  Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
2441  break;
2442  case NVPTXISD::Tex2DS32FloatGrad:
2443  Opc = NVPTX::TEX_2D_S32_F32_GRAD;
2444  break;
2445  case NVPTXISD::Tex2DU32S32:
2446  Opc = NVPTX::TEX_2D_U32_S32;
2447  break;
2448  case NVPTXISD::Tex2DU32Float:
2449  Opc = NVPTX::TEX_2D_U32_F32;
2450  break;
2451  case NVPTXISD::Tex2DU32FloatLevel:
2452  Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
2453  break;
2454  case NVPTXISD::Tex2DU32FloatGrad:
2455  Opc = NVPTX::TEX_2D_U32_F32_GRAD;
2456  break;
2457  case NVPTXISD::Tex2DArrayFloatS32:
2458  Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
2459  break;
2460  case NVPTXISD::Tex2DArrayFloatFloat:
2461  Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
2462  break;
2463  case NVPTXISD::Tex2DArrayFloatFloatLevel:
2464  Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
2465  break;
2466  case NVPTXISD::Tex2DArrayFloatFloatGrad:
2467  Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
2468  break;
2469  case NVPTXISD::Tex2DArrayS32S32:
2470  Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
2471  break;
2472  case NVPTXISD::Tex2DArrayS32Float:
2473  Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
2474  break;
2475  case NVPTXISD::Tex2DArrayS32FloatLevel:
2476  Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
2477  break;
2478  case NVPTXISD::Tex2DArrayS32FloatGrad:
2479  Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
2480  break;
2481  case NVPTXISD::Tex2DArrayU32S32:
2482  Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
2483  break;
2484  case NVPTXISD::Tex2DArrayU32Float:
2485  Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
2486  break;
2487  case NVPTXISD::Tex2DArrayU32FloatLevel:
2488  Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
2489  break;
2490  case NVPTXISD::Tex2DArrayU32FloatGrad:
2491  Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
2492  break;
2493  case NVPTXISD::Tex3DFloatS32:
2494  Opc = NVPTX::TEX_3D_F32_S32;
2495  break;
2496  case NVPTXISD::Tex3DFloatFloat:
2497  Opc = NVPTX::TEX_3D_F32_F32;
2498  break;
2499  case NVPTXISD::Tex3DFloatFloatLevel:
2500  Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
2501  break;
2502  case NVPTXISD::Tex3DFloatFloatGrad:
2503  Opc = NVPTX::TEX_3D_F32_F32_GRAD;
2504  break;
2505  case NVPTXISD::Tex3DS32S32:
2506  Opc = NVPTX::TEX_3D_S32_S32;
2507  break;
2508  case NVPTXISD::Tex3DS32Float:
2509  Opc = NVPTX::TEX_3D_S32_F32;
2510  break;
2511  case NVPTXISD::Tex3DS32FloatLevel:
2512  Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
2513  break;
2514  case NVPTXISD::Tex3DS32FloatGrad:
2515  Opc = NVPTX::TEX_3D_S32_F32_GRAD;
2516  break;
2517  case NVPTXISD::Tex3DU32S32:
2518  Opc = NVPTX::TEX_3D_U32_S32;
2519  break;
2520  case NVPTXISD::Tex3DU32Float:
2521  Opc = NVPTX::TEX_3D_U32_F32;
2522  break;
2523  case NVPTXISD::Tex3DU32FloatLevel:
2524  Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
2525  break;
2526  case NVPTXISD::Tex3DU32FloatGrad:
2527  Opc = NVPTX::TEX_3D_U32_F32_GRAD;
2528  break;
2529  case NVPTXISD::TexCubeFloatFloat:
2530  Opc = NVPTX::TEX_CUBE_F32_F32;
2531  break;
2532  case NVPTXISD::TexCubeFloatFloatLevel:
2533  Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
2534  break;
2535  case NVPTXISD::TexCubeS32Float:
2536  Opc = NVPTX::TEX_CUBE_S32_F32;
2537  break;
2538  case NVPTXISD::TexCubeS32FloatLevel:
2539  Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
2540  break;
2541  case NVPTXISD::TexCubeU32Float:
2542  Opc = NVPTX::TEX_CUBE_U32_F32;
2543  break;
2544  case NVPTXISD::TexCubeU32FloatLevel:
2545  Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
2546  break;
2547  case NVPTXISD::TexCubeArrayFloatFloat:
2548  Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
2549  break;
2550  case NVPTXISD::TexCubeArrayFloatFloatLevel:
2551  Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
2552  break;
2553  case NVPTXISD::TexCubeArrayS32Float:
2554  Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
2555  break;
2556  case NVPTXISD::TexCubeArrayS32FloatLevel:
2557  Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
2558  break;
2559  case NVPTXISD::TexCubeArrayU32Float:
2560  Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
2561  break;
2562  case NVPTXISD::TexCubeArrayU32FloatLevel:
2563  Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
2564  break;
2565  case NVPTXISD::Tld4R2DFloatFloat:
2566  Opc = NVPTX::TLD4_R_2D_F32_F32;
2567  break;
2568  case NVPTXISD::Tld4G2DFloatFloat:
2569  Opc = NVPTX::TLD4_G_2D_F32_F32;
2570  break;
2571  case NVPTXISD::Tld4B2DFloatFloat:
2572  Opc = NVPTX::TLD4_B_2D_F32_F32;
2573  break;
2574  case NVPTXISD::Tld4A2DFloatFloat:
2575  Opc = NVPTX::TLD4_A_2D_F32_F32;
2576  break;
2577  case NVPTXISD::Tld4R2DS32Float:
2578  Opc = NVPTX::TLD4_R_2D_S32_F32;
2579  break;
2580  case NVPTXISD::Tld4G2DS32Float:
2581  Opc = NVPTX::TLD4_G_2D_S32_F32;
2582  break;
2583  case NVPTXISD::Tld4B2DS32Float:
2584  Opc = NVPTX::TLD4_B_2D_S32_F32;
2585  break;
2586  case NVPTXISD::Tld4A2DS32Float:
2587  Opc = NVPTX::TLD4_A_2D_S32_F32;
2588  break;
2589  case NVPTXISD::Tld4R2DU32Float:
2590  Opc = NVPTX::TLD4_R_2D_U32_F32;
2591  break;
2592  case NVPTXISD::Tld4G2DU32Float:
2593  Opc = NVPTX::TLD4_G_2D_U32_F32;
2594  break;
2595  case NVPTXISD::Tld4B2DU32Float:
2596  Opc = NVPTX::TLD4_B_2D_U32_F32;
2597  break;
2598  case NVPTXISD::Tld4A2DU32Float:
2599  Opc = NVPTX::TLD4_A_2D_U32_F32;
2600  break;
2601  case NVPTXISD::TexUnified1DFloatS32:
2602  Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
2603  break;
2604  case NVPTXISD::TexUnified1DFloatFloat:
2605  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
2606  break;
2607  case NVPTXISD::TexUnified1DFloatFloatLevel:
2608  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
2609  break;
2610  case NVPTXISD::TexUnified1DFloatFloatGrad:
2611  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
2612  break;
2613  case NVPTXISD::TexUnified1DS32S32:
2614  Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
2615  break;
2616  case NVPTXISD::TexUnified1DS32Float:
2617  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
2618  break;
2619  case NVPTXISD::TexUnified1DS32FloatLevel:
2620  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
2621  break;
2622  case NVPTXISD::TexUnified1DS32FloatGrad:
2623  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
2624  break;
2625  case NVPTXISD::TexUnified1DU32S32:
2626  Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
2627  break;
2628  case NVPTXISD::TexUnified1DU32Float:
2629  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
2630  break;
2631  case NVPTXISD::TexUnified1DU32FloatLevel:
2632  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
2633  break;
2634  case NVPTXISD::TexUnified1DU32FloatGrad:
2635  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
2636  break;
2637  case NVPTXISD::TexUnified1DArrayFloatS32:
2638  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
2639  break;
2640  case NVPTXISD::TexUnified1DArrayFloatFloat:
2641  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
2642  break;
2643  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
2644  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
2645  break;
2646  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
2647  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
2648  break;
2649  case NVPTXISD::TexUnified1DArrayS32S32:
2650  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
2651  break;
2652  case NVPTXISD::TexUnified1DArrayS32Float:
2653  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
2654  break;
2655  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
2656  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
2657  break;
2658  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
2659  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
2660  break;
2661  case NVPTXISD::TexUnified1DArrayU32S32:
2662  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
2663  break;
2664  case NVPTXISD::TexUnified1DArrayU32Float:
2665  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
2666  break;
2667  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
2668  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
2669  break;
2670  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
2671  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
2672  break;
2673  case NVPTXISD::TexUnified2DFloatS32:
2674  Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
2675  break;
2676  case NVPTXISD::TexUnified2DFloatFloat:
2677  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
2678  break;
2679  case NVPTXISD::TexUnified2DFloatFloatLevel:
2680  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
2681  break;
2682  case NVPTXISD::TexUnified2DFloatFloatGrad:
2683  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
2684  break;
2685  case NVPTXISD::TexUnified2DS32S32:
2686  Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
2687  break;
2688  case NVPTXISD::TexUnified2DS32Float:
2689  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
2690  break;
2691  case NVPTXISD::TexUnified2DS32FloatLevel:
2692  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
2693  break;
2694  case NVPTXISD::TexUnified2DS32FloatGrad:
2695  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
2696  break;
2697  case NVPTXISD::TexUnified2DU32S32:
2698  Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
2699  break;
2700  case NVPTXISD::TexUnified2DU32Float:
2701  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
2702  break;
2703  case NVPTXISD::TexUnified2DU32FloatLevel:
2704  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
2705  break;
2706  case NVPTXISD::TexUnified2DU32FloatGrad:
2707  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
2708  break;
2709  case NVPTXISD::TexUnified2DArrayFloatS32:
2710  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
2711  break;
2712  case NVPTXISD::TexUnified2DArrayFloatFloat:
2713  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
2714  break;
2715  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
2716  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
2717  break;
2718  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
2719  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
2720  break;
2721  case NVPTXISD::TexUnified2DArrayS32S32:
2722  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
2723  break;
2724  case NVPTXISD::TexUnified2DArrayS32Float:
2725  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
2726  break;
2727  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
2728  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
2729  break;
2730  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
2731  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
2732  break;
2733  case NVPTXISD::TexUnified2DArrayU32S32:
2734  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
2735  break;
2736  case NVPTXISD::TexUnified2DArrayU32Float:
2737  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
2738  break;
2739  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
2740  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
2741  break;
2742  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
2743  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
2744  break;
2745  case NVPTXISD::TexUnified3DFloatS32:
2746  Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
2747  break;
2748  case NVPTXISD::TexUnified3DFloatFloat:
2749  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
2750  break;
2751  case NVPTXISD::TexUnified3DFloatFloatLevel:
2752  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
2753  break;
2754  case NVPTXISD::TexUnified3DFloatFloatGrad:
2755  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
2756  break;
2757  case NVPTXISD::TexUnified3DS32S32:
2758  Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
2759  break;
2760  case NVPTXISD::TexUnified3DS32Float:
2761  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
2762  break;
2763  case NVPTXISD::TexUnified3DS32FloatLevel:
2764  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
2765  break;
2766  case NVPTXISD::TexUnified3DS32FloatGrad:
2767  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
2768  break;
2769  case NVPTXISD::TexUnified3DU32S32:
2770  Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
2771  break;
2772  case NVPTXISD::TexUnified3DU32Float:
2773  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
2774  break;
2775  case NVPTXISD::TexUnified3DU32FloatLevel:
2776  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
2777  break;
2778  case NVPTXISD::TexUnified3DU32FloatGrad:
2779  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
2780  break;
2781  case NVPTXISD::TexUnifiedCubeFloatFloat:
2782  Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
2783  break;
2784  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
2785  Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
2786  break;
2787  case NVPTXISD::TexUnifiedCubeS32Float:
2788  Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
2789  break;
2790  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
2791  Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
2792  break;
2793  case NVPTXISD::TexUnifiedCubeU32Float:
2794  Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
2795  break;
2796  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
2797  Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
2798  break;
2799  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
2800  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
2801  break;
2802  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
2803  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
2804  break;
2805  case NVPTXISD::TexUnifiedCubeArrayS32Float:
2806  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
2807  break;
2808  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
2809  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
2810  break;
2811  case NVPTXISD::TexUnifiedCubeArrayU32Float:
2812  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
2813  break;
2814  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
2815  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
2816  break;
2817  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
2818  Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
2819  break;
2820  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
2821  Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
2822  break;
2823  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
2824  Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
2825  break;
2826  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
2827  Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
2828  break;
2829  case NVPTXISD::Tld4UnifiedR2DS32Float:
2830  Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
2831  break;
2832  case NVPTXISD::Tld4UnifiedG2DS32Float:
2833  Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
2834  break;
2835  case NVPTXISD::Tld4UnifiedB2DS32Float:
2836  Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
2837  break;
2838  case NVPTXISD::Tld4UnifiedA2DS32Float:
2839  Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
2840  break;
2841  case NVPTXISD::Tld4UnifiedR2DU32Float:
2842  Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
2843  break;
2844  case NVPTXISD::Tld4UnifiedG2DU32Float:
2845  Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
2846  break;
2847  case NVPTXISD::Tld4UnifiedB2DU32Float:
2848  Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
2849  break;
2850  case NVPTXISD::Tld4UnifiedA2DU32Float:
2851  Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
2852  break;
2853  }
2854 
2855  // Copy over operands
2856  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
2857  Ops.push_back(N->getOperand(0)); // Move chain to the back.
2858 
2859  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2860  return true;
2861 }
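// Note the operand rotation above: the intrinsic node carries the chain as
// operand 0 followed by the texture handle and coordinates, while the
// selected tex/tld4 machine node takes the chain last, so the chain is moved
// to the back of the operand list.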
2862 
2863 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2864  unsigned Opc = 0;
2865  switch (N->getOpcode()) {
2866  default: return false;
2867  case NVPTXISD::Suld1DI8Clamp:
2868  Opc = NVPTX::SULD_1D_I8_CLAMP;
2869  break;
2870  case NVPTXISD::Suld1DI16Clamp:
2871  Opc = NVPTX::SULD_1D_I16_CLAMP;
2872  break;
2873  case NVPTXISD::Suld1DI32Clamp:
2874  Opc = NVPTX::SULD_1D_I32_CLAMP;
2875  break;
2876  case NVPTXISD::Suld1DI64Clamp:
2877  Opc = NVPTX::SULD_1D_I64_CLAMP;
2878  break;
2879  case NVPTXISD::Suld1DV2I8Clamp:
2880  Opc = NVPTX::SULD_1D_V2I8_CLAMP;
2881  break;
2882  case NVPTXISD::Suld1DV2I16Clamp:
2883  Opc = NVPTX::SULD_1D_V2I16_CLAMP;
2884  break;
2885  case NVPTXISD::Suld1DV2I32Clamp:
2886  Opc = NVPTX::SULD_1D_V2I32_CLAMP;
2887  break;
2888  case NVPTXISD::Suld1DV2I64Clamp:
2889  Opc = NVPTX::SULD_1D_V2I64_CLAMP;
2890  break;
2891  case NVPTXISD::Suld1DV4I8Clamp:
2892  Opc = NVPTX::SULD_1D_V4I8_CLAMP;
2893  break;
2894  case NVPTXISD::Suld1DV4I16Clamp:
2895  Opc = NVPTX::SULD_1D_V4I16_CLAMP;
2896  break;
2897  case NVPTXISD::Suld1DV4I32Clamp:
2898  Opc = NVPTX::SULD_1D_V4I32_CLAMP;
2899  break;
2900  case NVPTXISD::Suld1DArrayI8Clamp:
2901  Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
2902  break;
2903  case NVPTXISD::Suld1DArrayI16Clamp:
2904  Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
2905  break;
2906  case NVPTXISD::Suld1DArrayI32Clamp:
2907  Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
2908  break;
2909  case NVPTXISD::Suld1DArrayI64Clamp:
2910  Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
2911  break;
2912  case NVPTXISD::Suld1DArrayV2I8Clamp:
2913  Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
2914  break;
2915  case NVPTXISD::Suld1DArrayV2I16Clamp:
2916  Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
2917  break;
2918  case NVPTXISD::Suld1DArrayV2I32Clamp:
2919  Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
2920  break;
2921  case NVPTXISD::Suld1DArrayV2I64Clamp:
2922  Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
2923  break;
2924  case NVPTXISD::Suld1DArrayV4I8Clamp:
2925  Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
2926  break;
2927  case NVPTXISD::Suld1DArrayV4I16Clamp:
2928  Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
2929  break;
2930  case NVPTXISD::Suld1DArrayV4I32Clamp:
2931  Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
2932  break;
2933  case NVPTXISD::Suld2DI8Clamp:
2934  Opc = NVPTX::SULD_2D_I8_CLAMP;
2935  break;
2936  case NVPTXISD::Suld2DI16Clamp:
2937  Opc = NVPTX::SULD_2D_I16_CLAMP;
2938  break;
2939  case NVPTXISD::Suld2DI32Clamp:
2940  Opc = NVPTX::SULD_2D_I32_CLAMP;
2941  break;
2942  case NVPTXISD::Suld2DI64Clamp:
2943  Opc = NVPTX::SULD_2D_I64_CLAMP;
2944  break;
2945  case NVPTXISD::Suld2DV2I8Clamp:
2946  Opc = NVPTX::SULD_2D_V2I8_CLAMP;
2947  break;
2948  case NVPTXISD::Suld2DV2I16Clamp:
2949  Opc = NVPTX::SULD_2D_V2I16_CLAMP;
2950  break;
2951  case NVPTXISD::Suld2DV2I32Clamp:
2952  Opc = NVPTX::SULD_2D_V2I32_CLAMP;
2953  break;
2954  case NVPTXISD::Suld2DV2I64Clamp:
2955  Opc = NVPTX::SULD_2D_V2I64_CLAMP;
2956  break;
2957  case NVPTXISD::Suld2DV4I8Clamp:
2958  Opc = NVPTX::SULD_2D_V4I8_CLAMP;
2959  break;
2960  case NVPTXISD::Suld2DV4I16Clamp:
2961  Opc = NVPTX::SULD_2D_V4I16_CLAMP;
2962  break;
2963  case NVPTXISD::Suld2DV4I32Clamp:
2964  Opc = NVPTX::SULD_2D_V4I32_CLAMP;
2965  break;
2966  case NVPTXISD::Suld2DArrayI8Clamp:
2967  Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
2968  break;
2969  case NVPTXISD::Suld2DArrayI16Clamp:
2970  Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
2971  break;
2972  case NVPTXISD::Suld2DArrayI32Clamp:
2973  Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
2974  break;
2975  case NVPTXISD::Suld2DArrayI64Clamp:
2976  Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
2977  break;
2978  case NVPTXISD::Suld2DArrayV2I8Clamp:
2979  Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
2980  break;
2981  case NVPTXISD::Suld2DArrayV2I16Clamp:
2982  Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
2983  break;
2984  case NVPTXISD::Suld2DArrayV2I32Clamp:
2985  Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
2986  break;
2987  case NVPTXISD::Suld2DArrayV2I64Clamp:
2988  Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
2989  break;
2990  case NVPTXISD::Suld2DArrayV4I8Clamp:
2991  Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
2992  break;
2993  case NVPTXISD::Suld2DArrayV4I16Clamp:
2994  Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
2995  break;
2996  case NVPTXISD::Suld2DArrayV4I32Clamp:
2997  Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
2998  break;
2999  case NVPTXISD::Suld3DI8Clamp:
3000  Opc = NVPTX::SULD_3D_I8_CLAMP;
3001  break;
3002  case NVPTXISD::Suld3DI16Clamp:
3003  Opc = NVPTX::SULD_3D_I16_CLAMP;
3004  break;
3005  case NVPTXISD::Suld3DI32Clamp:
3006  Opc = NVPTX::SULD_3D_I32_CLAMP;
3007  break;
3008  case NVPTXISD::Suld3DI64Clamp:
3009  Opc = NVPTX::SULD_3D_I64_CLAMP;
3010  break;
3011  case NVPTXISD::Suld3DV2I8Clamp:
3012  Opc = NVPTX::SULD_3D_V2I8_CLAMP;
3013  break;
3014  case NVPTXISD::Suld3DV2I16Clamp:
3015  Opc = NVPTX::SULD_3D_V2I16_CLAMP;
3016  break;
3017  case NVPTXISD::Suld3DV2I32Clamp:
3018  Opc = NVPTX::SULD_3D_V2I32_CLAMP;
3019  break;
3020  case NVPTXISD::Suld3DV2I64Clamp:
3021  Opc = NVPTX::SULD_3D_V2I64_CLAMP;
3022  break;
3023  case NVPTXISD::Suld3DV4I8Clamp:
3024  Opc = NVPTX::SULD_3D_V4I8_CLAMP;
3025  break;
3026  case NVPTXISD::Suld3DV4I16Clamp:
3027  Opc = NVPTX::SULD_3D_V4I16_CLAMP;
3028  break;
3029  case NVPTXISD::Suld3DV4I32Clamp:
3030  Opc = NVPTX::SULD_3D_V4I32_CLAMP;
3031  break;
3032  case NVPTXISD::Suld1DI8Trap:
3033  Opc = NVPTX::SULD_1D_I8_TRAP;
3034  break;
3035  case NVPTXISD::Suld1DI16Trap:
3036  Opc = NVPTX::SULD_1D_I16_TRAP;
3037  break;
3038  case NVPTXISD::Suld1DI32Trap:
3039  Opc = NVPTX::SULD_1D_I32_TRAP;
3040  break;
3041  case NVPTXISD::Suld1DI64Trap:
3042  Opc = NVPTX::SULD_1D_I64_TRAP;
3043  break;
3044  case NVPTXISD::Suld1DV2I8Trap:
3045  Opc = NVPTX::SULD_1D_V2I8_TRAP;
3046  break;
3047  case NVPTXISD::Suld1DV2I16Trap:
3048  Opc = NVPTX::SULD_1D_V2I16_TRAP;
3049  break;
3050  case NVPTXISD::Suld1DV2I32Trap:
3051  Opc = NVPTX::SULD_1D_V2I32_TRAP;
3052  break;
3053  case NVPTXISD::Suld1DV2I64Trap:
3054  Opc = NVPTX::SULD_1D_V2I64_TRAP;
3055  break;
3056  case NVPTXISD::Suld1DV4I8Trap:
3057  Opc = NVPTX::SULD_1D_V4I8_TRAP;
3058  break;
3059  case NVPTXISD::Suld1DV4I16Trap:
3060  Opc = NVPTX::SULD_1D_V4I16_TRAP;
3061  break;
3062  case NVPTXISD::Suld1DV4I32Trap:
3063  Opc = NVPTX::SULD_1D_V4I32_TRAP;
3064  break;
3065  case NVPTXISD::Suld1DArrayI8Trap:
3066  Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
3067  break;
3068  case NVPTXISD::Suld1DArrayI16Trap:
3069  Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
3070  break;
3071  case NVPTXISD::Suld1DArrayI32Trap:
3072  Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
3073  break;
3074  case NVPTXISD::Suld1DArrayI64Trap:
3075  Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
3076  break;
3077  case NVPTXISD::Suld1DArrayV2I8Trap:
3078  Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
3079  break;
3080  case NVPTXISD::Suld1DArrayV2I16Trap:
3081  Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
3082  break;
3083  case NVPTXISD::Suld1DArrayV2I32Trap:
3084  Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
3085  break;
3086  case NVPTXISD::Suld1DArrayV2I64Trap:
3087  Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
3088  break;
3089  case NVPTXISD::Suld1DArrayV4I8Trap:
3090  Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
3091  break;
3092  case NVPTXISD::Suld1DArrayV4I16Trap:
3093  Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
3094  break;
3095  case NVPTXISD::Suld1DArrayV4I32Trap:
3096  Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
3097  break;
3098  case NVPTXISD::Suld2DI8Trap:
3099  Opc = NVPTX::SULD_2D_I8_TRAP;
3100  break;
3101  case NVPTXISD::Suld2DI16Trap:
3102  Opc = NVPTX::SULD_2D_I16_TRAP;
3103  break;
3104  case NVPTXISD::Suld2DI32Trap:
3105  Opc = NVPTX::SULD_2D_I32_TRAP;
3106  break;
3107  case NVPTXISD::Suld2DI64Trap:
3108  Opc = NVPTX::SULD_2D_I64_TRAP;
3109  break;
3110  case NVPTXISD::Suld2DV2I8Trap:
3111  Opc = NVPTX::SULD_2D_V2I8_TRAP;
3112  break;
3113  case NVPTXISD::Suld2DV2I16Trap:
3114  Opc = NVPTX::SULD_2D_V2I16_TRAP;
3115  break;
3116  case NVPTXISD::Suld2DV2I32Trap:
3117  Opc = NVPTX::SULD_2D_V2I32_TRAP;
3118  break;
3119  case NVPTXISD::Suld2DV2I64Trap:
3120  Opc = NVPTX::SULD_2D_V2I64_TRAP;
3121  break;
3122  case NVPTXISD::Suld2DV4I8Trap:
3123  Opc = NVPTX::SULD_2D_V4I8_TRAP;
3124  break;
3125  case NVPTXISD::Suld2DV4I16Trap:
3126  Opc = NVPTX::SULD_2D_V4I16_TRAP;
3127  break;
3128  case NVPTXISD::Suld2DV4I32Trap:
3129  Opc = NVPTX::SULD_2D_V4I32_TRAP;
3130  break;
3131  case NVPTXISD::Suld2DArrayI8Trap:
3132  Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
3133  break;
3134  case NVPTXISD::Suld2DArrayI16Trap:
3135  Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
3136  break;
3137  case NVPTXISD::Suld2DArrayI32Trap:
3138  Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
3139  break;
3140  case NVPTXISD::Suld2DArrayI64Trap:
3141  Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
3142  break;
3143  case NVPTXISD::Suld2DArrayV2I8Trap:
3144  Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
3145  break;
3146  case NVPTXISD::Suld2DArrayV2I16Trap:
3147  Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
3148  break;
3149  case NVPTXISD::Suld2DArrayV2I32Trap:
3150  Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
3151  break;
3152  case NVPTXISD::Suld2DArrayV2I64Trap:
3153  Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
3154  break;
3155  case NVPTXISD::Suld2DArrayV4I8Trap:
3156  Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
3157  break;
3158  case NVPTXISD::Suld2DArrayV4I16Trap:
3159  Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
3160  break;
3161  case NVPTXISD::Suld2DArrayV4I32Trap:
3162  Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
3163  break;
3164  case NVPTXISD::Suld3DI8Trap:
3165  Opc = NVPTX::SULD_3D_I8_TRAP;
3166  break;
3167  case NVPTXISD::Suld3DI16Trap:
3168  Opc = NVPTX::SULD_3D_I16_TRAP;
3169  break;
3170  case NVPTXISD::Suld3DI32Trap:
3171  Opc = NVPTX::SULD_3D_I32_TRAP;
3172  break;
3173  case NVPTXISD::Suld3DI64Trap:
3174  Opc = NVPTX::SULD_3D_I64_TRAP;
3175  break;
3176  case NVPTXISD::Suld3DV2I8Trap:
3177  Opc = NVPTX::SULD_3D_V2I8_TRAP;
3178  break;
3179  case NVPTXISD::Suld3DV2I16Trap:
3180  Opc = NVPTX::SULD_3D_V2I16_TRAP;
3181  break;
3182  case NVPTXISD::Suld3DV2I32Trap:
3183  Opc = NVPTX::SULD_3D_V2I32_TRAP;
3184  break;
3185  case NVPTXISD::Suld3DV2I64Trap:
3186  Opc = NVPTX::SULD_3D_V2I64_TRAP;
3187  break;
3188  case NVPTXISD::Suld3DV4I8Trap:
3189  Opc = NVPTX::SULD_3D_V4I8_TRAP;
3190  break;
3191  case NVPTXISD::Suld3DV4I16Trap:
3192  Opc = NVPTX::SULD_3D_V4I16_TRAP;
3193  break;
3194  case NVPTXISD::Suld3DV4I32Trap:
3195  Opc = NVPTX::SULD_3D_V4I32_TRAP;
3196  break;
3197  case NVPTXISD::Suld1DI8Zero:
3198  Opc = NVPTX::SULD_1D_I8_ZERO;
3199  break;
3200  case NVPTXISD::Suld1DI16Zero:
3201  Opc = NVPTX::SULD_1D_I16_ZERO;
3202  break;
3203  case NVPTXISD::Suld1DI32Zero:
3204  Opc = NVPTX::SULD_1D_I32_ZERO;
3205  break;
3206  case NVPTXISD::Suld1DI64Zero:
3207  Opc = NVPTX::SULD_1D_I64_ZERO;
3208  break;
3209  case NVPTXISD::Suld1DV2I8Zero:
3210  Opc = NVPTX::SULD_1D_V2I8_ZERO;
3211  break;
3212  case NVPTXISD::Suld1DV2I16Zero:
3213  Opc = NVPTX::SULD_1D_V2I16_ZERO;
3214  break;
3215  case NVPTXISD::Suld1DV2I32Zero:
3216  Opc = NVPTX::SULD_1D_V2I32_ZERO;
3217  break;
3218  case NVPTXISD::Suld1DV2I64Zero:
3219  Opc = NVPTX::SULD_1D_V2I64_ZERO;
3220  break;
3221  case NVPTXISD::Suld1DV4I8Zero:
3222  Opc = NVPTX::SULD_1D_V4I8_ZERO;
3223  break;
3224  case NVPTXISD::Suld1DV4I16Zero:
3225  Opc = NVPTX::SULD_1D_V4I16_ZERO;
3226  break;
3227  case NVPTXISD::Suld1DV4I32Zero:
3228  Opc = NVPTX::SULD_1D_V4I32_ZERO;
3229  break;
3230  case NVPTXISD::Suld1DArrayI8Zero:
3231  Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
3232  break;
3233  case NVPTXISD::Suld1DArrayI16Zero:
3234  Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
3235  break;
3236  case NVPTXISD::Suld1DArrayI32Zero:
3237  Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
3238  break;
3239  case NVPTXISD::Suld1DArrayI64Zero:
3240  Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
3241  break;
3242  case NVPTXISD::Suld1DArrayV2I8Zero:
3243  Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
3244  break;
3245  case NVPTXISD::Suld1DArrayV2I16Zero:
3246  Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
3247  break;
3248  case NVPTXISD::Suld1DArrayV2I32Zero:
3249  Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
3250  break;
3251  case NVPTXISD::Suld1DArrayV2I64Zero:
3252  Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
3253  break;
3254  case NVPTXISD::Suld1DArrayV4I8Zero:
3255  Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
3256  break;
3257  case NVPTXISD::Suld1DArrayV4I16Zero:
3258  Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
3259  break;
3260  case NVPTXISD::Suld1DArrayV4I32Zero:
3261  Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
3262  break;
3263  case NVPTXISD::Suld2DI8Zero:
3264  Opc = NVPTX::SULD_2D_I8_ZERO;
3265  break;
3266  case NVPTXISD::Suld2DI16Zero:
3267  Opc = NVPTX::SULD_2D_I16_ZERO;
3268  break;
3269  case NVPTXISD::Suld2DI32Zero:
3270  Opc = NVPTX::SULD_2D_I32_ZERO;
3271  break;
3272  case NVPTXISD::Suld2DI64Zero:
3273  Opc = NVPTX::SULD_2D_I64_ZERO;
3274  break;
3275  case NVPTXISD::Suld2DV2I8Zero:
3276  Opc = NVPTX::SULD_2D_V2I8_ZERO;
3277  break;
3278  case NVPTXISD::Suld2DV2I16Zero:
3279  Opc = NVPTX::SULD_2D_V2I16_ZERO;
3280  break;
3281  case NVPTXISD::Suld2DV2I32Zero:
3282  Opc = NVPTX::SULD_2D_V2I32_ZERO;
3283  break;
3284  case NVPTXISD::Suld2DV2I64Zero:
3285  Opc = NVPTX::SULD_2D_V2I64_ZERO;
3286  break;
3287  case NVPTXISD::Suld2DV4I8Zero:
3288  Opc = NVPTX::SULD_2D_V4I8_ZERO;
3289  break;
3290  case NVPTXISD::Suld2DV4I16Zero:
3291  Opc = NVPTX::SULD_2D_V4I16_ZERO;
3292  break;
3293  case NVPTXISD::Suld2DV4I32Zero:
3294  Opc = NVPTX::SULD_2D_V4I32_ZERO;
3295  break;
3296  case NVPTXISD::Suld2DArrayI8Zero:
3297  Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
3298  break;
3299  case NVPTXISD::Suld2DArrayI16Zero:
3300  Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
3301  break;
3302  case NVPTXISD::Suld2DArrayI32Zero:
3303  Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
3304  break;
3305  case NVPTXISD::Suld2DArrayI64Zero:
3306  Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
3307  break;
3308  case NVPTXISD::Suld2DArrayV2I8Zero:
3309  Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
3310  break;
3311  case NVPTXISD::Suld2DArrayV2I16Zero:
3312  Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
3313  break;
3314  case NVPTXISD::Suld2DArrayV2I32Zero:
3315  Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
3316  break;
3317  case NVPTXISD::Suld2DArrayV2I64Zero:
3318  Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
3319  break;
3320  case NVPTXISD::Suld2DArrayV4I8Zero:
3321  Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
3322  break;
3323  case NVPTXISD::Suld2DArrayV4I16Zero:
3324  Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
3325  break;
3326  case NVPTXISD::Suld2DArrayV4I32Zero:
3327  Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
3328  break;
3329  case NVPTXISD::Suld3DI8Zero:
3330  Opc = NVPTX::SULD_3D_I8_ZERO;
3331  break;
3332  case NVPTXISD::Suld3DI16Zero:
3333  Opc = NVPTX::SULD_3D_I16_ZERO;
3334  break;
3335  case NVPTXISD::Suld3DI32Zero:
3336  Opc = NVPTX::SULD_3D_I32_ZERO;
3337  break;
3338  case NVPTXISD::Suld3DI64Zero:
3339  Opc = NVPTX::SULD_3D_I64_ZERO;
3340  break;
3341  case NVPTXISD::Suld3DV2I8Zero:
3342  Opc = NVPTX::SULD_3D_V2I8_ZERO;
3343  break;
3344  case NVPTXISD::Suld3DV2I16Zero:
3345  Opc = NVPTX::SULD_3D_V2I16_ZERO;
3346  break;
3347  case NVPTXISD::Suld3DV2I32Zero:
3348  Opc = NVPTX::SULD_3D_V2I32_ZERO;
3349  break;
3350  case NVPTXISD::Suld3DV2I64Zero:
3351  Opc = NVPTX::SULD_3D_V2I64_ZERO;
3352  break;
3353  case NVPTXISD::Suld3DV4I8Zero:
3354  Opc = NVPTX::SULD_3D_V4I8_ZERO;
3355  break;
3356  case NVPTXISD::Suld3DV4I16Zero:
3357  Opc = NVPTX::SULD_3D_V4I16_ZERO;
3358  break;
3359  case NVPTXISD::Suld3DV4I32Zero:
3360  Opc = NVPTX::SULD_3D_V4I32_ZERO;
3361  break;
3362  }
3363 
3364  // Copy over operands
3365  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
3366  Ops.push_back(N->getOperand(0)); // Move chain to the back.
3367 
3368  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3369  return true;
3370 }
3371 
3372 
3373 /// tryBFE - Look for instruction sequences that can be made more efficient
3374 /// by using the 'bfe' (bit-field extract) PTX instruction.
3375 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3376  SDLoc DL(N);
3377  SDValue LHS = N->getOperand(0);
3378  SDValue RHS = N->getOperand(1);
3379  SDValue Len;
3380  SDValue Start;
3381  SDValue Val;
3382  bool IsSigned = false;
3383 
3384  if (N->getOpcode() == ISD::AND) {
3385  // Canonicalize the operands
3386  // We want 'and %val, %mask'
3387  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3388  std::swap(LHS, RHS);
3389  }
3390 
3391  ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3392  if (!Mask) {
3393  // We need a constant mask on the RHS of the AND
3394  return false;
3395  }
3396 
3397  // Extract the mask bits
3398  uint64_t MaskVal = Mask->getZExtValue();
3399  if (!isMask_64(MaskVal)) {
3400  // We *could* handle shifted masks here, but doing so would require an
3401  // 'and' operation to fix up the low-order bits so we would trade
3402  // shr+and for bfe+and, which has the same throughput
3403  return false;
3404  }
3405 
3406  // How many bits are in our mask?
3407  uint64_t NumBits = countTrailingOnes(MaskVal);
3408  Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3409 
3410  if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3411  // We have a 'srl/and' pair, extract the effective start bit and length
3412  Val = LHS.getNode()->getOperand(0);
3413  Start = LHS.getNode()->getOperand(1);
3414  ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3415  if (StartConst) {
3416  uint64_t StartVal = StartConst->getZExtValue();
3417  // How many "good" bits do we have left? "good" is defined here as bits
3418  // that exist in the original value, not shifted in.
3419  uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3420  if (NumBits > GoodBits) {
3421  // Do not handle the case where bits have been shifted in. In theory
3422  // we could handle this, but the cost is likely higher than just
3423  // emitting the srl/and pair.
3424  return false;
3425  }
3426  Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3427  } else {
3428  // Do not handle the case where the shift amount (can be zero if no srl
3429  // was found) is not constant. We could handle this case, but it would
3430  // require run-time logic that would be more expensive than just
3431  // emitting the srl/and pair.
3432  return false;
3433  }
3434  } else {
3435  // Do not handle the case where the LHS of the and is not a shift. While
3436  // it would be trivial to handle this case, it would just transform
3437  // 'and' -> 'bfe', but 'and' has higher throughput.
3438  return false;
3439  }
3440  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3441  if (LHS->getOpcode() == ISD::AND) {
3442  ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3443  if (!ShiftCnst) {
3444  // Shift amount must be constant
3445  return false;
3446  }
3447 
3448  uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3449 
3450  SDValue AndLHS = LHS->getOperand(0);
3451  SDValue AndRHS = LHS->getOperand(1);
3452 
3453  // Canonicalize the AND to have the mask on the RHS
3454  if (isa<ConstantSDNode>(AndLHS)) {
3455  std::swap(AndLHS, AndRHS);
3456  }
3457 
3458  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3459  if (!MaskCnst) {
3460  // Mask must be constant
3461  return false;
3462  }
3463 
3464  uint64_t MaskVal = MaskCnst->getZExtValue();
3465  uint64_t NumZeros;
3466  uint64_t NumBits;
3467  if (isMask_64(MaskVal)) {
3468  NumZeros = 0;
3469  // The number of bits in the result bitfield will be the number of
3470  // trailing ones (the AND) minus the number of bits we shift off
3471  NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
3472  } else if (isShiftedMask_64(MaskVal)) {
3473  NumZeros = countTrailingZeros(MaskVal);
3474  unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
3475  // The number of bits in the result bitfield will be the number of
3476  // trailing zeros plus the number of set bits in the mask minus the
3477  // number of bits we shift off
3478  NumBits = NumZeros + NumOnes - ShiftAmt;
3479  } else {
3480  // This is not a mask we can handle
3481  return false;
3482  }
3483 
3484  if (ShiftAmt < NumZeros) {
3485  // Handling this case would require extra logic that would make this
3486  // transformation non-profitable
3487  return false;
3488  }
3489 
3490  Val = AndLHS;
3491  Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3492  Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
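// Worked example: (srl (and %val, 0xFF0), 4) has NumZeros = 4 and
// NumOnes = 8, so NumBits = 4 + 8 - 4 = 8, and the pair becomes
// bfe %val, start=4, len=8.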
3493  } else if (LHS->getOpcode() == ISD::SHL) {
3494  // Here, we have a pattern like:
3495  //
3496  // (sra (shl val, NN), MM)
3497  // or
3498  // (srl (shl val, NN), MM)
3499  //
3500  // If MM >= NN, we can efficiently optimize this with bfe
3501  Val = LHS->getOperand(0);
3502 
3503  SDValue ShlRHS = LHS->getOperand(1);
3504  ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3505  if (!ShlCnst) {
3506  // Shift amount must be constant
3507  return false;
3508  }
3509  uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3510 
3511  SDValue ShrRHS = RHS;
3512  ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3513  if (!ShrCnst) {
3514  // Shift amount must be constant
3515  return false;
3516  }
3517  uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3518 
3519  // To avoid extra codegen and be profitable, we need Outer >= Inner
3520  if (OuterShiftAmt < InnerShiftAmt) {
3521  return false;
3522  }
3523 
3524  // If the outer shift is more than the type size, we have no bitfield to
3525  // extract (since we also check that the inner shift is <= the outer shift,
3526  // this also implies that the inner shift is < the type size).
3527  if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3528  return false;
3529  }
3530 
3531  Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3532  MVT::i32);
3533  Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3534  DL, MVT::i32);
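// Worked example: for i32, (srl (shl %val, 8), 16) extracts bits 8..23,
// i.e. Start = 16 - 8 = 8 and Len = 32 - 16 = 16.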
3535 
3536  if (N->getOpcode() == ISD::SRA) {
3537  // If we have a arithmetic right shift, we need to use the signed bfe
3538  // variant
3539  IsSigned = true;
3540  }
3541  } else {
3542  // No can do...
3543  return false;
3544  }
3545  } else {
3546  // No can do...
3547  return false;
3548  }
3549 
3550 
3551  unsigned Opc;
3552  // For the BFE operations we form here from "and" and "srl", always use the
3553  // unsigned variants.
3554  if (Val.getValueType() == MVT::i32) {
3555  if (IsSigned) {
3556  Opc = NVPTX::BFE_S32rii;
3557  } else {
3558  Opc = NVPTX::BFE_U32rii;
3559  }
3560  } else if (Val.getValueType() == MVT::i64) {
3561  if (IsSigned) {
3562  Opc = NVPTX::BFE_S64rii;
3563  } else {
3564  Opc = NVPTX::BFE_U64rii;
3565  }
3566  } else {
3567  // We cannot handle this type
3568  return false;
3569  }
3570 
3571  SDValue Ops[] = {
3572  Val, Start, Len
3573  };
3574 
3575  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3576  return true;
3577 }
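// For instance, (and (srl %val:i32, 8), 0xFF) is selected above as
// BFE_U32rii %val, 8, 8, i.e. bfe.u32 %r, %val, 8, 8 in PTX.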
3578 
3579 // SelectDirectAddr - Match a direct address for DAG.
3580 // A direct address could be a globaladdress or externalsymbol.
3581 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3582  // Return true if TGA or ES.
3583  if (N.getOpcode() == ISD::TargetGlobalAddress ||
3584  N.getOpcode() == ISD::TargetExternalSymbol) {
3585  Address = N;
3586  return true;
3587  }
3588  if (N.getOpcode() == NVPTXISD::Wrapper) {
3589  Address = N.getOperand(0);
3590  return true;
3591  }
3592  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3593  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3594  if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3595  CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3596  CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3597  return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3598  }
3599  return false;
3600 }
3601 
3602 // symbol+offset
3603 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3604  SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3605  if (Addr.getOpcode() == ISD::ADD) {
3606  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3607  SDValue base = Addr.getOperand(0);
3608  if (SelectDirectAddr(base, Base)) {
3609  Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3610  mvt);
3611  return true;
3612  }
3613  }
3614  }
3615  return false;
3616 }
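// For example, (add (NVPTXISD::Wrapper (TargetGlobalAddress @g)), 12)
// matches with Base = @g and Offset = 12, i.e. the [g+12] addressing form.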
3617 
3618 // symbol+offset
3619 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3620  SDValue &Base, SDValue &Offset) {
3621  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3622 }
3623 
3624 // symbol+offset
3625 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3626  SDValue &Base, SDValue &Offset) {
3627  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3628 }
3629 
3630 // register+offset
3631 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3632  SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3633  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3634  Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3635  Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3636  return true;
3637  }
3638  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3639  Addr.getOpcode() == ISD::TargetGlobalAddress)
3640  return false; // direct calls.
3641 
3642  if (Addr.getOpcode() == ISD::ADD) {
3643  if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3644  return false;
3645  }
3646  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3647  if (FrameIndexSDNode *FIN =
3648  dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3649  // Constant offset from frame ref.
3650  Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3651  else
3652  Base = Addr.getOperand(0);
3653  Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3654  mvt);
3655  return true;
3656  }
3657  }
3658  return false;
3659 }
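// For example, (add %ptr, 16) matches with Base = %ptr and Offset = 16
// (the [%r+16] form), and a bare frame index matches with an offset of 0.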
3660 
3661 // register+offset
3662 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3663  SDValue &Base, SDValue &Offset) {
3664  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3665 }
3666 
3667 // register+offset
3668 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3669  SDValue &Base, SDValue &Offset) {
3670  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3671 }
3672 
3673 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3674  unsigned int spN) const {
3675  const Value *Src = nullptr;
3676  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3677  if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3678  return true;
3679  Src = mN->getMemOperand()->getValue();
3680  }
3681  if (!Src)
3682  return false;
3683  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3684  return (PT->getAddressSpace() == spN);
3685  return false;
3686 }
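// For example, ChkMemSDNodeAddressSpace(N, 1) tests whether the underlying
// IR pointer lives in addrspace(1), NVPTX's global space; callers in this
// file pass the ADDRESS_SPACE_* constants.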
3687 
3688 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3689 /// inline asm expressions.
3690 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3691  const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3692  SDValue Op0, Op1;
3693  switch (ConstraintID) {
3694  default:
3695  return true;
3696  case InlineAsm::Constraint_m: // memory
3697  if (SelectDirectAddr(Op, Op0)) {
3698  OutOps.push_back(Op0);
3699  OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3700  return false;
3701  }
3702  if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3703  OutOps.push_back(Op0);
3704  OutOps.push_back(Op1);
3705  return false;
3706  }
3707  break;
3708  }
3709  return true;
3710 }
3711 
3712 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3713 /// conversion from \p SrcTy to \p DestTy.
3714 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3715  bool IsSigned) {
3716  switch (SrcTy.SimpleTy) {
3717  default:
3718  llvm_unreachable("Unhandled source type");
3719  case MVT::i8:
3720  switch (DestTy.SimpleTy) {
3721  default:
3722  llvm_unreachable("Unhandled dest type");
3723  case MVT::i16:
3724  return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3725  case MVT::i32:
3726  return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3727  case MVT::i64:
3728  return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3729  }
3730  case MVT::i16:
3731  switch (DestTy.SimpleTy) {
3732  default:
3733  llvm_unreachable("Unhandled dest type");
3734  case MVT::i8:
3735  return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3736  case MVT::i32:
3737  return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3738  case MVT::i64:
3739  return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3740  }
3741  case MVT::i32:
3742  switch (DestTy.SimpleTy) {
3743  default:
3744  llvm_unreachable("Unhandled dest type");
3745  case MVT::i8:
3746  return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3747  case MVT::i16:
3748  return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3749  case MVT::i64:
3750  return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3751  }
3752  case MVT::i64:
3753  switch (DestTy.SimpleTy) {
3754  default:
3755  llvm_unreachable("Unhandled dest type");
3756  case MVT::i8:
3757  return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3758  case MVT::i16:
3759  return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3760  case MVT::i32:
3761  return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3762  }
3763  }
3764 }
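// For example, GetConvertOpcode(MVT::i32, MVT::i16, /*IsSigned=*/true)
// returns NVPTX::CVT_s32_s16, the same cvt opcode family used by the
// StoreParamS32 special case above.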