LLVM  8.0.0svn
NVPTXISelDAGToDAG.cpp
Go to the documentation of this file.
1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines an instruction selector for the NVPTX target.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "NVPTXISelDAGToDAG.h"
15 #include "NVPTXUtilities.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
21 #include "llvm/Support/Debug.h"
25 
26 using namespace llvm;
27 
28 #define DEBUG_TYPE "nvptx-isel"
29 
30 /// createNVPTXISelDag - This pass converts a legalized DAG into a
31 /// NVPTX-specific DAG, ready for instruction scheduling.
33  llvm::CodeGenOpt::Level OptLevel) {
34  return new NVPTXDAGToDAGISel(TM, OptLevel);
35 }
36 
38  CodeGenOpt::Level OptLevel)
39  : SelectionDAGISel(tm, OptLevel), TM(tm) {
40  doMulWide = (OptLevel > 0);
41 }
42 
44  Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
46 }
47 
48 int NVPTXDAGToDAGISel::getDivF32Level() const {
50 }
51 
52 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
54 }
55 
56 bool NVPTXDAGToDAGISel::useF32FTZ() const {
58 }
59 
60 bool NVPTXDAGToDAGISel::allowFMA() const {
62  return TL->allowFMA(*MF, OptLevel);
63 }
64 
65 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
67  return TL->allowUnsafeFPMath(*MF);
68 }
69 
70 bool NVPTXDAGToDAGISel::useShortPointers() const {
71  return TM.useShortPointers();
72 }
73 
74 /// Select - Select instructions not customized! Used for
75 /// expanded, promoted and normal instructions.
76 void NVPTXDAGToDAGISel::Select(SDNode *N) {
77 
78  if (N->isMachineOpcode()) {
79  N->setNodeId(-1);
80  return; // Already selected.
81  }
82 
83  switch (N->getOpcode()) {
84  case ISD::LOAD:
85  case ISD::ATOMIC_LOAD:
86  if (tryLoad(N))
87  return;
88  break;
89  case ISD::STORE:
90  case ISD::ATOMIC_STORE:
91  if (tryStore(N))
92  return;
93  break;
95  if (tryEXTRACT_VECTOR_ELEMENT(N))
96  return;
97  break;
99  SelectSETP_F16X2(N);
100  return;
101 
102  case NVPTXISD::LoadV2:
103  case NVPTXISD::LoadV4:
104  if (tryLoadVector(N))
105  return;
106  break;
107  case NVPTXISD::LDGV2:
108  case NVPTXISD::LDGV4:
109  case NVPTXISD::LDUV2:
110  case NVPTXISD::LDUV4:
111  if (tryLDGLDU(N))
112  return;
113  break;
114  case NVPTXISD::StoreV2:
115  case NVPTXISD::StoreV4:
116  if (tryStoreVector(N))
117  return;
118  break;
119  case NVPTXISD::LoadParam:
122  if (tryLoadParam(N))
123  return;
124  break;
128  if (tryStoreRetval(N))
129  return;
130  break;
136  if (tryStoreParam(N))
137  return;
138  break;
140  if (tryIntrinsicNoChain(N))
141  return;
142  break;
144  if (tryIntrinsicChain(N))
145  return;
146  break;
315  if (tryTextureIntrinsic(N))
316  return;
317  break;
483  if (trySurfaceIntrinsic(N))
484  return;
485  break;
486  case ISD::AND:
487  case ISD::SRA:
488  case ISD::SRL:
489  // Try to select BFE
490  if (tryBFE(N))
491  return;
492  break;
493  case ISD::ADDRSPACECAST:
494  SelectAddrSpaceCast(N);
495  return;
496  case ISD::ConstantFP:
497  if (tryConstantFP16(N))
498  return;
499  break;
500  default:
501  break;
502  }
503  SelectCode(N);
504 }
505 
506 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
507  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
508  switch (IID) {
509  default:
510  return false;
511  case Intrinsic::nvvm_ldg_global_f:
512  case Intrinsic::nvvm_ldg_global_i:
513  case Intrinsic::nvvm_ldg_global_p:
514  case Intrinsic::nvvm_ldu_global_f:
515  case Intrinsic::nvvm_ldu_global_i:
516  case Intrinsic::nvvm_ldu_global_p:
517  return tryLDGLDU(N);
518  }
519 }
520 
521 // There's no way to specify FP16 immediates in .f16 ops, so we have to
522 // load them into an .f16 register first.
523 bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
524  if (N->getValueType(0) != MVT::f16)
525  return false;
527  cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
528  SDNode *LoadConstF16 =
529  CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
530  ReplaceNode(N, LoadConstF16);
531  return true;
532 }
533 
534 // Map ISD:CONDCODE value to appropriate CmpMode expected by
535 // NVPTXInstPrinter::printCmpMode()
536 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
538  unsigned PTXCmpMode = [](ISD::CondCode CC) {
539  switch (CC) {
540  default:
541  llvm_unreachable("Unexpected condition code.");
542  case ISD::SETOEQ:
543  return CmpMode::EQ;
544  case ISD::SETOGT:
545  return CmpMode::GT;
546  case ISD::SETOGE:
547  return CmpMode::GE;
548  case ISD::SETOLT:
549  return CmpMode::LT;
550  case ISD::SETOLE:
551  return CmpMode::LE;
552  case ISD::SETONE:
553  return CmpMode::NE;
554  case ISD::SETO:
555  return CmpMode::NUM;
556  case ISD::SETUO:
557  return CmpMode::NotANumber;
558  case ISD::SETUEQ:
559  return CmpMode::EQU;
560  case ISD::SETUGT:
561  return CmpMode::GTU;
562  case ISD::SETUGE:
563  return CmpMode::GEU;
564  case ISD::SETULT:
565  return CmpMode::LTU;
566  case ISD::SETULE:
567  return CmpMode::LEU;
568  case ISD::SETUNE:
569  return CmpMode::NEU;
570  case ISD::SETEQ:
571  return CmpMode::EQ;
572  case ISD::SETGT:
573  return CmpMode::GT;
574  case ISD::SETGE:
575  return CmpMode::GE;
576  case ISD::SETLT:
577  return CmpMode::LT;
578  case ISD::SETLE:
579  return CmpMode::LE;
580  case ISD::SETNE:
581  return CmpMode::NE;
582  }
583  }(CondCode.get());
584 
585  if (FTZ)
586  PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
587 
588  return PTXCmpMode;
589 }
590 
591 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
592  unsigned PTXCmpMode =
593  getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
594  SDLoc DL(N);
595  SDNode *SetP = CurDAG->getMachineNode(
596  NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
597  N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
598  ReplaceNode(N, SetP);
599  return true;
600 }
601 
602 // Find all instances of extract_vector_elt that use this v2f16 vector
603 // and coalesce them into a scattering move instruction.
604 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
605  SDValue Vector = N->getOperand(0);
606 
607  // We only care about f16x2 as it's the only real vector type we
608  // need to deal with.
609  if (Vector.getSimpleValueType() != MVT::v2f16)
610  return false;
611 
612  // Find and record all uses of this vector that extract element 0 or 1.
614  for (const auto &U : Vector.getNode()->uses()) {
615  if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
616  continue;
617  if (U->getOperand(0) != Vector)
618  continue;
619  if (const ConstantSDNode *IdxConst =
620  dyn_cast<ConstantSDNode>(U->getOperand(1))) {
621  if (IdxConst->getZExtValue() == 0)
622  E0.push_back(U);
623  else if (IdxConst->getZExtValue() == 1)
624  E1.push_back(U);
625  else
626  llvm_unreachable("Invalid vector index.");
627  }
628  }
629 
630  // There's no point scattering f16x2 if we only ever access one
631  // element of it.
632  if (E0.empty() || E1.empty())
633  return false;
634 
635  unsigned Op = NVPTX::SplitF16x2;
636  // If the vector has been BITCAST'ed from i32, we can use original
637  // value directly and avoid register-to-register move.
638  SDValue Source = Vector;
639  if (Vector->getOpcode() == ISD::BITCAST) {
640  Op = NVPTX::SplitI32toF16x2;
641  Source = Vector->getOperand(0);
642  }
643  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
644  // into f16,f16 SplitF16x2(V)
645  SDNode *ScatterOp =
646  CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
647  for (auto *Node : E0)
648  ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
649  for (auto *Node : E1)
650  ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
651 
652  return true;
653 }
654 
655 static unsigned int getCodeAddrSpace(MemSDNode *N) {
656  const Value *Src = N->getMemOperand()->getValue();
657 
658  if (!Src)
660 
661  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
662  switch (PT->getAddressSpace()) {
669  default: break;
670  }
671  }
673 }
674 
676  unsigned CodeAddrSpace, MachineFunction *F) {
677  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
678  // space.
679  //
680  // We have two ways of identifying invariant loads: Loads may be explicitly
681  // marked as invariant, or we may infer them to be invariant.
682  //
683  // We currently infer invariance for loads from
684  // - constant global variables, and
685  // - kernel function pointer params that are noalias (i.e. __restrict) and
686  // never written to.
687  //
688  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
689  // not during the SelectionDAG phase).
690  //
691  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
692  // explicitly invariant loads because these are how clang tells us to use ldg
693  // when the user uses a builtin.
694  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
695  return false;
696 
697  if (N->isInvariant())
698  return true;
699 
700  bool IsKernelFn = isKernelFunction(F->getFunction());
701 
702  // We use GetUnderlyingObjects() here instead of GetUnderlyingObject() mainly
703  // because the former looks through phi nodes while the latter does not. We
704  // need to look through phi nodes to handle pointer induction variables.
706  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
707  Objs, F->getDataLayout());
708 
709  return all_of(Objs, [&](Value *V) {
710  if (auto *A = dyn_cast<const Argument>(V))
711  return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
712  if (auto *GV = dyn_cast<const GlobalVariable>(V))
713  return GV->isConstant();
714  return false;
715  });
716 }
717 
718 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
719  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
720  switch (IID) {
721  default:
722  return false;
723  case Intrinsic::nvvm_texsurf_handle_internal:
724  SelectTexSurfHandle(N);
725  return true;
726  }
727 }
728 
729 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
730  // Op 0 is the intrinsic ID
731  SDValue Wrapper = N->getOperand(1);
732  SDValue GlobalVal = Wrapper.getOperand(0);
733  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
734  MVT::i64, GlobalVal));
735 }
736 
737 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
738  SDValue Src = N->getOperand(0);
739  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
740  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
741  unsigned DstAddrSpace = CastN->getDestAddressSpace();
742  assert(SrcAddrSpace != DstAddrSpace &&
743  "addrspacecast must be between different address spaces");
744 
745  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
746  // Specific to generic
747  unsigned Opc;
748  switch (SrcAddrSpace) {
749  default: report_fatal_error("Bad address space in addrspacecast");
751  Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
752  break;
754  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
755  : NVPTX::cvta_shared_yes_64)
756  : NVPTX::cvta_shared_yes;
757  break;
758  case ADDRESS_SPACE_CONST:
759  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
760  : NVPTX::cvta_const_yes_64)
761  : NVPTX::cvta_const_yes;
762  break;
763  case ADDRESS_SPACE_LOCAL:
764  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
765  : NVPTX::cvta_local_yes_64)
766  : NVPTX::cvta_local_yes;
767  break;
768  }
770  Src));
771  return;
772  } else {
773  // Generic to specific
774  if (SrcAddrSpace != 0)
775  report_fatal_error("Cannot cast between two non-generic address spaces");
776  unsigned Opc;
777  switch (DstAddrSpace) {
778  default: report_fatal_error("Bad address space in addrspacecast");
780  Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
781  : NVPTX::cvta_to_global_yes;
782  break;
784  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
785  : NVPTX::cvta_to_shared_yes_64)
786  : NVPTX::cvta_to_shared_yes;
787  break;
788  case ADDRESS_SPACE_CONST:
789  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
790  : NVPTX::cvta_to_const_yes_64)
791  : NVPTX::cvta_to_const_yes;
792  break;
793  case ADDRESS_SPACE_LOCAL:
794  Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
795  : NVPTX::cvta_to_local_yes_64)
796  : NVPTX::cvta_to_local_yes;
797  break;
798  case ADDRESS_SPACE_PARAM:
799  Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
800  : NVPTX::nvvm_ptr_gen_to_param;
801  break;
802  }
804  Src));
805  return;
806  }
807 }
808 
809 // Helper function template to reduce amount of boilerplate code for
810 // opcode selection.
812  MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
813  unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
814  unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
815  switch (VT) {
816  case MVT::i1:
817  case MVT::i8:
818  return Opcode_i8;
819  case MVT::i16:
820  return Opcode_i16;
821  case MVT::i32:
822  return Opcode_i32;
823  case MVT::i64:
824  return Opcode_i64;
825  case MVT::f16:
826  return Opcode_f16;
827  case MVT::v2f16:
828  return Opcode_f16x2;
829  case MVT::f32:
830  return Opcode_f32;
831  case MVT::f64:
832  return Opcode_f64;
833  default:
834  return None;
835  }
836 }
837 
838 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
839  SDLoc dl(N);
840  MemSDNode *LD = cast<MemSDNode>(N);
841  assert(LD->readMem() && "Expected load");
842  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
843  EVT LoadedVT = LD->getMemoryVT();
844  SDNode *NVPTXLD = nullptr;
845 
846  // do not support pre/post inc/dec
847  if (PlainLoad && PlainLoad->isIndexed())
848  return false;
849 
850  if (!LoadedVT.isSimple())
851  return false;
852 
853  AtomicOrdering Ordering = LD->getOrdering();
854  // In order to lower atomic loads with stronger guarantees we would need to
855  // use load.acquire or insert fences. However these features were only added
856  // with PTX ISA 6.0 / sm_70.
857  // TODO: Check if we can actually use the new instructions and implement them.
858  if (isStrongerThanMonotonic(Ordering))
859  return false;
860 
861  // Address Space Setting
862  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
863  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
864  return tryLDGLDU(N);
865  }
866 
867  unsigned int PointerSize =
869 
870  // Volatile Setting
871  // - .volatile is only available for .global and .shared
872  // - .volatile has the same memory synchronization semantics as .relaxed.sys
873  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
874  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
875  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
876  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
877  isVolatile = false;
878 
879  // Type Setting: fromType + fromTypeWidth
880  //
881  // Sign : ISD::SEXTLOAD
882  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
883  // type is integer
884  // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
885  MVT SimpleVT = LoadedVT.getSimpleVT();
886  MVT ScalarVT = SimpleVT.getScalarType();
887  // Read at least 8 bits (predicates are stored as 8-bit values)
888  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
889  unsigned int fromType;
890 
891  // Vector Setting
892  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
893  if (SimpleVT.isVector()) {
894  assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
895  // v2f16 is loaded using ld.b32
896  fromTypeWidth = 32;
897  }
898 
899  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
901  else if (ScalarVT.isFloatingPoint())
902  // f16 uses .b16 as its storage type.
903  fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
905  else
907 
908  // Create the machine instruction DAG
909  SDValue Chain = N->getOperand(0);
910  SDValue N1 = N->getOperand(1);
911  SDValue Addr;
913  Optional<unsigned> Opcode;
915 
916  if (SelectDirectAddr(N1, Addr)) {
917  Opcode = pickOpcodeForVT(
918  TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
919  NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
920  NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
921  if (!Opcode)
922  return false;
923  SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
924  getI32Imm(vecType, dl), getI32Imm(fromType, dl),
925  getI32Imm(fromTypeWidth, dl), Addr, Chain };
926  NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
927  MVT::Other, Ops);
928  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
929  : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
930  Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
931  NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
932  NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
933  NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
934  if (!Opcode)
935  return false;
936  SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
937  getI32Imm(vecType, dl), getI32Imm(fromType, dl),
938  getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
939  NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
940  MVT::Other, Ops);
941  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
942  : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
943  if (PointerSize == 64)
944  Opcode = pickOpcodeForVT(
945  TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
946  NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
947  NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
948  else
949  Opcode = pickOpcodeForVT(
950  TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
951  NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
952  NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
953  if (!Opcode)
954  return false;
955  SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
956  getI32Imm(vecType, dl), getI32Imm(fromType, dl),
957  getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
958  NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
959  MVT::Other, Ops);
960  } else {
961  if (PointerSize == 64)
962  Opcode = pickOpcodeForVT(
963  TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
964  NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
965  NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
966  NVPTX::LD_f64_areg_64);
967  else
968  Opcode = pickOpcodeForVT(
969  TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
970  NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
971  NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
972  if (!Opcode)
973  return false;
974  SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
975  getI32Imm(vecType, dl), getI32Imm(fromType, dl),
976  getI32Imm(fromTypeWidth, dl), N1, Chain };
977  NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
978  MVT::Other, Ops);
979  }
980 
981  if (!NVPTXLD)
982  return false;
983 
984  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
985  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
986 
987  ReplaceNode(N, NVPTXLD);
988  return true;
989 }
990 
991 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
992 
993  SDValue Chain = N->getOperand(0);
994  SDValue Op1 = N->getOperand(1);
995  SDValue Addr, Offset, Base;
996  Optional<unsigned> Opcode;
997  SDLoc DL(N);
998  SDNode *LD;
999  MemSDNode *MemSD = cast<MemSDNode>(N);
1000  EVT LoadedVT = MemSD->getMemoryVT();
1001 
1002  if (!LoadedVT.isSimple())
1003  return false;
1004 
1005  // Address Space Setting
1006  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
1007  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1008  return tryLDGLDU(N);
1009  }
1010 
1011  unsigned int PointerSize =
1013 
1014  // Volatile Setting
1015  // - .volatile is only availalble for .global and .shared
1016  bool IsVolatile = MemSD->isVolatile();
1017  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1018  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1019  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1020  IsVolatile = false;
1021 
1022  // Vector Setting
1023  MVT SimpleVT = LoadedVT.getSimpleVT();
1024 
1025  // Type Setting: fromType + fromTypeWidth
1026  //
1027  // Sign : ISD::SEXTLOAD
1028  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1029  // type is integer
1030  // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1031  MVT ScalarVT = SimpleVT.getScalarType();
1032  // Read at least 8 bits (predicates are stored as 8-bit values)
1033  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
1034  unsigned int FromType;
1035  // The last operand holds the original LoadSDNode::getExtensionType() value
1036  unsigned ExtensionType = cast<ConstantSDNode>(
1037  N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1038  if (ExtensionType == ISD::SEXTLOAD)
1039  FromType = NVPTX::PTXLdStInstCode::Signed;
1040  else if (ScalarVT.isFloatingPoint())
1041  FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1043  else
1045 
1046  unsigned VecType;
1047 
1048  switch (N->getOpcode()) {
1049  case NVPTXISD::LoadV2:
1051  break;
1052  case NVPTXISD::LoadV4:
1054  break;
1055  default:
1056  return false;
1057  }
1058 
1059  EVT EltVT = N->getValueType(0);
1060 
1061  // v8f16 is a special case. PTX doesn't have ld.v8.f16
1062  // instruction. Instead, we split the vector into v2f16 chunks and
1063  // load them with ld.v4.b32.
1064  if (EltVT == MVT::v2f16) {
1065  assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
1066  EltVT = MVT::i32;
1068  FromTypeWidth = 32;
1069  }
1070 
1071  if (SelectDirectAddr(Op1, Addr)) {
1072  switch (N->getOpcode()) {
1073  default:
1074  return false;
1075  case NVPTXISD::LoadV2:
1076  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1077  NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1078  NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1079  NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
1080  NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1081  break;
1082  case NVPTXISD::LoadV4:
1083  Opcode =
1084  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1085  NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
1086  NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
1087  NVPTX::LDV_f32_v4_avar, None);
1088  break;
1089  }
1090  if (!Opcode)
1091  return false;
1092  SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1093  getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1094  getI32Imm(FromTypeWidth, DL), Addr, Chain };
1095  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1096  } else if (PointerSize == 64
1097  ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
1098  : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1099  switch (N->getOpcode()) {
1100  default:
1101  return false;
1102  case NVPTXISD::LoadV2:
1103  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1104  NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1105  NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1106  NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
1107  NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1108  break;
1109  case NVPTXISD::LoadV4:
1110  Opcode =
1111  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1112  NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
1113  NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
1114  NVPTX::LDV_f32_v4_asi, None);
1115  break;
1116  }
1117  if (!Opcode)
1118  return false;
1119  SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1120  getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1121  getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1122  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1123  } else if (PointerSize == 64
1124  ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1125  : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1126  if (PointerSize == 64) {
1127  switch (N->getOpcode()) {
1128  default:
1129  return false;
1130  case NVPTXISD::LoadV2:
1131  Opcode = pickOpcodeForVT(
1132  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
1133  NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
1134  NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
1135  NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
1136  NVPTX::LDV_f64_v2_ari_64);
1137  break;
1138  case NVPTXISD::LoadV4:
1139  Opcode = pickOpcodeForVT(
1140  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1141  NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
1142  NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
1143  NVPTX::LDV_f32_v4_ari_64, None);
1144  break;
1145  }
1146  } else {
1147  switch (N->getOpcode()) {
1148  default:
1149  return false;
1150  case NVPTXISD::LoadV2:
1151  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1152  NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1153  NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1154  NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
1155  NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1156  break;
1157  case NVPTXISD::LoadV4:
1158  Opcode =
1159  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1160  NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
1161  NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
1162  NVPTX::LDV_f32_v4_ari, None);
1163  break;
1164  }
1165  }
1166  if (!Opcode)
1167  return false;
1168  SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1169  getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1170  getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
1171 
1172  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1173  } else {
1174  if (PointerSize == 64) {
1175  switch (N->getOpcode()) {
1176  default:
1177  return false;
1178  case NVPTXISD::LoadV2:
1179  Opcode = pickOpcodeForVT(
1180  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1181  NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1182  NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
1183  NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1184  NVPTX::LDV_f64_v2_areg_64);
1185  break;
1186  case NVPTXISD::LoadV4:
1187  Opcode = pickOpcodeForVT(
1188  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1189  NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
1190  NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
1191  NVPTX::LDV_f32_v4_areg_64, None);
1192  break;
1193  }
1194  } else {
1195  switch (N->getOpcode()) {
1196  default:
1197  return false;
1198  case NVPTXISD::LoadV2:
1199  Opcode =
1200  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1201  NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1202  NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
1203  NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
1204  NVPTX::LDV_f64_v2_areg);
1205  break;
1206  case NVPTXISD::LoadV4:
1207  Opcode = pickOpcodeForVT(
1208  EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1209  NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
1210  NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
1211  NVPTX::LDV_f32_v4_areg, None);
1212  break;
1213  }
1214  }
1215  if (!Opcode)
1216  return false;
1217  SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
1218  getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1219  getI32Imm(FromTypeWidth, DL), Op1, Chain };
1220  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
1221  }
1222 
1223  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1224  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1225 
1226  ReplaceNode(N, LD);
1227  return true;
1228 }
1229 
1230 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1231 
1232  SDValue Chain = N->getOperand(0);
1233  SDValue Op1;
1234  MemSDNode *Mem;
1235  bool IsLDG = true;
1236 
1237  // If this is an LDG intrinsic, the address is the third operand. If its an
1238  // LDG/LDU SD node (from custom vector handling), then its the second operand
1239  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
1240  Op1 = N->getOperand(2);
1241  Mem = cast<MemIntrinsicSDNode>(N);
1242  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
1243  switch (IID) {
1244  default:
1245  return false;
1246  case Intrinsic::nvvm_ldg_global_f:
1247  case Intrinsic::nvvm_ldg_global_i:
1248  case Intrinsic::nvvm_ldg_global_p:
1249  IsLDG = true;
1250  break;
1251  case Intrinsic::nvvm_ldu_global_f:
1252  case Intrinsic::nvvm_ldu_global_i:
1253  case Intrinsic::nvvm_ldu_global_p:
1254  IsLDG = false;
1255  break;
1256  }
1257  } else {
1258  Op1 = N->getOperand(1);
1259  Mem = cast<MemSDNode>(N);
1260  }
1261 
1262  Optional<unsigned> Opcode;
1263  SDLoc DL(N);
1264  SDNode *LD;
1265  SDValue Base, Offset, Addr;
1266 
1267  EVT EltVT = Mem->getMemoryVT();
1268  unsigned NumElts = 1;
1269  if (EltVT.isVector()) {
1270  NumElts = EltVT.getVectorNumElements();
1271  EltVT = EltVT.getVectorElementType();
1272  // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1273  if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
1274  assert(NumElts % 2 == 0 && "Vector must have even number of elements");
1275  EltVT = MVT::v2f16;
1276  NumElts /= 2;
1277  }
1278  }
1279 
1280  // Build the "promoted" result VTList for the load. If we are really loading
1281  // i8s, then the return type will be promoted to i16 since we do not expose
1282  // 8-bit registers in NVPTX.
1283  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1284  SmallVector<EVT, 5> InstVTs;
1285  for (unsigned i = 0; i != NumElts; ++i) {
1286  InstVTs.push_back(NodeVT);
1287  }
1288  InstVTs.push_back(MVT::Other);
1289  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1290 
1291  if (SelectDirectAddr(Op1, Addr)) {
1292  switch (N->getOpcode()) {
1293  default:
1294  return false;
1295  case ISD::LOAD:
1297  if (IsLDG)
1298  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1299  NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1300  NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1301  NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1302  NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1303  NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1304  NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1305  NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1306  NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1307  else
1308  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1309  NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1310  NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1311  NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1312  NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1313  NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1314  NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1315  NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1316  NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1317  break;
1318  case NVPTXISD::LoadV2:
1319  case NVPTXISD::LDGV2:
1320  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1321  NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1322  NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1323  NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1324  NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1325  NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1326  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1327  NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1328  NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1329  break;
1330  case NVPTXISD::LDUV2:
1331  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1332  NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1333  NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1334  NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1335  NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1336  NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1337  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1338  NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1339  NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1340  break;
1341  case NVPTXISD::LoadV4:
1342  case NVPTXISD::LDGV4:
1343  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1344  NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1345  NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1346  NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
1347  NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1348  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1349  NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
1350  break;
1351  case NVPTXISD::LDUV4:
1352  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1353  NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1354  NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1355  NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
1356  NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1357  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1358  NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
1359  break;
1360  }
1361  if (!Opcode)
1362  return false;
1363  SDValue Ops[] = { Addr, Chain };
1364  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1365  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1366  : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1367  if (TM.is64Bit()) {
1368  switch (N->getOpcode()) {
1369  default:
1370  return false;
1371  case ISD::LOAD:
1373  if (IsLDG)
1374  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1375  NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1376  NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1377  NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1378  NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1379  NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1380  NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1381  NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1382  NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1383  else
1384  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1385  NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1386  NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1387  NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1388  NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1389  NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1390  NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1391  NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1392  NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1393  break;
1394  case NVPTXISD::LoadV2:
1395  case NVPTXISD::LDGV2:
1396  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1397  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1398  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1399  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1400  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1401  NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1402  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1403  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1404  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1405  break;
1406  case NVPTXISD::LDUV2:
1407  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1408  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1409  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1410  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1411  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1412  NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1413  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1414  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1415  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1416  break;
1417  case NVPTXISD::LoadV4:
1418  case NVPTXISD::LDGV4:
1419  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1420  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1421  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1422  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
1423  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1424  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1425  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
1426  break;
1427  case NVPTXISD::LDUV4:
1428  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1429  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1430  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1431  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
1432  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1433  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1434  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
1435  break;
1436  }
1437  } else {
1438  switch (N->getOpcode()) {
1439  default:
1440  return false;
1441  case ISD::LOAD:
1443  if (IsLDG)
1444  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1445  NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1446  NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1447  NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1448  NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1449  NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1450  NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1451  NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1452  NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1453  else
1454  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1455  NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1456  NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1457  NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1458  NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1459  NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1460  NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1461  NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1462  NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1463  break;
1464  case NVPTXISD::LoadV2:
1465  case NVPTXISD::LDGV2:
1466  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1467  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1468  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1469  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1470  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1471  NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1472  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1473  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1474  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1475  break;
1476  case NVPTXISD::LDUV2:
1477  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1478  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1479  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1480  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1481  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1482  NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1483  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1484  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1485  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1486  break;
1487  case NVPTXISD::LoadV4:
1488  case NVPTXISD::LDGV4:
1489  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1490  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1491  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1492  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
1493  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1494  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1495  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
1496  break;
1497  case NVPTXISD::LDUV4:
1498  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1499  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1500  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1501  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
1502  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1503  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1504  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
1505  break;
1506  }
1507  }
1508  if (!Opcode)
1509  return false;
1510  SDValue Ops[] = {Base, Offset, Chain};
1511  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1512  } else {
1513  if (TM.is64Bit()) {
1514  switch (N->getOpcode()) {
1515  default:
1516  return false;
1517  case ISD::LOAD:
1519  if (IsLDG)
1520  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1521  NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1522  NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1523  NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1524  NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1525  NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1526  NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1527  NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1528  NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1529  else
1530  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1531  NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1532  NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1533  NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1534  NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1535  NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1536  NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1537  NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1538  NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1539  break;
1540  case NVPTXISD::LoadV2:
1541  case NVPTXISD::LDGV2:
1542  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1543  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1544  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1545  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1546  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1547  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1548  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1549  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1550  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1551  break;
1552  case NVPTXISD::LDUV2:
1553  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1554  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1555  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1556  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1557  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1558  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1559  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1560  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1561  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1562  break;
1563  case NVPTXISD::LoadV4:
1564  case NVPTXISD::LDGV4:
1565  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1566  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1567  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1568  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
1569  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1570  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1571  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
1572  break;
1573  case NVPTXISD::LDUV4:
1574  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1575  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1576  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1577  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
1578  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1579  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1580  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
1581  break;
1582  }
1583  } else {
1584  switch (N->getOpcode()) {
1585  default:
1586  return false;
1587  case ISD::LOAD:
1589  if (IsLDG)
1590  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1591  NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1592  NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1593  NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1594  NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1595  NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1596  NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1597  NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1598  NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1599  else
1600  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1601  NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1602  NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1603  NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1604  NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1605  NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1606  NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1607  NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1608  NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1609  break;
1610  case NVPTXISD::LoadV2:
1611  case NVPTXISD::LDGV2:
1612  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1613  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1614  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1615  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1616  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1617  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1618  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1619  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1620  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1621  break;
1622  case NVPTXISD::LDUV2:
1623  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1624  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1625  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1626  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1627  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1628  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1629  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1630  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1631  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1632  break;
1633  case NVPTXISD::LoadV4:
1634  case NVPTXISD::LDGV4:
1635  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1636  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1637  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1638  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
1639  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1640  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1641  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
1642  break;
1643  case NVPTXISD::LDUV4:
1644  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1645  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1646  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1647  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
1648  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1649  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1650  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
1651  break;
1652  }
1653  }
1654  if (!Opcode)
1655  return false;
1656  SDValue Ops[] = { Op1, Chain };
1657  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1658  }
1659 
1661  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1662 
1663  // For automatic generation of LDG (through SelectLoad[Vector], not the
1664  // intrinsics), we may have an extending load like:
1665  //
1666  // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1667  //
1668  // In this case, the matching logic above will select a load for the original
1669  // memory type (in this case, i8) and our types will not match (the node needs
1670  // to return an i32 in this case). Our LDG/LDU nodes do not support the
1671  // concept of sign-/zero-extension, so emulate it here by adding an explicit
1672  // CVT instruction. Ptxas should clean up any redundancies here.
1673 
1674  EVT OrigType = N->getValueType(0);
1675  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1676 
1677  if (OrigType != EltVT && LdNode) {
1678  // We have an extending-load. The instruction we selected operates on the
1679  // smaller type, but the SDNode we are replacing has the larger type. We
1680  // need to emit a CVT to make the types match.
1681  bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
1682  unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
1683  EltVT.getSimpleVT(), IsSigned);
1684 
1685  // For each output value, apply the manual sign/zero-extension and make sure
1686  // all users of the load go through that CVT.
1687  for (unsigned i = 0; i != NumElts; ++i) {
1688  SDValue Res(LD, i);
1689  SDValue OrigVal(N, i);
1690 
1691  SDNode *CvtNode =
1692  CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1694  DL, MVT::i32));
1695  ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1696  }
1697  }
1698 
1699  ReplaceNode(N, LD);
1700  return true;
1701 }
1702 
// Select a PTX st.* machine instruction for a scalar store node. Handles
// both plain StoreSDNodes and atomic stores whose ordering is no stronger
// than monotonic. Returns true if the node was replaced with a machine
// node, false to fall back to default instruction selection.
//
// NOTE(review): this listing lost a few physical lines during extraction
// (the initializer of PointerSize and the non-f16 arms of the toType
// selection are missing); comments below describe only what is visible.
1703 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1704  SDLoc dl(N);
1705  MemSDNode *ST = cast<MemSDNode>(N);
1706  assert(ST->writeMem() && "Expected store");
// Exactly one of these two casts succeeds: a regular store or an atomic one.
1707  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1708  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1709  assert((PlainStore || AtomicStore) && "Expected store");
1710  EVT StoreVT = ST->getMemoryVT();
1711  SDNode *NVPTXST = nullptr;
1712 
1713  // do not support pre/post inc/dec
1714  if (PlainStore && PlainStore->isIndexed())
1715  return false;
1716 
1717  if (!StoreVT.isSimple())
1718  return false;
1719 
1720  AtomicOrdering Ordering = ST->getOrdering();
1721  // In order to lower atomic loads with stronger guarantees we would need to
1722  // use store.release or insert fences. However these features were only added
1723  // with PTX ISA 6.0 / sm_70.
1724  // TODO: Check if we can actually use the new instructions and implement them.
1725  if (isStrongerThanMonotonic(Ordering))
1726  return false;
1727 
1728  // Address Space Setting
1729  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1730  unsigned int PointerSize =
1732 
1733  // Volatile Setting
1734  // - .volatile is only available for .global and .shared
1735  // - .volatile has the same memory synchronization semantics as .relaxed.sys
// Monotonic atomic stores are emitted as .volatile; for address spaces where
// PTX has no .volatile form the flag is dropped.
1736  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
1737  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1738  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1739  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1740  isVolatile = false;
1741 
1742  // Vector Setting
1743  MVT SimpleVT = StoreVT.getSimpleVT();
1744  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
1745 
1746  // Type Setting: toType + toTypeWidth
1747  // - for integer type, always use 'u'
1748  //
1749  MVT ScalarVT = SimpleVT.getScalarType();
1750  unsigned toTypeWidth = ScalarVT.getSizeInBits();
1751  if (SimpleVT.isVector()) {
1752  assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
1753  // v2f16 is stored using st.b32
1754  toTypeWidth = 32;
1755  }
1756 
1757  unsigned int toType;
1758  if (ScalarVT.isFloatingPoint())
1759  // f16 uses .b16 as its storage type.
1760  toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1762  else
1764 
1765  // Create the machine instruction DAG
1766  SDValue Chain = ST->getChain();
1767  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1768  SDValue BasePtr = ST->getBasePtr();
1769  SDValue Addr;
1770  SDValue Offset, Base;
1771  Optional<unsigned> Opcode;
1772  MVT::SimpleValueType SourceVT =
1773  Value.getNode()->getSimpleValueType(0).SimpleTy;
1774 
// Try addressing modes from most specific to most general: direct address
// (avar), symbol+immediate (asi), register+immediate (ari/ari_64), and
// finally plain register (areg/areg_64). The first selector that matches
// determines the instruction variant; pickOpcodeForVT then chooses the
// opcode for the stored value's type.
1775  if (SelectDirectAddr(BasePtr, Addr)) {
1776  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1777  NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1778  NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
1779  NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1780  if (!Opcode)
1781  return false;
// Operand order matches the ST_* instruction definitions: value, volatile
// flag, address space, vector kind, type, width, address operand(s), chain.
1782  SDValue Ops[] = {Value,
1783  getI32Imm(isVolatile, dl),
1784  getI32Imm(CodeAddrSpace, dl),
1785  getI32Imm(vecType, dl),
1786  getI32Imm(toType, dl),
1787  getI32Imm(toTypeWidth, dl),
1788  Addr,
1789  Chain};
1790  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1791  } else if (PointerSize == 64
1792  ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1793  : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1794  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1795  NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1796  NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
1797  NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1798  if (!Opcode)
1799  return false;
1800  SDValue Ops[] = {Value,
1801  getI32Imm(isVolatile, dl),
1802  getI32Imm(CodeAddrSpace, dl),
1803  getI32Imm(vecType, dl),
1804  getI32Imm(toType, dl),
1805  getI32Imm(toTypeWidth, dl),
1806  Base,
1807  Offset,
1808  Chain};
1809  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1810  } else if (PointerSize == 64
1811  ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1812  : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
// Register+immediate addressing needs different opcodes for 32- and 64-bit
// pointers.
1813  if (PointerSize == 64)
1814  Opcode = pickOpcodeForVT(
1815  SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1816  NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
1817  NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1818  else
1819  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1820  NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1821  NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
1822  NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1823  if (!Opcode)
1824  return false;
1825 
1826  SDValue Ops[] = {Value,
1827  getI32Imm(isVolatile, dl),
1828  getI32Imm(CodeAddrSpace, dl),
1829  getI32Imm(vecType, dl),
1830  getI32Imm(toType, dl),
1831  getI32Imm(toTypeWidth, dl),
1832  Base,
1833  Offset,
1834  Chain};
1835  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1836  } else {
// Fallback: address is a plain register; use the unmodified base pointer.
1837  if (PointerSize == 64)
1838  Opcode =
1839  pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1840  NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1841  NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
1842  NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1843  else
1844  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1845  NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1846  NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
1847  NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1848  if (!Opcode)
1849  return false;
1850  SDValue Ops[] = {Value,
1851  getI32Imm(isVolatile, dl),
1852  getI32Imm(CodeAddrSpace, dl),
1853  getI32Imm(vecType, dl),
1854  getI32Imm(toType, dl),
1855  getI32Imm(toTypeWidth, dl),
1856  BasePtr,
1857  Chain};
1858  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
1859  }
1860 
1861  if (!NVPTXST)
1862  return false;
1863 
// Transfer the memory operand so later passes (e.g. scheduling, AA) still
// see the original memory access information.
1864  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1865  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1866  ReplaceNode(N, NVPTXST);
1867  return true;
1868 }
1869 
// Select a PTX st.v2/st.v4 machine instruction for a vector store
// (NVPTXISD::StoreV2 / StoreV4). Returns true if the node was replaced,
// false to fall back to default selection.
//
// NOTE(review): this listing lost a few physical lines during extraction
// (the StOps declaration, the VecType assignments in the opcode switch,
// the PointerSize initializer, and the non-f16 arms of the ToType
// selection); comments below describe only what is visible.
1870 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1871  SDValue Chain = N->getOperand(0);
1872  SDValue Op1 = N->getOperand(1);
1873  SDValue Addr, Offset, Base;
1874  Optional<unsigned> Opcode;
1875  SDLoc DL(N);
1876  SDNode *ST;
// Element type of the stored values (operand 1 is the first element).
1877  EVT EltVT = Op1.getValueType();
1878  MemSDNode *MemSD = cast<MemSDNode>(N);
1879  EVT StoreVT = MemSD->getMemoryVT();
1880 
1881  // Address Space Setting
1882  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
// Stores to the constant address space are invalid; fail loudly rather
// than emit a bogus instruction.
1883  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1884  report_fatal_error("Cannot store to pointer that points to constant "
1885  "memory space");
1886  }
1887  unsigned int PointerSize =
1889 
1890  // Volatile Setting
1891  // - .volatile is only available for .global and .shared
1892  bool IsVolatile = MemSD->isVolatile();
1893  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1894  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1895  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1896  IsVolatile = false;
1897 
1898  // Type Setting: toType + toTypeWidth
1899  // - for integer type, always use 'u'
1900  assert(StoreVT.isSimple() && "Store value is not simple");
1901  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1902  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1903  unsigned ToType;
1904  if (ScalarVT.isFloatingPoint())
1905  ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
1907  else
1909 
1911  SDValue N2;
1912  unsigned VecType;
1913 
// Collect the value operands (2 for StoreV2, 4 for StoreV4); the operand
// after the values is the address, captured in N2.
1914  switch (N->getOpcode()) {
1915  case NVPTXISD::StoreV2:
1917  StOps.push_back(N->getOperand(1));
1918  StOps.push_back(N->getOperand(2));
1919  N2 = N->getOperand(3);
1920  break;
1921  case NVPTXISD::StoreV4:
1923  StOps.push_back(N->getOperand(1));
1924  StOps.push_back(N->getOperand(2));
1925  StOps.push_back(N->getOperand(3));
1926  StOps.push_back(N->getOperand(4));
1927  N2 = N->getOperand(5);
1928  break;
1929  default:
1930  return false;
1931  }
1932 
1933  // v8f16 is a special case. PTX doesn't have st.v8.f16
1934  // instruction. Instead, we split the vector into v2f16 chunks and
1935  // store them with st.v4.b32.
1936  if (EltVT == MVT::v2f16) {
1937  assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1938  EltVT = MVT::i32;
1940  ToTypeWidth = 32;
1941  }
1942 
// Immediate flag operands, in the order the STV_* instructions expect.
1943  StOps.push_back(getI32Imm(IsVolatile, DL));
1944  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1945  StOps.push_back(getI32Imm(VecType, DL));
1946  StOps.push_back(getI32Imm(ToType, DL));
1947  StOps.push_back(getI32Imm(ToTypeWidth, DL));
1948 
// Addressing modes, most specific first: direct address (avar),
// symbol+immediate (asi), register+immediate (ari/ari_64), plain
// register (areg/areg_64). Note i64/f64 v4 variants pass None: PTX has
// no 4-wide 64-bit vector stores.
1949  if (SelectDirectAddr(N2, Addr)) {
1950  switch (N->getOpcode()) {
1951  default:
1952  return false;
1953  case NVPTXISD::StoreV2:
1954  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1955  NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1956  NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1957  NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
1958  NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1959  break;
1960  case NVPTXISD::StoreV4:
1961  Opcode =
1962  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
1963  NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
1964  NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
1965  NVPTX::STV_f32_v4_avar, None);
1966  break;
1967  }
1968  StOps.push_back(Addr);
1969  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1970  : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1971  switch (N->getOpcode()) {
1972  default:
1973  return false;
1974  case NVPTXISD::StoreV2:
1975  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1976  NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1977  NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1978  NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
1979  NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1980  break;
1981  case NVPTXISD::StoreV4:
1982  Opcode =
1983  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1984  NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
1985  NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
1986  NVPTX::STV_f32_v4_asi, None);
1987  break;
1988  }
1989  StOps.push_back(Base);
1990  StOps.push_back(Offset);
1991  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1992  : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1993  if (PointerSize == 64) {
1994  switch (N->getOpcode()) {
1995  default:
1996  return false;
1997  case NVPTXISD::StoreV2:
1998  Opcode = pickOpcodeForVT(
1999  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
2000  NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
2001  NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
2002  NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
2003  NVPTX::STV_f64_v2_ari_64);
2004  break;
2005  case NVPTXISD::StoreV4:
2006  Opcode = pickOpcodeForVT(
2007  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2008  NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
2009  NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
2010  NVPTX::STV_f32_v4_ari_64, None);
2011  break;
2012  }
2013  } else {
2014  switch (N->getOpcode()) {
2015  default:
2016  return false;
2017  case NVPTXISD::StoreV2:
2018  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2019  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2020  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2021  NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
2022  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2023  break;
2024  case NVPTXISD::StoreV4:
2025  Opcode =
2026  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
2027  NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
2028  NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
2029  NVPTX::STV_f32_v4_ari, None);
2030  break;
2031  }
2032  }
2033  StOps.push_back(Base);
2034  StOps.push_back(Offset);
2035  } else {
2036  if (PointerSize == 64) {
2037  switch (N->getOpcode()) {
2038  default:
2039  return false;
2040  case NVPTXISD::StoreV2:
2041  Opcode = pickOpcodeForVT(
2042  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2043  NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2044  NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
2045  NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2046  NVPTX::STV_f64_v2_areg_64);
2047  break;
2048  case NVPTXISD::StoreV4:
2049  Opcode = pickOpcodeForVT(
2050  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2051  NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
2052  NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
2053  NVPTX::STV_f32_v4_areg_64, None);
2054  break;
2055  }
2056  } else {
2057  switch (N->getOpcode()) {
2058  default:
2059  return false;
2060  case NVPTXISD::StoreV2:
2061  Opcode =
2062  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2063  NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2064  NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
2065  NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
2066  NVPTX::STV_f64_v2_areg);
2067  break;
2068  case NVPTXISD::StoreV4:
2069  Opcode =
2070  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2071  NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
2072  NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
2073  NVPTX::STV_f32_v4_areg, None);
2074  break;
2075  }
2076  }
2077  StOps.push_back(N2);
2078  }
2079 
2080  if (!Opcode)
2081  return false;
2082 
2083  StOps.push_back(Chain);
2084 
2085  ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);
2086 
// Preserve the original memory operand on the new machine node.
2087  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2088  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
2089 
2090  ReplaceNode(N, ST);
2091  return true;
2092 }
2093 
// Select a LoadParamMem* machine instruction for a NVPTXISD::LoadParam /
// LoadParamV2 / LoadParamV4 node, which reads an incoming function
// parameter at a constant byte offset. The node is glue-chained (operand 3
// is the incoming glue) so it stays ordered with the surrounding call
// sequence. Returns true if the node was replaced, false otherwise.
//
// NOTE(review): the declaration of the Ops vector was dropped from this
// listing during extraction (between lines 2160 and 2163).
2094 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2095  SDValue Chain = Node->getOperand(0);
2096  SDValue Offset = Node->getOperand(2);
2097  SDValue Flag = Node->getOperand(3);
2098  SDLoc DL(Node);
2099  MemSDNode *Mem = cast<MemSDNode>(Node);
2100 
// Number of result elements, determined by the opcode.
2101  unsigned VecSize;
2102  switch (Node->getOpcode()) {
2103  default:
2104  return false;
2105  case NVPTXISD::LoadParam:
2106  VecSize = 1;
2107  break;
2108  case NVPTXISD::LoadParamV2:
2109  VecSize = 2;
2110  break;
2111  case NVPTXISD::LoadParamV4:
2112  VecSize = 4;
2113  break;
2114  }
2115 
// EltVT is the node's result type; MemVT is the in-memory parameter type
// (they differ for extending loads, e.g. i8 params promoted to i16/i32).
2116  EVT EltVT = Node->getValueType(0);
2117  EVT MemVT = Mem->getMemoryVT();
2118 
2119  Optional<unsigned> Opcode;
2120 
// Pick the opcode from the memory type. V4 passes None for i64/f64:
// there are no 4-wide 64-bit param loads.
2121  switch (VecSize) {
2122  default:
2123  return false;
2124  case 1:
2125  Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2126  NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2127  NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2128  NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2129  NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2130  break;
2131  case 2:
2132  Opcode =
2133  pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2134  NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2135  NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2136  NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2137  NVPTX::LoadParamMemV2F64);
2138  break;
2139  case 4:
2140  Opcode = pickOpcodeForVT(
2141  MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2142  NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
2143  NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2144  NVPTX::LoadParamMemV4F32, None);
2145  break;
2146  }
2147  if (!Opcode)
2148  return false;
2149 
// Result list: VecSize results of EltVT, then chain and glue so the node
// stays sequenced within the call lowering.
2150  SDVTList VTs;
2151  if (VecSize == 1) {
2152  VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2153  } else if (VecSize == 2) {
2154  VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2155  } else {
2156  EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2157  VTs = CurDAG->getVTList(EVTs);
2158  }
2159 
2160  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2161 
2163  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2164  Ops.push_back(Chain);
2165  Ops.push_back(Flag);
2166 
2167  ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
2168  return true;
2169 }
2170 
// Select a StoreRetval* machine instruction for a NVPTXISD::StoreRetval
// (or its V2/V4 variant), which writes return value element(s) at a
// constant byte offset in the return parameter space. Returns true if the
// node was replaced, false otherwise.
//
// NOTE(review): the V2/V4 case labels in the first switch and the
// declaration of the Ops vector were dropped from this listing during
// extraction; the NumElts assignments for 2 and 4 are still visible.
2171 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2172  SDLoc DL(N);
2173  SDValue Chain = N->getOperand(0);
2174  SDValue Offset = N->getOperand(1);
2175  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2176  MemSDNode *Mem = cast<MemSDNode>(N);
2177 
2178  // How many elements do we have?
2179  unsigned NumElts = 1;
2180  switch (N->getOpcode()) {
2181  default:
2182  return false;
2183  case NVPTXISD::StoreRetval:
2184  NumElts = 1;
2185  break;
2187  NumElts = 2;
2188  break;
2190  NumElts = 4;
2191  break;
2192  }
2193 
2194  // Build vector of operands
// Value operands start at operand index 2 (after chain and offset),
// followed by the offset immediate and the chain.
2196  for (unsigned i = 0; i < NumElts; ++i)
2197  Ops.push_back(N->getOperand(i + 2));
2198  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2199  Ops.push_back(Chain);
2200 
2201  // Determine target opcode
2202  // If we have an i1, use an 8-bit store. The lowering code in
2203  // NVPTXISelLowering will have already emitted an upcast.
// The "= 0" initializer makes the Optional engaged with value 0, but it
// is overwritten in every reachable case below, so the !Opcode check
// only fires when pickOpcodeForVT returns None.
2204  Optional<unsigned> Opcode = 0;
2205  switch (NumElts) {
2206  default:
2207  return false;
2208  case 1:
2209  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2210  NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2211  NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2212  NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2213  NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2214  break;
2215  case 2:
2216  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2217  NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2218  NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2219  NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2220  NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2221  break;
2222  case 4:
// None for i64/f64: no 4-wide 64-bit retval stores exist.
2223  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2224  NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2225  NVPTX::StoreRetvalV4I32, None,
2226  NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2227  NVPTX::StoreRetvalV4F32, None);
2228  break;
2229  }
2230  if (!Opcode)
2231  return false;
2232 
2233  SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
// Carry the memory operand over to the selected machine node.
2234  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2235  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2236 
2237  ReplaceNode(N, Ret);
2238  return true;
2239 }
2240 
2241 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2242  SDLoc DL(N);
2243  SDValue Chain = N->getOperand(0);
2244  SDValue Param = N->getOperand(1);
2245  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2246  SDValue Offset = N->getOperand(2);
2247  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2248  MemSDNode *Mem = cast<MemSDNode>(N);
2249  SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2250 
2251  // How many elements do we have?
2252  unsigned NumElts = 1;
2253  switch (N->getOpcode()) {
2254  default:
2255  return false;
2258  case NVPTXISD::StoreParam:
2259  NumElts = 1;
2260  break;
2262  NumElts = 2;
2263  break;
2265  NumElts = 4;
2266  break;
2267  }
2268 
2269  // Build vector of operands
2271  for (unsigned i = 0; i < NumElts; ++i)
2272  Ops.push_back(N->getOperand(i + 3));
2273  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2274  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2275  Ops.push_back(Chain);
2276  Ops.push_back(Flag);
2277 
2278  // Determine target opcode
2279  // If we have an i1, use an 8-bit store. The lowering code in
2280  // NVPTXISelLowering will have already emitted an upcast.
2281  Optional<unsigned> Opcode = 0;
2282  switch (N->getOpcode()) {
2283  default:
2284  switch (NumElts) {
2285  default:
2286  return false;
2287  case 1:
2288  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2289  NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2290  NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2291  NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2292  NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2293  break;
2294  case 2:
2295  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2296  NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2297  NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2298  NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2299  NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2300  break;
2301  case 4:
2302  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2303  NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2304  NVPTX::StoreParamV4I32, None,
2305  NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2306  NVPTX::StoreParamV4F32, None);
2307  break;
2308  }
2309  if (!Opcode)
2310  return false;
2311  break;
2312  // Special case: if we have a sign-extend/zero-extend node, insert the
2313  // conversion instruction first, and use that as the value operand to
2314  // the selected StoreParam node.
2315  case NVPTXISD::StoreParamU32: {
2316  Opcode = NVPTX::StoreParamI32;
2318  MVT::i32);
2319  SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2320  MVT::i32, Ops[0], CvtNone);
2321  Ops[0] = SDValue(Cvt, 0);
2322  break;
2323  }
2324  case NVPTXISD::StoreParamS32: {
2325  Opcode = NVPTX::StoreParamI32;
2327  MVT::i32);
2328  SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2329  MVT::i32, Ops[0], CvtNone);
2330  Ops[0] = SDValue(Cvt, 0);
2331  break;
2332  }
2333  }
2334 
2336  SDNode *Ret =
2337  CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
2338  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2339  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2340 
2341  ReplaceNode(N, Ret);
2342  return true;
2343 }
2344 
2345 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2346  unsigned Opc = 0;
2347 
2348  switch (N->getOpcode()) {
2349  default: return false;
2351  Opc = NVPTX::TEX_1D_F32_S32;
2352  break;
2354  Opc = NVPTX::TEX_1D_F32_F32;
2355  break;
2357  Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
2358  break;
2360  Opc = NVPTX::TEX_1D_F32_F32_GRAD;
2361  break;
2362  case NVPTXISD::Tex1DS32S32:
2363  Opc = NVPTX::TEX_1D_S32_S32;
2364  break;
2366  Opc = NVPTX::TEX_1D_S32_F32;
2367  break;
2369  Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
2370  break;
2372  Opc = NVPTX::TEX_1D_S32_F32_GRAD;
2373  break;
2374  case NVPTXISD::Tex1DU32S32:
2375  Opc = NVPTX::TEX_1D_U32_S32;
2376  break;
2378  Opc = NVPTX::TEX_1D_U32_F32;
2379  break;
2381  Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
2382  break;
2384  Opc = NVPTX::TEX_1D_U32_F32_GRAD;
2385  break;
2387  Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
2388  break;
2390  Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
2391  break;
2393  Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
2394  break;
2396  Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
2397  break;
2399  Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
2400  break;
2402  Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
2403  break;
2405  Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
2406  break;
2408  Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
2409  break;
2411  Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
2412  break;
2414  Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
2415  break;
2417  Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
2418  break;
2420  Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
2421  break;
2423  Opc = NVPTX::TEX_2D_F32_S32;
2424  break;
2426  Opc = NVPTX::TEX_2D_F32_F32;
2427  break;
2429  Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
2430  break;
2432  Opc = NVPTX::TEX_2D_F32_F32_GRAD;
2433  break;
2434  case NVPTXISD::Tex2DS32S32:
2435  Opc = NVPTX::TEX_2D_S32_S32;
2436  break;
2438  Opc = NVPTX::TEX_2D_S32_F32;
2439  break;
2441  Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
2442  break;
2444  Opc = NVPTX::TEX_2D_S32_F32_GRAD;
2445  break;
2446  case NVPTXISD::Tex2DU32S32:
2447  Opc = NVPTX::TEX_2D_U32_S32;
2448  break;
2450  Opc = NVPTX::TEX_2D_U32_F32;
2451  break;
2453  Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
2454  break;
2456  Opc = NVPTX::TEX_2D_U32_F32_GRAD;
2457  break;
2459  Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
2460  break;
2462  Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
2463  break;
2465  Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
2466  break;
2468  Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
2469  break;
2471  Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
2472  break;
2474  Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
2475  break;
2477  Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
2478  break;
2480  Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
2481  break;
2483  Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
2484  break;
2486  Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
2487  break;
2489  Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
2490  break;
2492  Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
2493  break;
2495  Opc = NVPTX::TEX_3D_F32_S32;
2496  break;
2498  Opc = NVPTX::TEX_3D_F32_F32;
2499  break;
2501  Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
2502  break;
2504  Opc = NVPTX::TEX_3D_F32_F32_GRAD;
2505  break;
2506  case NVPTXISD::Tex3DS32S32:
2507  Opc = NVPTX::TEX_3D_S32_S32;
2508  break;
2510  Opc = NVPTX::TEX_3D_S32_F32;
2511  break;
2513  Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
2514  break;
2516  Opc = NVPTX::TEX_3D_S32_F32_GRAD;
2517  break;
2518  case NVPTXISD::Tex3DU32S32:
2519  Opc = NVPTX::TEX_3D_U32_S32;
2520  break;
2522  Opc = NVPTX::TEX_3D_U32_F32;
2523  break;
2525  Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
2526  break;
2528  Opc = NVPTX::TEX_3D_U32_F32_GRAD;
2529  break;
2531  Opc = NVPTX::TEX_CUBE_F32_F32;
2532  break;
2534  Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
2535  break;
2537  Opc = NVPTX::TEX_CUBE_S32_F32;
2538  break;
2540  Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
2541  break;
2543  Opc = NVPTX::TEX_CUBE_U32_F32;
2544  break;
2546  Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
2547  break;
2549  Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
2550  break;
2552  Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
2553  break;
2555  Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
2556  break;
2558  Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
2559  break;
2561  Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
2562  break;
2564  Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
2565  break;
2567  Opc = NVPTX::TLD4_R_2D_F32_F32;
2568  break;
2570  Opc = NVPTX::TLD4_G_2D_F32_F32;
2571  break;
2573  Opc = NVPTX::TLD4_B_2D_F32_F32;
2574  break;
2576  Opc = NVPTX::TLD4_A_2D_F32_F32;
2577  break;
2579  Opc = NVPTX::TLD4_R_2D_S32_F32;
2580  break;
2582  Opc = NVPTX::TLD4_G_2D_S32_F32;
2583  break;
2585  Opc = NVPTX::TLD4_B_2D_S32_F32;
2586  break;
2588  Opc = NVPTX::TLD4_A_2D_S32_F32;
2589  break;
2591  Opc = NVPTX::TLD4_R_2D_U32_F32;
2592  break;
2594  Opc = NVPTX::TLD4_G_2D_U32_F32;
2595  break;
2597  Opc = NVPTX::TLD4_B_2D_U32_F32;
2598  break;
2600  Opc = NVPTX::TLD4_A_2D_U32_F32;
2601  break;
2603  Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
2604  break;
2606  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
2607  break;
2609  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
2610  break;
2612  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
2613  break;
2615  Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
2616  break;
2618  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
2619  break;
2621  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
2622  break;
2624  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
2625  break;
2627  Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
2628  break;
2630  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
2631  break;
2633  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
2634  break;
2636  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
2637  break;
2639  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
2640  break;
2642  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
2643  break;
2645  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
2646  break;
2648  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
2649  break;
2651  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
2652  break;
2654  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
2655  break;
2657  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
2658  break;
2660  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
2661  break;
2663  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
2664  break;
2666  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
2667  break;
2669  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
2670  break;
2672  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
2673  break;
2675  Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
2676  break;
2678  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
2679  break;
2681  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
2682  break;
2684  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
2685  break;
2687  Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
2688  break;
2690  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
2691  break;
2693  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
2694  break;
2696  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
2697  break;
2699  Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
2700  break;
2702  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
2703  break;
2705  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
2706  break;
2708  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
2709  break;
2711  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
2712  break;
2714  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
2715  break;
2717  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
2718  break;
2720  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
2721  break;
2723  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
2724  break;
2726  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
2727  break;
2729  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
2730  break;
2732  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
2733  break;
2735  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
2736  break;
2738  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
2739  break;
2741  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
2742  break;
2744  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
2745  break;
2747  Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
2748  break;
2750  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
2751  break;
2753  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
2754  break;
2756  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
2757  break;
2759  Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
2760  break;
2762  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
2763  break;
2765  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
2766  break;
2768  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
2769  break;
2771  Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
2772  break;
2774  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
2775  break;
2777  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
2778  break;
2780  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
2781  break;
2783  Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
2784  break;
2786  Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
2787  break;
2789  Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
2790  break;
2792  Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
2793  break;
2795  Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
2796  break;
2798  Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
2799  break;
2801  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
2802  break;
2804  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
2805  break;
2807  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
2808  break;
2810  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
2811  break;
2813  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
2814  break;
2816  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
2817  break;
2819  Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
2820  break;
2822  Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
2823  break;
2825  Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
2826  break;
2828  Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
2829  break;
2831  Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
2832  break;
2834  Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
2835  break;
2837  Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
2838  break;
2840  Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
2841  break;
2843  Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
2844  break;
2846  Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
2847  break;
2849  Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
2850  break;
2852  Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
2853  break;
2854  }
2855 
2856  // Copy over operands
2857  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
2858  Ops.push_back(N->getOperand(0)); // Move chain to the back.
2859 
2860  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2861  return true;
2862 }
2863 
2864 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2865  unsigned Opc = 0;
2866  switch (N->getOpcode()) {
2867  default: return false;
2869  Opc = NVPTX::SULD_1D_I8_CLAMP;
2870  break;
2872  Opc = NVPTX::SULD_1D_I16_CLAMP;
2873  break;
2875  Opc = NVPTX::SULD_1D_I32_CLAMP;
2876  break;
2878  Opc = NVPTX::SULD_1D_I64_CLAMP;
2879  break;
2881  Opc = NVPTX::SULD_1D_V2I8_CLAMP;
2882  break;
2884  Opc = NVPTX::SULD_1D_V2I16_CLAMP;
2885  break;
2887  Opc = NVPTX::SULD_1D_V2I32_CLAMP;
2888  break;
2890  Opc = NVPTX::SULD_1D_V2I64_CLAMP;
2891  break;
2893  Opc = NVPTX::SULD_1D_V4I8_CLAMP;
2894  break;
2896  Opc = NVPTX::SULD_1D_V4I16_CLAMP;
2897  break;
2899  Opc = NVPTX::SULD_1D_V4I32_CLAMP;
2900  break;
2902  Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
2903  break;
2905  Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
2906  break;
2908  Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
2909  break;
2911  Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
2912  break;
2914  Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
2915  break;
2917  Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
2918  break;
2920  Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
2921  break;
2923  Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
2924  break;
2926  Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
2927  break;
2929  Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
2930  break;
2932  Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
2933  break;
2935  Opc = NVPTX::SULD_2D_I8_CLAMP;
2936  break;
2938  Opc = NVPTX::SULD_2D_I16_CLAMP;
2939  break;
2941  Opc = NVPTX::SULD_2D_I32_CLAMP;
2942  break;
2944  Opc = NVPTX::SULD_2D_I64_CLAMP;
2945  break;
2947  Opc = NVPTX::SULD_2D_V2I8_CLAMP;
2948  break;
2950  Opc = NVPTX::SULD_2D_V2I16_CLAMP;
2951  break;
2953  Opc = NVPTX::SULD_2D_V2I32_CLAMP;
2954  break;
2956  Opc = NVPTX::SULD_2D_V2I64_CLAMP;
2957  break;
2959  Opc = NVPTX::SULD_2D_V4I8_CLAMP;
2960  break;
2962  Opc = NVPTX::SULD_2D_V4I16_CLAMP;
2963  break;
2965  Opc = NVPTX::SULD_2D_V4I32_CLAMP;
2966  break;
2968  Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
2969  break;
2971  Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
2972  break;
2974  Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
2975  break;
2977  Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
2978  break;
2980  Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
2981  break;
2983  Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
2984  break;
2986  Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
2987  break;
2989  Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
2990  break;
2992  Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
2993  break;
2995  Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
2996  break;
2998  Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
2999  break;
3001  Opc = NVPTX::SULD_3D_I8_CLAMP;
3002  break;
3004  Opc = NVPTX::SULD_3D_I16_CLAMP;
3005  break;
3007  Opc = NVPTX::SULD_3D_I32_CLAMP;
3008  break;
3010  Opc = NVPTX::SULD_3D_I64_CLAMP;
3011  break;
3013  Opc = NVPTX::SULD_3D_V2I8_CLAMP;
3014  break;
3016  Opc = NVPTX::SULD_3D_V2I16_CLAMP;
3017  break;
3019  Opc = NVPTX::SULD_3D_V2I32_CLAMP;
3020  break;
3022  Opc = NVPTX::SULD_3D_V2I64_CLAMP;
3023  break;
3025  Opc = NVPTX::SULD_3D_V4I8_CLAMP;
3026  break;
3028  Opc = NVPTX::SULD_3D_V4I16_CLAMP;
3029  break;
3031  Opc = NVPTX::SULD_3D_V4I32_CLAMP;
3032  break;
3034  Opc = NVPTX::SULD_1D_I8_TRAP;
3035  break;
3037  Opc = NVPTX::SULD_1D_I16_TRAP;
3038  break;
3040  Opc = NVPTX::SULD_1D_I32_TRAP;
3041  break;
3043  Opc = NVPTX::SULD_1D_I64_TRAP;
3044  break;
3046  Opc = NVPTX::SULD_1D_V2I8_TRAP;
3047  break;
3049  Opc = NVPTX::SULD_1D_V2I16_TRAP;
3050  break;
3052  Opc = NVPTX::SULD_1D_V2I32_TRAP;
3053  break;
3055  Opc = NVPTX::SULD_1D_V2I64_TRAP;
3056  break;
3058  Opc = NVPTX::SULD_1D_V4I8_TRAP;
3059  break;
3061  Opc = NVPTX::SULD_1D_V4I16_TRAP;
3062  break;
3064  Opc = NVPTX::SULD_1D_V4I32_TRAP;
3065  break;
3067  Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
3068  break;
3070  Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
3071  break;
3073  Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
3074  break;
3076  Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
3077  break;
3079  Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
3080  break;
3082  Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
3083  break;
3085  Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
3086  break;
3088  Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
3089  break;
3091  Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
3092  break;
3094  Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
3095  break;
3097  Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
3098  break;
3100  Opc = NVPTX::SULD_2D_I8_TRAP;
3101  break;
3103  Opc = NVPTX::SULD_2D_I16_TRAP;
3104  break;
3106  Opc = NVPTX::SULD_2D_I32_TRAP;
3107  break;
3109  Opc = NVPTX::SULD_2D_I64_TRAP;
3110  break;
3112  Opc = NVPTX::SULD_2D_V2I8_TRAP;
3113  break;
3115  Opc = NVPTX::SULD_2D_V2I16_TRAP;
3116  break;
3118  Opc = NVPTX::SULD_2D_V2I32_TRAP;
3119  break;
3121  Opc = NVPTX::SULD_2D_V2I64_TRAP;
3122  break;
3124  Opc = NVPTX::SULD_2D_V4I8_TRAP;
3125  break;
3127  Opc = NVPTX::SULD_2D_V4I16_TRAP;
3128  break;
3130  Opc = NVPTX::SULD_2D_V4I32_TRAP;
3131  break;
3133  Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
3134  break;
3136  Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
3137  break;
3139  Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
3140  break;
3142  Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
3143  break;
3145  Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
3146  break;
3148  Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
3149  break;
3151  Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
3152  break;
3154  Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
3155  break;
3157  Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
3158  break;
3160  Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
3161  break;
3163  Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
3164  break;
3166  Opc = NVPTX::SULD_3D_I8_TRAP;
3167  break;
3169  Opc = NVPTX::SULD_3D_I16_TRAP;
3170  break;
3172  Opc = NVPTX::SULD_3D_I32_TRAP;
3173  break;
3175  Opc = NVPTX::SULD_3D_I64_TRAP;
3176  break;
3178  Opc = NVPTX::SULD_3D_V2I8_TRAP;
3179  break;
3181  Opc = NVPTX::SULD_3D_V2I16_TRAP;
3182  break;
3184  Opc = NVPTX::SULD_3D_V2I32_TRAP;
3185  break;
3187  Opc = NVPTX::SULD_3D_V2I64_TRAP;
3188  break;
3190  Opc = NVPTX::SULD_3D_V4I8_TRAP;
3191  break;
3193  Opc = NVPTX::SULD_3D_V4I16_TRAP;
3194  break;
3196  Opc = NVPTX::SULD_3D_V4I32_TRAP;
3197  break;
3199  Opc = NVPTX::SULD_1D_I8_ZERO;
3200  break;
3202  Opc = NVPTX::SULD_1D_I16_ZERO;
3203  break;
3205  Opc = NVPTX::SULD_1D_I32_ZERO;
3206  break;
3208  Opc = NVPTX::SULD_1D_I64_ZERO;
3209  break;
3211  Opc = NVPTX::SULD_1D_V2I8_ZERO;
3212  break;
3214  Opc = NVPTX::SULD_1D_V2I16_ZERO;
3215  break;
3217  Opc = NVPTX::SULD_1D_V2I32_ZERO;
3218  break;
3220  Opc = NVPTX::SULD_1D_V2I64_ZERO;
3221  break;
3223  Opc = NVPTX::SULD_1D_V4I8_ZERO;
3224  break;
3226  Opc = NVPTX::SULD_1D_V4I16_ZERO;
3227  break;
3229  Opc = NVPTX::SULD_1D_V4I32_ZERO;
3230  break;
3232  Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
3233  break;
3235  Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
3236  break;
3238  Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
3239  break;
3241  Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
3242  break;
3244  Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
3245  break;
3247  Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
3248  break;
3250  Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
3251  break;
3253  Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
3254  break;
3256  Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
3257  break;
3259  Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
3260  break;
3262  Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
3263  break;
3265  Opc = NVPTX::SULD_2D_I8_ZERO;
3266  break;
3268  Opc = NVPTX::SULD_2D_I16_ZERO;
3269  break;
3271  Opc = NVPTX::SULD_2D_I32_ZERO;
3272  break;
3274  Opc = NVPTX::SULD_2D_I64_ZERO;
3275  break;
3277  Opc = NVPTX::SULD_2D_V2I8_ZERO;
3278  break;
3280  Opc = NVPTX::SULD_2D_V2I16_ZERO;
3281  break;
3283  Opc = NVPTX::SULD_2D_V2I32_ZERO;
3284  break;
3286  Opc = NVPTX::SULD_2D_V2I64_ZERO;
3287  break;
3289  Opc = NVPTX::SULD_2D_V4I8_ZERO;
3290  break;
3292  Opc = NVPTX::SULD_2D_V4I16_ZERO;
3293  break;
3295  Opc = NVPTX::SULD_2D_V4I32_ZERO;
3296  break;
3298  Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
3299  break;
3301  Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
3302  break;
3304  Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
3305  break;
3307  Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
3308  break;
3310  Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
3311  break;
3313  Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
3314  break;
3316  Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
3317  break;
3319  Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
3320  break;
3322  Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
3323  break;
3325  Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
3326  break;
3328  Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
3329  break;
3331  Opc = NVPTX::SULD_3D_I8_ZERO;
3332  break;
3334  Opc = NVPTX::SULD_3D_I16_ZERO;
3335  break;
3337  Opc = NVPTX::SULD_3D_I32_ZERO;
3338  break;
3340  Opc = NVPTX::SULD_3D_I64_ZERO;
3341  break;
3343  Opc = NVPTX::SULD_3D_V2I8_ZERO;
3344  break;
3346  Opc = NVPTX::SULD_3D_V2I16_ZERO;
3347  break;
3349  Opc = NVPTX::SULD_3D_V2I32_ZERO;
3350  break;
3352  Opc = NVPTX::SULD_3D_V2I64_ZERO;
3353  break;
3355  Opc = NVPTX::SULD_3D_V4I8_ZERO;
3356  break;
3358  Opc = NVPTX::SULD_3D_V4I16_ZERO;
3359  break;
3361  Opc = NVPTX::SULD_3D_V4I32_ZERO;
3362  break;
3363  }
3364 
3365  // Copy over operands
3366  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
3367  Ops.push_back(N->getOperand(0)); // Move chain to the back.
3368 
3369  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3370  return true;
3371 }
3372 
3373 
3374 /// SelectBFE - Look for instruction sequences that can be made more efficient
3375 /// by using the 'bfe' (bit-field extract) PTX instruction
3376 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3377  SDLoc DL(N);
3378  SDValue LHS = N->getOperand(0);
3379  SDValue RHS = N->getOperand(1);
3380  SDValue Len;
3381  SDValue Start;
3382  SDValue Val;
3383  bool IsSigned = false;
3384 
3385  if (N->getOpcode() == ISD::AND) {
3386  // Canonicalize the operands
3387  // We want 'and %val, %mask'
3388  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3389  std::swap(LHS, RHS);
3390  }
3391 
3393  if (!Mask) {
3394  // We need a constant mask on the RHS of the AND
3395  return false;
3396  }
3397 
3398  // Extract the mask bits
3399  uint64_t MaskVal = Mask->getZExtValue();
3400  if (!isMask_64(MaskVal)) {
3401  // We *could* handle shifted masks here, but doing so would require an
3402  // 'and' operation to fix up the low-order bits so we would trade
3403  // shr+and for bfe+and, which has the same throughput
3404  return false;
3405  }
3406 
3407  // How many bits are in our mask?
3408  uint64_t NumBits = countTrailingOnes(MaskVal);
3409  Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3410 
3411  if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3412  // We have a 'srl/and' pair, extract the effective start bit and length
3413  Val = LHS.getNode()->getOperand(0);
3414  Start = LHS.getNode()->getOperand(1);
3415  ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3416  if (StartConst) {
3417  uint64_t StartVal = StartConst->getZExtValue();
3418  // How many "good" bits do we have left? "good" is defined here as bits
3419  // that exist in the original value, not shifted in.
3420  uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3421  if (NumBits > GoodBits) {
3422  // Do not handle the case where bits have been shifted in. In theory
3423  // we could handle this, but the cost is likely higher than just
3424  // emitting the srl/and pair.
3425  return false;
3426  }
3427  Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3428  } else {
3429  // Do not handle the case where the shift amount (can be zero if no srl
3430  // was found) is not constant. We could handle this case, but it would
3431  // require run-time logic that would be more expensive than just
3432  // emitting the srl/and pair.
3433  return false;
3434  }
3435  } else {
3436  // Do not handle the case where the LHS of the and is not a shift. While
3437  // it would be trivial to handle this case, it would just transform
3438  // 'and' -> 'bfe', but 'and' has higher-throughput.
3439  return false;
3440  }
3441  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3442  if (LHS->getOpcode() == ISD::AND) {
3443  ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3444  if (!ShiftCnst) {
3445  // Shift amount must be constant
3446  return false;
3447  }
3448 
3449  uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3450 
3451  SDValue AndLHS = LHS->getOperand(0);
3452  SDValue AndRHS = LHS->getOperand(1);
3453 
3454  // Canonicalize the AND to have the mask on the RHS
3455  if (isa<ConstantSDNode>(AndLHS)) {
3456  std::swap(AndLHS, AndRHS);
3457  }
3458 
3459  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3460  if (!MaskCnst) {
3461  // Mask must be constant
3462  return false;
3463  }
3464 
3465  uint64_t MaskVal = MaskCnst->getZExtValue();
3466  uint64_t NumZeros;
3467  uint64_t NumBits;
3468  if (isMask_64(MaskVal)) {
3469  NumZeros = 0;
3470  // The number of bits in the result bitfield will be the number of
3471  // trailing ones (the AND) minus the number of bits we shift off
3472  NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
3473  } else if (isShiftedMask_64(MaskVal)) {
3474  NumZeros = countTrailingZeros(MaskVal);
3475  unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
3476  // The number of bits in the result bitfield will be the number of
3477  // trailing zeros plus the number of set bits in the mask minus the
3478  // number of bits we shift off
3479  NumBits = NumZeros + NumOnes - ShiftAmt;
3480  } else {
3481  // This is not a mask we can handle
3482  return false;
3483  }
3484 
3485  if (ShiftAmt < NumZeros) {
3486  // Handling this case would require extra logic that would make this
3487  // transformation non-profitable
3488  return false;
3489  }
3490 
3491  Val = AndLHS;
3492  Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3493  Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3494  } else if (LHS->getOpcode() == ISD::SHL) {
3495  // Here, we have a pattern like:
3496  //
3497  // (sra (shl val, NN), MM)
3498  // or
3499  // (srl (shl val, NN), MM)
3500  //
3501  // If MM >= NN, we can efficiently optimize this with bfe
3502  Val = LHS->getOperand(0);
3503 
3504  SDValue ShlRHS = LHS->getOperand(1);
3505  ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3506  if (!ShlCnst) {
3507  // Shift amount must be constant
3508  return false;
3509  }
3510  uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3511 
3512  SDValue ShrRHS = RHS;
3513  ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3514  if (!ShrCnst) {
3515  // Shift amount must be constant
3516  return false;
3517  }
3518  uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3519 
3520  // To avoid extra codegen and be profitable, we need Outer >= Inner
3521  if (OuterShiftAmt < InnerShiftAmt) {
3522  return false;
3523  }
3524 
3525  // If the outer shift is more than the type size, we have no bitfield to
3526  // extract (since we also check that the inner shift is <= the outer shift
3527  // then this also implies that the inner shift is < the type size)
3528  if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3529  return false;
3530  }
3531 
3532  Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3533  MVT::i32);
3534  Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3535  DL, MVT::i32);
3536 
3537  if (N->getOpcode() == ISD::SRA) {
3538  // If we have a arithmetic right shift, we need to use the signed bfe
3539  // variant
3540  IsSigned = true;
3541  }
3542  } else {
3543  // No can do...
3544  return false;
3545  }
3546  } else {
3547  // No can do...
3548  return false;
3549  }
3550 
3551 
3552  unsigned Opc;
3553  // For the BFE operations we form here from "and" and "srl", always use the
3554  // unsigned variants.
3555  if (Val.getValueType() == MVT::i32) {
3556  if (IsSigned) {
3557  Opc = NVPTX::BFE_S32rii;
3558  } else {
3559  Opc = NVPTX::BFE_U32rii;
3560  }
3561  } else if (Val.getValueType() == MVT::i64) {
3562  if (IsSigned) {
3563  Opc = NVPTX::BFE_S64rii;
3564  } else {
3565  Opc = NVPTX::BFE_U64rii;
3566  }
3567  } else {
3568  // We cannot handle this type
3569  return false;
3570  }
3571 
3572  SDValue Ops[] = {
3573  Val, Start, Len
3574  };
3575 
3576  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3577  return true;
3578 }
3579 
3580 // SelectDirectAddr - Match a direct address for DAG.
3581 // A direct address could be a globaladdress or externalsymbol.
3582 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3583  // Return true if TGA or ES.
3584  if (N.getOpcode() == ISD::TargetGlobalAddress ||
3586  Address = N;
3587  return true;
3588  }
3589  if (N.getOpcode() == NVPTXISD::Wrapper) {
3590  Address = N.getOperand(0);
3591  return true;
3592  }
3593  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3594  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3595  if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3596  CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3597  CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3598  return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3599  }
3600  return false;
3601 }
3602 
3603 // symbol+offset
3604 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3605  SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3606  if (Addr.getOpcode() == ISD::ADD) {
3607  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3608  SDValue base = Addr.getOperand(0);
3609  if (SelectDirectAddr(base, Base)) {
3610  Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3611  mvt);
3612  return true;
3613  }
3614  }
3615  }
3616  return false;
3617 }
3618 
// symbol+offset
// SelectADDRsi - Select a symbol+offset address whose offset operand is
// 32-bit; thin wrapper that fixes mvt to MVT::i32.
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3624 
// symbol+offset
// SelectADDRsi64 - Select a symbol+offset address whose offset operand is
// 64-bit; thin wrapper that fixes mvt to MVT::i64.
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3630 
3631 // register+offset
3632 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3633  SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3634  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3635  Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3636  Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3637  return true;
3638  }
3639  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3641  return false; // direct calls.
3642 
3643  if (Addr.getOpcode() == ISD::ADD) {
3644  if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3645  return false;
3646  }
3647  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3648  if (FrameIndexSDNode *FIN =
3649  dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3650  // Constant offset from frame ref.
3651  Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3652  else
3653  Base = Addr.getOperand(0);
3654  Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3655  mvt);
3656  return true;
3657  }
3658  }
3659  return false;
3660 }
3661 
// register+offset
// SelectADDRri - Select a register+offset address whose offset operand is
// 32-bit; thin wrapper that fixes mvt to MVT::i32.
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
3667 
// register+offset
// SelectADDRri64 - Select a register+offset address whose offset operand is
// 64-bit; thin wrapper that fixes mvt to MVT::i64.
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
3673 
3674 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3675  unsigned int spN) const {
3676  const Value *Src = nullptr;
3677  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3678  if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3679  return true;
3680  Src = mN->getMemOperand()->getValue();
3681  }
3682  if (!Src)
3683  return false;
3684  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3685  return (PT->getAddressSpace() == spN);
3686  return false;
3687 }
3688 
3689 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3690 /// inline asm expressions.
3692  const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3693  SDValue Op0, Op1;
3694  switch (ConstraintID) {
3695  default:
3696  return true;
3697  case InlineAsm::Constraint_m: // memory
3698  if (SelectDirectAddr(Op, Op0)) {
3699  OutOps.push_back(Op0);
3700  OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3701  return false;
3702  }
3703  if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3704  OutOps.push_back(Op0);
3705  OutOps.push_back(Op1);
3706  return false;
3707  }
3708  break;
3709  }
3710  return true;
3711 }
3712 
/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
/// Only the integer types i8/i16/i32/i64 are handled; any other source or
/// destination type aborts via llvm_unreachable. \p IsSigned selects the
/// s<dest>_s<src> variant, otherwise the u<dest>_u<src> variant.
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             bool IsSigned) {
  // Dispatch on source type first, then destination type.
  switch (SrcTy.SimpleTy) {
  default:
    llvm_unreachable("Unhandled source type");
  case MVT::i8:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
    }
  case MVT::i16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
    }
  case MVT::i32:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
    }
  case MVT::i64:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
    }
  }
}
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:549
bool isInvariant() const
EVT getValueType() const
Return the ValueType of the referenced return value.
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
Atomic ordering constants.
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:139
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
bool isVector() const
Return true if this is a vector value type.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static Optional< unsigned > pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16, unsigned Opcode_i32, Optional< unsigned > Opcode_i64, unsigned Opcode_f16, unsigned Opcode_f16x2, unsigned Opcode_f32, Optional< unsigned > Opcode_f64)
SDVTList getVTList() const
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:253
const SDValue & getChain() const
bool useF32FTZ(const MachineFunction &MF) const
static unsigned int getCodeAddrSpace(MemSDNode *N)
TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or anything else with this node...
Definition: ISDOpcodes.h:131
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1042
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:360
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ)
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
F(f)
void setNodeId(int Id)
Set unique node id.
SDNode * getNode() const
get the SDNode which holds the desired result
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const NVPTXSubtarget * Subtarget
unsigned getValueSizeInBits() const
Returns the size of the value in bits.
MachineFunction * MF
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:159
unsigned getAddressSpace() const
Return the address space for the associated pointer.
NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, CodeGenOpt::Level OptLevel)
A description of a memory reference used in the backend.
Shift and rotation operations.
Definition: ISDOpcodes.h:390
std::size_t countTrailingOnes(T Value, ZeroBehavior ZB=ZB_Width)
Count the number of ones from the least significant bit to the first zero bit.
Definition: MathExtras.h:478
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s), MachineInstr opcode, and operands.
op_iterator op_end() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
SimpleValueType SimpleTy
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:398
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
This is an SDNode representing atomic operations.
AtomicOrdering getOrdering() const
Return the atomic ordering requirements for this memory operation.
This represents a list of ValueTypes that has been interned by a SelectionDAG.
AtomicOrdering
Atomic ordering for LLVM&#39;s memory model.
unsigned getSizeInBits() const
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
bool isKernelFunction(const Function &F)
bool writeMem() const
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
Definition: Lint.cpp:84
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:411
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:625
const T & getValue() const LLVM_LVALUE_FUNCTION
Definition: Optional.h:179
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:201
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out...
Definition: ISDOpcodes.h:928
op_iterator op_begin() const
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector< SDValue > &OutOps) override
SelectInlineAsmMemoryOperand - Implement addressing mode selection for inline asm expressions...
#define EQ(a, b)
Definition: regexec.c:112
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:573
bool isStrongerThanMonotonic(AtomicOrdering ao)
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:151
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT)
Definition: SelectionDAG.h:605
unsigned getSrcAddressSpace() const
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
This class is used to represent ISD::STORE nodes.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:118
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
const Value * getValue() const
Return the base address of the memory access.
bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const
CodeGenOpt::Level OptLevel
std::size_t countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count the number of 0s from the least significant bit to the most, stopping at the first 1...
Definition: MathExtras.h:120
Machine Value Type.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:273
const SDValue & getOperand(unsigned Num) const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getDestAddressSpace() const
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F)
static ManagedStatic< std::set< EVT, EVT::compareRawBits > > EVTs
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
Extended Value Type.
Definition: ValueTypes.h:34
bool allowUnsafeFPMath(MachineFunction &MF) const
bool isVolatile() const
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode...
unsigned getNumOperands() const
Return the number of values used by this operation.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:323
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:265
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:553
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
This is an abstract virtual class for memory operations.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
bool readMem() const
const Function & getFunction() const
Return the LLVM function that this machine code represents.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:941
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
EVT getMemoryVT() const
Return the type of the in-memory value.
iterator_range< use_iterator > uses()
NVPTXTargetMachine.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:423
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:151
FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel)
createNVPTXISelDag - This pass converts a legalized DAG into a NVPTX-specific DAG, ready for instruction scheduling.
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:371
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:56
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:583
#define N
ISD::CondCode get() const
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
unsigned getOpcode() const
void GetUnderlyingObjects(Value *V, SmallVectorImpl< Value *> &Objects, const DataLayout &DL, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to GetUnderlyingObject except that it can look through phi and select instruct...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:73
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:81
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
static bool isVolatile(Instruction *Inst)
const SDValue & getOperand(unsigned i) const
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:758
uint64_t getZExtValue() const
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:126
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand *> NewMemRefs)
Mutate the specified machine node&#39;s memory references to the provided list.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation...
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:754
const SDValue & getBasePtr() const
This class is used to represent ISD::LOAD nodes.