//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"

/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOpt::Level OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOpt::Level OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > 0);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;

  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  // ... (the remaining NVPTXISD texture opcode labels are elided here) ...
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  // ... (the remaining NVPTXISD surface opcode labels are elided here) ...
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}

// Each instruction has four addressing variants. The WMMA_VARIANTS() macro
// below constructs an array, indexed by WmmaVariant, which getWmmaLdVariant()
// uses to look up the opcode of a particular variant.
enum WmmaVariant {
  WMMA_VARIANT_ARI64,
  WMMA_VARIANT_ARI64_STRIDE,
  WMMA_VARIANT_AVAR,
  WMMA_VARIANT_AVAR_STRIDE,
};

// clang-format off
#define WMMA_VARIANTS(base) \
  {{ base##_ari64, base##_ari64_stride, base##_avar, base##_avar_stride }}
// clang-format on

static unsigned getWmmaLdVariant(WmmaVariant Variant, bool Stride,
                                 const std::array<unsigned, 4> Variants) {
  if (Stride) {
    if (Variant == WMMA_VARIANT_ARI64)
      Variant = WMMA_VARIANT_ARI64_STRIDE;
    else if (Variant == WMMA_VARIANT_AVAR)
      Variant = WMMA_VARIANT_AVAR_STRIDE;
  }
  return Variants[Variant];
}
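
// Illustrative expansion of the token-pasting macro above: for a base such as
// NVPTX::INT_WMMA_LOAD_A_col, WMMA_VARIANTS yields the initializer
//   {{ NVPTX::INT_WMMA_LOAD_A_col_ari64,
//      NVPTX::INT_WMMA_LOAD_A_col_ari64_stride,
//      NVPTX::INT_WMMA_LOAD_A_col_avar,
//      NVPTX::INT_WMMA_LOAD_A_col_avar_stride }}
// so, e.g., getWmmaLdVariant(WMMA_VARIANT_AVAR, /*Stride=*/true, ...) on that
// array returns the _avar_stride opcode.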

static Optional<unsigned>
getWmmaLdStOpcode(unsigned IntrinsicID,
                  WmmaVariant Variant = WMMA_VARIANT_ARI64) {
  switch (IntrinsicID) {
  default:
    return None;
  //
  // WMMA_LOAD_A f16
  //
  case Intrinsic::nvvm_wmma_load_a_f16_col:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col));
  case Intrinsic::nvvm_wmma_load_a_f16_row:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row));
  case Intrinsic::nvvm_wmma_load_a_f16_col_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col));
  case Intrinsic::nvvm_wmma_load_a_f16_row_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row));
  case Intrinsic::nvvm_wmma_load_a_f16_col_shared:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_shared));
  case Intrinsic::nvvm_wmma_load_a_f16_row_shared:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_shared));
  case Intrinsic::nvvm_wmma_load_a_f16_col_shared_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_shared));
  case Intrinsic::nvvm_wmma_load_a_f16_row_shared_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_shared));
  case Intrinsic::nvvm_wmma_load_a_f16_col_global:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_global));
  case Intrinsic::nvvm_wmma_load_a_f16_row_global:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_global));
  case Intrinsic::nvvm_wmma_load_a_f16_col_global_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_col_global));
  case Intrinsic::nvvm_wmma_load_a_f16_row_global_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_A_row_global));

  //
  // WMMA_LOAD_B f16
  //
  case Intrinsic::nvvm_wmma_load_b_f16_col:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col));
  case Intrinsic::nvvm_wmma_load_b_f16_row:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row));
  case Intrinsic::nvvm_wmma_load_b_f16_col_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col));
  case Intrinsic::nvvm_wmma_load_b_f16_row_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row));
  case Intrinsic::nvvm_wmma_load_b_f16_col_shared:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_shared));
  case Intrinsic::nvvm_wmma_load_b_f16_row_shared:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_shared));
  case Intrinsic::nvvm_wmma_load_b_f16_col_shared_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_shared));
  case Intrinsic::nvvm_wmma_load_b_f16_row_shared_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_shared));
  case Intrinsic::nvvm_wmma_load_b_f16_col_global:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_global));
  case Intrinsic::nvvm_wmma_load_b_f16_row_global:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_global));
  case Intrinsic::nvvm_wmma_load_b_f16_col_global_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_col_global));
  case Intrinsic::nvvm_wmma_load_b_f16_row_global_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_B_row_global));

  //
  // WMMA_LOAD_C f16
  //
  case Intrinsic::nvvm_wmma_load_c_f16_col:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col));
  case Intrinsic::nvvm_wmma_load_c_f16_row:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row));
  case Intrinsic::nvvm_wmma_load_c_f16_col_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col));
  case Intrinsic::nvvm_wmma_load_c_f16_row_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row));
  case Intrinsic::nvvm_wmma_load_c_f16_col_shared:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_shared));
  case Intrinsic::nvvm_wmma_load_c_f16_row_shared:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_shared));
  case Intrinsic::nvvm_wmma_load_c_f16_col_shared_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_shared));
  case Intrinsic::nvvm_wmma_load_c_f16_row_shared_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_shared));
  case Intrinsic::nvvm_wmma_load_c_f16_col_global:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_global));
  case Intrinsic::nvvm_wmma_load_c_f16_row_global:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_global));
  case Intrinsic::nvvm_wmma_load_c_f16_col_global_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_col_global));
  case Intrinsic::nvvm_wmma_load_c_f16_row_global_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f16_row_global));

  //
  // WMMA_LOAD_C f32
  //
  case Intrinsic::nvvm_wmma_load_c_f32_col:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col));
  case Intrinsic::nvvm_wmma_load_c_f32_row:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row));
  case Intrinsic::nvvm_wmma_load_c_f32_col_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col));
  case Intrinsic::nvvm_wmma_load_c_f32_row_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row));
  case Intrinsic::nvvm_wmma_load_c_f32_col_shared:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_shared));
  case Intrinsic::nvvm_wmma_load_c_f32_row_shared:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_shared));
  case Intrinsic::nvvm_wmma_load_c_f32_col_shared_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_shared));
  case Intrinsic::nvvm_wmma_load_c_f32_row_shared_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_shared));
  case Intrinsic::nvvm_wmma_load_c_f32_col_global:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_global));
  case Intrinsic::nvvm_wmma_load_c_f32_row_global:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_global));
  case Intrinsic::nvvm_wmma_load_c_f32_col_global_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_col_global));
  case Intrinsic::nvvm_wmma_load_c_f32_row_global_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_LOAD_C_f32_row_global));

  //
  // WMMA_STORE_D f16
  //
  case Intrinsic::nvvm_wmma_store_d_f16_col:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col));
  case Intrinsic::nvvm_wmma_store_d_f16_row:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row));
  case Intrinsic::nvvm_wmma_store_d_f16_col_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col));
  case Intrinsic::nvvm_wmma_store_d_f16_row_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row));
  case Intrinsic::nvvm_wmma_store_d_f16_col_shared:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_shared));
  case Intrinsic::nvvm_wmma_store_d_f16_row_shared:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_shared));
  case Intrinsic::nvvm_wmma_store_d_f16_col_shared_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_shared));
  case Intrinsic::nvvm_wmma_store_d_f16_row_shared_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_shared));
  case Intrinsic::nvvm_wmma_store_d_f16_col_global:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_global));
  case Intrinsic::nvvm_wmma_store_d_f16_row_global:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_global));
  case Intrinsic::nvvm_wmma_store_d_f16_col_global_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_col_global));
  case Intrinsic::nvvm_wmma_store_d_f16_row_global_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f16_row_global));

  //
  // WMMA_STORE_D f32
  //
  case Intrinsic::nvvm_wmma_store_d_f32_col:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col));
  case Intrinsic::nvvm_wmma_store_d_f32_row:
    return getWmmaLdVariant(Variant, /*Stride=*/false,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row));
  case Intrinsic::nvvm_wmma_store_d_f32_col_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col));
  case Intrinsic::nvvm_wmma_store_d_f32_row_stride:
    return getWmmaLdVariant(Variant, /*Stride=*/true,
                            WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row));
  case Intrinsic::nvvm_wmma_store_d_f32_col_shared:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_shared));
  case Intrinsic::nvvm_wmma_store_d_f32_row_shared:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_shared));
  case Intrinsic::nvvm_wmma_store_d_f32_col_shared_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_shared));
  case Intrinsic::nvvm_wmma_store_d_f32_row_shared_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_shared));
  case Intrinsic::nvvm_wmma_store_d_f32_col_global:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_global));
  case Intrinsic::nvvm_wmma_store_d_f32_row_global:
    return getWmmaLdVariant(
        Variant, /*Stride=*/false,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_global));
  case Intrinsic::nvvm_wmma_store_d_f32_col_global_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_col_global));
  case Intrinsic::nvvm_wmma_store_d_f32_row_global_stride:
    return getWmmaLdVariant(
        Variant, /*Stride=*/true,
        WMMA_VARIANTS(NVPTX::INT_WMMA_STORE_D_f32_row_global));
  }
}
#undef WMMA_VARIANTS

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  if (getWmmaLdStOpcode(IID))
    return tryWMMA_LDST(N);

  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_match_all_sync_i32p:
  case Intrinsic::nvvm_match_all_sync_i64p:
    SelectMatchAll(N);
    return true;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 immediates in .f16 ops, so we have to
// load them into an .f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
  if (N->getValueType(0) != MVT::f16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
  SDNode *LoadConstF16 =
      CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

// Map ISD::CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode()
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}
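
// For example, (ISD::SETOLT, FTZ=true) produces CmpMode::LT | FTZ_FLAG, which
// the instruction printer renders as the comparison suffix of the emitted
// setp (e.g. the ".lt.ftz" in setp.lt.ftz.f32). Note that the table above
// deliberately maps ordered and don't-care condition codes (e.g. SETOLT and
// SETLT) to the same mode.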

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about f16x2 as it's the only real vector type we
  // need to deal with.
  if (Vector.getSimpleValueType() != MVT::v2f16)
    return false;

  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (const auto &U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  unsigned Op = NVPTX::SplitF16x2;
  // If the vector has been BITCAST'ed from i32, we can use the original
  // value directly and avoid a register-to-register move.
  SDValue Source = Vector;
  if (Vector->getOpcode() == ISD::BITCAST) {
    Op = NVPTX::SplitI32toF16x2;
    Source = Vector->getOperand(0);
  }
  // Merge (f16 extractelt(V, 0), f16 extractelt(V, 1))
  // into f16,f16 SplitF16x2(V)
  SDNode *ScatterOp =
      CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance only for kernel function pointer params that
  // are noalias (i.e. __restrict) and never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  // Load wasn't explicitly invariant. Attempt to infer invariance.
  if (!isKernelFunction(*F->getFunction()))
    return false;

  // We use GetUnderlyingObjects() here instead of
  // GetUnderlyingObject() mainly because the former looks through phi
  // nodes while the latter does not. We need to look through phi
  // nodes to handle pointer induction variables.
  SmallVector<Value *, 8> Objs;
  GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()),
                       Objs, F->getDataLayout());
  for (Value *Obj : Objs) {
    auto *A = dyn_cast<const Argument>(Obj);
    if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr())
      return false;
  }

  return true;
}
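
// Illustrative (hypothetical) example of the inference above: for a kernel
//
//   __global__ void k(const float *__restrict__ in, float *__restrict__ out);
//
// loads through 'in' come from a noalias kernel argument that is never
// written to, so they may be lowered to ld.global.nc even without explicit
// !invariant.load metadata.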

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16:
  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32:
  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16:
  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32:
  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16:
  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32:
  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16:
  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32:
  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16:
  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32:
  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16:
  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32:
  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16:
  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32:
  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16:
  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16_satfinite:
  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32:
  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32_satfinite:
    return tryWMMA_MMA(N);
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectMatchAll(SDNode *N) {
  SDLoc DL(N);
  enum { IS_I64 = 4, HAS_CONST_VALUE = 2, HAS_CONST_MASK = 1 };
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  unsigned OpcodeIndex =
      (IID == Intrinsic::nvvm_match_all_sync_i64p) ? IS_I64 : 0;
  SDValue MaskOp = N->getOperand(2);
  SDValue ValueOp = N->getOperand(3);
  if (ConstantSDNode *ValueConst = dyn_cast<ConstantSDNode>(ValueOp)) {
    OpcodeIndex |= HAS_CONST_VALUE;
    ValueOp = CurDAG->getTargetConstant(ValueConst->getZExtValue(), DL,
                                        ValueConst->getValueType(0));
  }
  if (ConstantSDNode *MaskConst = dyn_cast<ConstantSDNode>(MaskOp)) {
    OpcodeIndex |= HAS_CONST_MASK;
    MaskOp = CurDAG->getTargetConstant(MaskConst->getZExtValue(), DL,
                                       MaskConst->getValueType(0));
  }
  // Maps {IS_I64, HAS_CONST_VALUE, HAS_CONST_MASK} -> opcode
  unsigned Opcodes[8] = {
      NVPTX::MATCH_ALLP_SYNC_32rr, NVPTX::MATCH_ALLP_SYNC_32ri,
      NVPTX::MATCH_ALLP_SYNC_32ir, NVPTX::MATCH_ALLP_SYNC_32ii,
      NVPTX::MATCH_ALLP_SYNC_64rr, NVPTX::MATCH_ALLP_SYNC_64ri,
      NVPTX::MATCH_ALLP_SYNC_64ir, NVPTX::MATCH_ALLP_SYNC_64ii};
  SDNode *NewNode = CurDAG->getMachineNode(
      Opcodes[OpcodeIndex], DL, {ValueOp->getValueType(0), MVT::i1, MVT::Other},
      {MaskOp, ValueOp});
  ReplaceNode(N, NewNode);
}
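
// Worked example of the OpcodeIndex mapping above: match.all.sync on i64 with
// a constant mask and a register value gives
// OpcodeIndex = IS_I64 | HAS_CONST_MASK = 5, i.e. NVPTX::MATCH_ALLP_SYNC_64ri.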

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();

  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? NVPTX::cvta_const_yes_64 : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_local_yes_64 : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_yes_64
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc =
          TM.is64Bit() ? NVPTX::cvta_to_const_yes_64 : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc =
          TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}

// Helper function template to reduce amount of boilerplate code for
// opcode selection.
static Optional<unsigned> pickOpcodeForVT(
    MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
    unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
    unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
    return Opcode_f16;
  case MVT::v2f16:
    return Opcode_f16x2;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return None;
  }
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (LD->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int codeAddrSpace = getCodeAddrSpace(LD);

  if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool isVolatile = LD->isVolatile();
  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign     : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if ((LD->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    fromType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                          : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                          : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (TM.is64Bit())
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  } else {
    if (TM.is64Bit())
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
                                     MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, NVPTXLD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);

  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign     : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    FromType = NVPTX::PTXLdStInstCode::Unsigned;

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }
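
  // Illustrative: a v8f16 load arrives here as NVPTXISD::LoadV4 whose four
  // results are v2f16; retyping the elements as i32 lets it be emitted as a
  // single ld.v4.b32, each 32-bit lane carrying one f16x2 pair.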

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
                          NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                          NVPTX::LDV_f32_v4_avar, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
                          NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                          NVPTX::LDV_f32_v4_asi, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
                            NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                            NVPTX::LDV_f32_v4_ari, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
  }

  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
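
  // For instance, an LDGV4 load of v4i8 gives NumElts = 4 and NodeVT = i16,
  // so InstVTList is {i16, i16, i16, i16, Other}: four promoted element
  // results plus the chain.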
1630 
1631  if (SelectDirectAddr(Op1, Addr)) {
1632  switch (N->getOpcode()) {
1633  default:
1634  return false;
1636  if (IsLDG)
1637  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1638  NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1639  NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
1640  NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1641  NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
1642  NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
1643  NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
1644  NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1645  NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1646  else
1647  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1648  NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1649  NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
1650  NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1651  NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
1652  NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
1653  NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
1654  NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1655  NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1656  break;
1657  case NVPTXISD::LDGV2:
1658  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1659  NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1660  NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1661  NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1662  NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1663  NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
1664  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
1665  NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1666  NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1667  break;
1668  case NVPTXISD::LDUV2:
1669  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1670  NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1671  NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1672  NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1673  NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1674  NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
1675  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
1676  NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1677  NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1678  break;
1679  case NVPTXISD::LDGV4:
1680  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1681  NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1682  NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1683  NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
1684  NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
1685  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
1686  NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
1687  break;
1688  case NVPTXISD::LDUV4:
1689  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1690  NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1691  NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1692  NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
1693  NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
1694  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
1695  NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
1696  break;
1697  }
1698  if (!Opcode)
1699  return false;
1700  SDValue Ops[] = { Addr, Chain };
1701  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1702  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1703  : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1704  if (TM.is64Bit()) {
1705  switch (N->getOpcode()) {
1706  default:
1707  return false;
1708  case ISD::LOAD:
1710  if (IsLDG)
1711  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1712  NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1713  NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1714  NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1715  NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1716  NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
1717  NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
1718  NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1719  NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1720  else
1721  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1722  NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1723  NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1724  NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1725  NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1726  NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
1727  NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
1728  NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1729  NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1730  break;
1731  case NVPTXISD::LoadV2:
1732  case NVPTXISD::LDGV2:
1733  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1734  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1735  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1736  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1737  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1738  NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
1739  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
1740  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1741  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1742  break;
1743  case NVPTXISD::LDUV2:
1744  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1745  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1746  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1747  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1748  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1749  NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
1750  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
1751  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1752  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1753  break;
1754  case NVPTXISD::LoadV4:
1755  case NVPTXISD::LDGV4:
1756  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1757  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1758  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1759  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
1760  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
1761  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
1762  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
1763  break;
1764  case NVPTXISD::LDUV4:
1765  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1766  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1767  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1768  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
1769  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
1770  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
1771  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
1772  break;
1773  }
1774  } else {
1775  switch (N->getOpcode()) {
1776  default:
1777  return false;
1778  case ISD::LOAD:
1780  if (IsLDG)
1781  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1782  NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1783  NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
1784  NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1785  NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
1786  NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
1787  NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
1788  NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1789  NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1790  else
1791  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1792  NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1793  NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
1794  NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1795  NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
1796  NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
1797  NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
1798  NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1799  NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1800  break;
1801  case NVPTXISD::LoadV2:
1802  case NVPTXISD::LDGV2:
1803  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1804  NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1805  NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1806  NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1807  NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1808  NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
1809  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
1810  NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1811  NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1812  break;
1813  case NVPTXISD::LDUV2:
1814  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1815  NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1816  NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1817  NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1818  NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1819  NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
1820  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
1821  NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1822  NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1823  break;
1824  case NVPTXISD::LoadV4:
1825  case NVPTXISD::LDGV4:
1826  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1827  NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1828  NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1829  NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
1830  NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
1831  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
1832  NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
1833  break;
1834  case NVPTXISD::LDUV4:
1835  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1836  NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1837  NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1838  NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
1839  NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
1840  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
1841  NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
1842  break;
1843  }
1844  }
1845  if (!Opcode)
1846  return false;
1847  SDValue Ops[] = {Base, Offset, Chain};
1848  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1849  } else {
1850  if (TM.is64Bit()) {
1851  switch (N->getOpcode()) {
1852  default:
1853  return false;
1854  case ISD::LOAD:
1856  if (IsLDG)
1857  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1858  NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1859  NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1860  NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1861  NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1862  NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
1863  NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
1864  NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1865  NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1866  else
1867  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1868  NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1869  NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1870  NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1871  NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1872  NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
1873  NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
1874  NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1875  NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1876  break;
1877  case NVPTXISD::LoadV2:
1878  case NVPTXISD::LDGV2:
1879  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1880  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1881  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1882  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1883  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1884  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
1885  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
1886  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1887  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1888  break;
1889  case NVPTXISD::LDUV2:
1890  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1891  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1892  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1893  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1894  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1895  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
1896  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
1897  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1898  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1899  break;
1900  case NVPTXISD::LoadV4:
1901  case NVPTXISD::LDGV4:
1902  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1903  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1904  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1905  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
1906  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
1907  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
1908  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
1909  break;
1910  case NVPTXISD::LDUV4:
1911  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1912  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1913  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1914  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
1915  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
1916  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
1917  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
1918  break;
1919  }
1920  } else {
1921  switch (N->getOpcode()) {
1922  default:
1923  return false;
1924  case ISD::LOAD:
1925  case ISD::INTRINSIC_W_CHAIN:
1926  if (IsLDG)
1927  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1928  NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1929  NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1930  NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1931  NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1932  NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
1933  NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
1934  NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1935  NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1936  else
1937  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1938  NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1939  NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1940  NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1941  NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1942  NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
1943  NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
1944  NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1945  NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1946  break;
1947  case NVPTXISD::LoadV2:
1948  case NVPTXISD::LDGV2:
1949  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1950  NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1951  NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1952  NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1953  NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1954  NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
1955  NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
1956  NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1957  NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1958  break;
1959  case NVPTXISD::LDUV2:
1960  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1961  NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1962  NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1963  NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1964  NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1965  NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
1966  NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
1967  NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1968  NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1969  break;
1970  case NVPTXISD::LoadV4:
1971  case NVPTXISD::LDGV4:
1972  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1973  NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1974  NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1975  NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
1976  NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
1977  NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
1978  NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
1979  break;
1980  case NVPTXISD::LDUV4:
1981  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1982  NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1983  NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1984  NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
1985  NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
1986  NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
1987  NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
1988  break;
1989  }
1990  }
1991  if (!Opcode)
1992  return false;
1993  SDValue Ops[] = { Op1, Chain };
1994  LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
1995  }
1996 
1997  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
1998  MemRefs0[0] = Mem->getMemOperand();
1999  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
2000 
2001  // For automatic generation of LDG (through SelectLoad[Vector], not the
2002  // intrinsics), we may have an extending load like:
2003  //
2004  // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
2005  //
2006  // In this case, the matching logic above will select a load for the original
2007  // memory type (in this case, i8) and our types will not match (the node needs
2008  // to return an i32 in this case). Our LDG/LDU nodes do not support the
2009  // concept of sign-/zero-extension, so emulate it here by adding an explicit
2010  // CVT instruction. Ptxas should clean up any redundancies here.
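// As an illustration (a sketch, not output copied from this file), an i8
// LDG that must produce an i32 is selected at the i8 memory type and then
// widened explicitly, roughly:
//   ld.global.nc.u8 %rs1, [%rd1];
//   cvt.u32.u8      %r1, %rs1;
// with all users of the original i32 value rewired to the CVT's result.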
2011 
2012  EVT OrigType = N->getValueType(0);
2013  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
2014 
2015  if (OrigType != EltVT && LdNode) {
2016  // We have an extending-load. The instruction we selected operates on the
2017  // smaller type, but the SDNode we are replacing has the larger type. We
2018  // need to emit a CVT to make the types match.
2019  bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
2020  unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
2021  EltVT.getSimpleVT(), IsSigned);
2022 
2023  // For each output value, apply the manual sign/zero-extension and make sure
2024  // all users of the load go through that CVT.
2025  for (unsigned i = 0; i != NumElts; ++i) {
2026  SDValue Res(LD, i);
2027  SDValue OrigVal(N, i);
2028 
2029  SDNode *CvtNode =
2030  CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
2031  CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
2032  DL, MVT::i32));
2033  ReplaceUses(OrigVal, SDValue(CvtNode, 0));
2034  }
2035  }
2036 
2037  ReplaceNode(N, LD);
2038  return true;
2039 }
2040 
2041 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
2042  SDLoc dl(N);
2043  StoreSDNode *ST = cast<StoreSDNode>(N);
2044  EVT StoreVT = ST->getMemoryVT();
2045  SDNode *NVPTXST = nullptr;
2046 
2047  // do not support pre/post inc/dec
2048  if (ST->isIndexed())
2049  return false;
2050 
2051  if (!StoreVT.isSimple())
2052  return false;
2053 
2054  // Address Space Setting
2055  unsigned int codeAddrSpace = getCodeAddrSpace(ST);
2056 
2057  // Volatile Setting
2058  // - .volatile is only available for .global and .shared
2059  bool isVolatile = ST->isVolatile();
2060  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
2061  codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
2062  codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
2063  isVolatile = false;
2064 
2065  // Vector Setting
2066  MVT SimpleVT = StoreVT.getSimpleVT();
2067  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
2068 
2069  // Type Setting: toType + toTypeWidth
2070  // - for integer type, always use 'u'
2071  //
2072  MVT ScalarVT = SimpleVT.getScalarType();
2073  unsigned toTypeWidth = ScalarVT.getSizeInBits();
2074  if (SimpleVT.isVector()) {
2075  assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
2076  // v2f16 is stored using st.b32
2077  toTypeWidth = 32;
2078  }
2079 
2080  unsigned int toType;
2081  if (ScalarVT.isFloatingPoint())
2082  // f16 uses .b16 as its storage type.
2083  toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
2084  : NVPTX::PTXLdStInstCode::Float;
2085  else
2086  toType = NVPTX::PTXLdStInstCode::Unsigned;
2087 
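// As a hedged example of how these flag operands print: a non-volatile i32
// store to the global space (codeAddrSpace = GLOBAL, vecType = Scalar,
// toType = Unsigned, toTypeWidth = 32) comes out as "st.global.u32".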
2088  // Create the machine instruction DAG
2089  SDValue Chain = N->getOperand(0);
2090  SDValue N1 = N->getOperand(1);
2091  SDValue N2 = N->getOperand(2);
2092  SDValue Addr;
2093  SDValue Offset, Base;
2094  Optional<unsigned> Opcode;
2095  MVT::SimpleValueType SourceVT =
2096  N1.getNode()->getSimpleValueType(0).SimpleTy;
2097  if (SelectDirectAddr(N2, Addr)) {
2098  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
2099  NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
2100  NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
2101  NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
2102  if (!Opcode)
2103  return false;
2104  SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
2105  getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
2106  getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
2107  Chain };
2108  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
2109  } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
2110  : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
2111  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
2112  NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
2113  NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
2114  NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
2115  if (!Opcode)
2116  return false;
2117  SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
2118  getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
2119  getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
2120  Offset, Chain };
2121  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
2122  } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
2123  : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
2124  if (TM.is64Bit())
2125  Opcode = pickOpcodeForVT(
2126  SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
2127  NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
2128  NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
2129  else
2130  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
2131  NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
2132  NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
2133  NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
2134  if (!Opcode)
2135  return false;
2136 
2137  SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
2138  getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
2139  getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
2140  Offset, Chain };
2141  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
2142  } else {
2143  if (TM.is64Bit())
2144  Opcode =
2145  pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
2146  NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
2147  NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
2148  NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
2149  else
2150  Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
2151  NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
2152  NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
2153  NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
2154  if (!Opcode)
2155  return false;
2156  SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
2157  getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
2158  getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
2159  Chain };
2160  NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
2161  }
2162 
2163  if (!NVPTXST)
2164  return false;
2165 
2166  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
2167  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
2168  cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
2169  ReplaceNode(N, NVPTXST);
2170  return true;
2171 }
2172 
2173 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
2174  SDValue Chain = N->getOperand(0);
2175  SDValue Op1 = N->getOperand(1);
2176  SDValue Addr, Offset, Base;
2177  Optional<unsigned> Opcode;
2178  SDLoc DL(N);
2179  SDNode *ST;
2180  EVT EltVT = Op1.getValueType();
2181  MemSDNode *MemSD = cast<MemSDNode>(N);
2182  EVT StoreVT = MemSD->getMemoryVT();
2183 
2184  // Address Space Setting
2185  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
2186 
2187  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
2188  report_fatal_error("Cannot store to pointer that points to constant "
2189  "memory space");
2190  }
2191 
2192  // Volatile Setting
2193  // - .volatile is only available for .global and .shared
2194  bool IsVolatile = MemSD->isVolatile();
2195  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
2196  CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
2197  CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
2198  IsVolatile = false;
2199 
2200  // Type Setting: toType + toTypeWidth
2201  // - for integer type, always use 'u'
2202  assert(StoreVT.isSimple() && "Store value is not simple");
2203  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
2204  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
2205  unsigned ToType;
2206  if (ScalarVT.isFloatingPoint())
2207  ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
2208  : NVPTX::PTXLdStInstCode::Float;
2209  else
2210  ToType = NVPTX::PTXLdStInstCode::Unsigned;
2211 
2212  SmallVector<SDValue, 12> StOps;
2213  SDValue N2;
2214  unsigned VecType;
2215 
2216  switch (N->getOpcode()) {
2217  case NVPTXISD::StoreV2:
2218  VecType = NVPTX::PTXLdStInstCode::V2;
2219  StOps.push_back(N->getOperand(1));
2220  StOps.push_back(N->getOperand(2));
2221  N2 = N->getOperand(3);
2222  break;
2223  case NVPTXISD::StoreV4:
2224  VecType = NVPTX::PTXLdStInstCode::V4;
2225  StOps.push_back(N->getOperand(1));
2226  StOps.push_back(N->getOperand(2));
2227  StOps.push_back(N->getOperand(3));
2228  StOps.push_back(N->getOperand(4));
2229  N2 = N->getOperand(5);
2230  break;
2231  default:
2232  return false;
2233  }
2234 
2235  // v8f16 is a special case. PTX doesn't have an st.v8.f16
2236  // instruction. Instead, we split the vector into v2f16 chunks and
2237  // store them with st.v4.b32.
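// As an illustration (assumed PTX, not taken verbatim from a test), a store
// of <8 x half> therefore becomes four b32 registers, each holding one
// packed f16x2 pair:
//   st.global.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};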
2238  if (EltVT == MVT::v2f16) {
2239  assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
2240  EltVT = MVT::i32;
2241  ToType = NVPTX::PTXLdStInstCode::Untyped;
2242  ToTypeWidth = 32;
2243  }
2244 
2245  StOps.push_back(getI32Imm(IsVolatile, DL));
2246  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
2247  StOps.push_back(getI32Imm(VecType, DL));
2248  StOps.push_back(getI32Imm(ToType, DL));
2249  StOps.push_back(getI32Imm(ToTypeWidth, DL));
2250 
2251  if (SelectDirectAddr(N2, Addr)) {
2252  switch (N->getOpcode()) {
2253  default:
2254  return false;
2255  case NVPTXISD::StoreV2:
2256  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2257  NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
2258  NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
2259  NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
2260  NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
2261  break;
2262  case NVPTXISD::StoreV4:
2263  Opcode =
2264  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
2265  NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
2266  NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
2267  NVPTX::STV_f32_v4_avar, None);
2268  break;
2269  }
2270  StOps.push_back(Addr);
2271  } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
2272  : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
2273  switch (N->getOpcode()) {
2274  default:
2275  return false;
2276  case NVPTXISD::StoreV2:
2277  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2278  NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
2279  NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
2280  NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
2281  NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
2282  break;
2283  case NVPTXISD::StoreV4:
2284  Opcode =
2285  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
2286  NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
2287  NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
2288  NVPTX::STV_f32_v4_asi, None);
2289  break;
2290  }
2291  StOps.push_back(Base);
2292  StOps.push_back(Offset);
2293  } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
2294  : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
2295  if (TM.is64Bit()) {
2296  switch (N->getOpcode()) {
2297  default:
2298  return false;
2299  case NVPTXISD::StoreV2:
2300  Opcode = pickOpcodeForVT(
2301  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
2302  NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
2303  NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
2304  NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
2305  NVPTX::STV_f64_v2_ari_64);
2306  break;
2307  case NVPTXISD::StoreV4:
2308  Opcode = pickOpcodeForVT(
2309  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
2310  NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
2311  NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
2312  NVPTX::STV_f32_v4_ari_64, None);
2313  break;
2314  }
2315  } else {
2316  switch (N->getOpcode()) {
2317  default:
2318  return false;
2319  case NVPTXISD::StoreV2:
2320  Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
2321  NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
2322  NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
2323  NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
2324  NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
2325  break;
2326  case NVPTXISD::StoreV4:
2327  Opcode =
2328  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
2329  NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
2330  NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
2331  NVPTX::STV_f32_v4_ari, None);
2332  break;
2333  }
2334  }
2335  StOps.push_back(Base);
2336  StOps.push_back(Offset);
2337  } else {
2338  if (TM.is64Bit()) {
2339  switch (N->getOpcode()) {
2340  default:
2341  return false;
2342  case NVPTXISD::StoreV2:
2343  Opcode = pickOpcodeForVT(
2344  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
2345  NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
2346  NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
2347  NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
2348  NVPTX::STV_f64_v2_areg_64);
2349  break;
2350  case NVPTXISD::StoreV4:
2351  Opcode = pickOpcodeForVT(
2352  EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
2353  NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
2354  NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
2355  NVPTX::STV_f32_v4_areg_64, None);
2356  break;
2357  }
2358  } else {
2359  switch (N->getOpcode()) {
2360  default:
2361  return false;
2362  case NVPTXISD::StoreV2:
2363  Opcode =
2364  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
2365  NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
2366  NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
2367  NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
2368  NVPTX::STV_f64_v2_areg);
2369  break;
2370  case NVPTXISD::StoreV4:
2371  Opcode =
2372  pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
2373  NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
2374  NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
2375  NVPTX::STV_f32_v4_areg, None);
2376  break;
2377  }
2378  }
2379  StOps.push_back(N2);
2380  }
2381 
2382  if (!Opcode)
2383  return false;
2384 
2385  StOps.push_back(Chain);
2386 
2387  ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);
2388 
2389  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
2390  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
2391  cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);
2392 
2393  ReplaceNode(N, ST);
2394  return true;
2395 }
2396 
2397 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2398  SDValue Chain = Node->getOperand(0);
2399  SDValue Offset = Node->getOperand(2);
2400  SDValue Flag = Node->getOperand(3);
2401  SDLoc DL(Node);
2402  MemSDNode *Mem = cast<MemSDNode>(Node);
2403 
2404  unsigned VecSize;
2405  switch (Node->getOpcode()) {
2406  default:
2407  return false;
2408  case NVPTXISD::LoadParam:
2409  VecSize = 1;
2410  break;
2411  case NVPTXISD::LoadParamV2:
2412  VecSize = 2;
2413  break;
2414  case NVPTXISD::LoadParamV4:
2415  VecSize = 4;
2416  break;
2417  }
2418 
2419  EVT EltVT = Node->getValueType(0);
2420  EVT MemVT = Mem->getMemoryVT();
2421 
2422  Optional<unsigned> Opcode;
2423 
2424  switch (VecSize) {
2425  default:
2426  return false;
2427  case 1:
2428  Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2429  NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2430  NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2431  NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
2432  NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2433  break;
2434  case 2:
2435  Opcode =
2436  pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2437  NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2438  NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
2439  NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
2440  NVPTX::LoadParamMemV2F64);
2441  break;
2442  case 4:
2443  Opcode = pickOpcodeForVT(
2444  MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2445  NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
2446  NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
2447  NVPTX::LoadParamMemV4F32, None);
2448  break;
2449  }
2450  if (!Opcode)
2451  return false;
2452 
2453  SDVTList VTs;
2454  if (VecSize == 1) {
2455  VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2456  } else if (VecSize == 2) {
2457  VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2458  } else {
2459  EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2460  VTs = CurDAG->getVTList(EVTs);
2461  }
2462 
2463  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2464 
2465  SmallVector<SDValue, 2> Ops;
2466  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2467  Ops.push_back(Chain);
2468  Ops.push_back(Flag);
2469 
2470  ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
2471  return true;
2472 }
2473 
2474 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2475  SDLoc DL(N);
2476  SDValue Chain = N->getOperand(0);
2477  SDValue Offset = N->getOperand(1);
2478  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2479  MemSDNode *Mem = cast<MemSDNode>(N);
2480 
2481  // How many elements do we have?
2482  unsigned NumElts = 1;
2483  switch (N->getOpcode()) {
2484  default:
2485  return false;
2486  case NVPTXISD::StoreRetval:
2487  NumElts = 1;
2488  break;
2489  case NVPTXISD::StoreRetvalV2:
2490  NumElts = 2;
2491  break;
2492  case NVPTXISD::StoreRetvalV4:
2493  NumElts = 4;
2494  break;
2495  }
2496 
2497  // Build vector of operands
2498  SmallVector<SDValue, 6> Ops;
2499  for (unsigned i = 0; i < NumElts; ++i)
2500  Ops.push_back(N->getOperand(i + 2));
2501  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2502  Ops.push_back(Chain);
2503 
2504  // Determine target opcode
2505  // If we have an i1, use an 8-bit store. The lowering code in
2506  // NVPTXISelLowering will have already emitted an upcast.
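// (So an i1 return value reaches this point with an i8 memory VT and maps
// onto NVPTX::StoreRetvalI8 below.)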
2507  Optional<unsigned> Opcode;
2508  switch (NumElts) {
2509  default:
2510  return false;
2511  case 1:
2512  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2513  NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2514  NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2515  NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
2516  NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2517  break;
2518  case 2:
2519  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2520  NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2521  NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2522  NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
2523  NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2524  break;
2525  case 4:
2526  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2527  NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2528  NVPTX::StoreRetvalV4I32, None,
2529  NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
2530  NVPTX::StoreRetvalV4F32, None);
2531  break;
2532  }
2533  if (!Opcode)
2534  return false;
2535 
2536  SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
2537  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
2538  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
2539  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
2540 
2541  ReplaceNode(N, Ret);
2542  return true;
2543 }
2544 
2545 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2546  SDLoc DL(N);
2547  SDValue Chain = N->getOperand(0);
2548  SDValue Param = N->getOperand(1);
2549  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2550  SDValue Offset = N->getOperand(2);
2551  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2552  MemSDNode *Mem = cast<MemSDNode>(N);
2553  SDValue Flag = N->getOperand(N->getNumOperands() - 1);
2554 
2555  // How many elements do we have?
2556  unsigned NumElts = 1;
2557  switch (N->getOpcode()) {
2558  default:
2559  return false;
2560  case NVPTXISD::StoreParamU32:
2561  case NVPTXISD::StoreParamS32:
2562  case NVPTXISD::StoreParam:
2563  NumElts = 1;
2564  break;
2565  case NVPTXISD::StoreParamV2:
2566  NumElts = 2;
2567  break;
2568  case NVPTXISD::StoreParamV4:
2569  NumElts = 4;
2570  break;
2571  }
2572 
2573  // Build vector of operands
2574  SmallVector<SDValue, 8> Ops;
2575  for (unsigned i = 0; i < NumElts; ++i)
2576  Ops.push_back(N->getOperand(i + 3));
2577  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2578  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2579  Ops.push_back(Chain);
2580  Ops.push_back(Flag);
2581 
2582  // Determine target opcode
2583  // If we have an i1, use an 8-bit store. The lowering code in
2584  // NVPTXISelLowering will have already emitted an upcast.
2585  Optional<unsigned> Opcode;
2586  switch (N->getOpcode()) {
2587  default:
2588  switch (NumElts) {
2589  default:
2590  return false;
2591  case 1:
2592  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2593  NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2594  NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2595  NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
2596  NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2597  break;
2598  case 2:
2599  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2600  NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2601  NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2602  NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
2603  NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2604  break;
2605  case 4:
2606  Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2607  NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2608  NVPTX::StoreParamV4I32, None,
2609  NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
2610  NVPTX::StoreParamV4F32, None);
2611  break;
2612  }
2613  if (!Opcode)
2614  return false;
2615  break;
2616  // Special case: if we have a sign-extend/zero-extend node, insert the
2617  // conversion instruction first, and use that as the value operand to
2618  // the selected StoreParam node.
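// E.g. (a sketch): a StoreParamU32 whose value operand is an i16 register
// is selected as
//   CVT_u32_u16 %r, %rs, CvtNONE   // zero-extend the value to 32 bits
//   StoreParamI32 %r, param, offset
// rather than as a 16-bit store.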
2619  case NVPTXISD::StoreParamU32: {
2620  Opcode = NVPTX::StoreParamI32;
2621  SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2622  MVT::i32);
2623  SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2624  MVT::i32, Ops[0], CvtNone);
2625  Ops[0] = SDValue(Cvt, 0);
2626  break;
2627  }
2628  case NVPTXISD::StoreParamS32: {
2629  Opcode = NVPTX::StoreParamI32;
2630  SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2631  MVT::i32);
2632  SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2633  MVT::i32, Ops[0], CvtNone);
2634  Ops[0] = SDValue(Cvt, 0);
2635  break;
2636  }
2637  }
2638 
2639  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2640  SDNode *Ret =
2641  CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
2642  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
2643  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
2644  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
2645 
2646  ReplaceNode(N, Ret);
2647  return true;
2648 }
2649 
2650 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2651  unsigned Opc = 0;
2652 
2653  switch (N->getOpcode()) {
2654  default: return false;
2655  case NVPTXISD::Tex1DFloatS32:
2656  Opc = NVPTX::TEX_1D_F32_S32;
2657  break;
2658  case NVPTXISD::Tex1DFloatFloat:
2659  Opc = NVPTX::TEX_1D_F32_F32;
2660  break;
2661  case NVPTXISD::Tex1DFloatFloatLevel:
2662  Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
2663  break;
2664  case NVPTXISD::Tex1DFloatFloatGrad:
2665  Opc = NVPTX::TEX_1D_F32_F32_GRAD;
2666  break;
2667  case NVPTXISD::Tex1DS32S32:
2668  Opc = NVPTX::TEX_1D_S32_S32;
2669  break;
2670  case NVPTXISD::Tex1DS32Float:
2671  Opc = NVPTX::TEX_1D_S32_F32;
2672  break;
2673  case NVPTXISD::Tex1DS32FloatLevel:
2674  Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
2675  break;
2676  case NVPTXISD::Tex1DS32FloatGrad:
2677  Opc = NVPTX::TEX_1D_S32_F32_GRAD;
2678  break;
2679  case NVPTXISD::Tex1DU32S32:
2680  Opc = NVPTX::TEX_1D_U32_S32;
2681  break;
2682  case NVPTXISD::Tex1DU32Float:
2683  Opc = NVPTX::TEX_1D_U32_F32;
2684  break;
2685  case NVPTXISD::Tex1DU32FloatLevel:
2686  Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
2687  break;
2688  case NVPTXISD::Tex1DU32FloatGrad:
2689  Opc = NVPTX::TEX_1D_U32_F32_GRAD;
2690  break;
2691  case NVPTXISD::Tex1DArrayFloatS32:
2692  Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
2693  break;
2694  case NVPTXISD::Tex1DArrayFloatFloat:
2695  Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
2696  break;
2697  case NVPTXISD::Tex1DArrayFloatFloatLevel:
2698  Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
2699  break;
2700  case NVPTXISD::Tex1DArrayFloatFloatGrad:
2701  Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
2702  break;
2703  case NVPTXISD::Tex1DArrayS32S32:
2704  Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
2705  break;
2706  case NVPTXISD::Tex1DArrayS32Float:
2707  Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
2708  break;
2709  case NVPTXISD::Tex1DArrayS32FloatLevel:
2710  Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
2711  break;
2712  case NVPTXISD::Tex1DArrayS32FloatGrad:
2713  Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
2714  break;
2715  case NVPTXISD::Tex1DArrayU32S32:
2716  Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
2717  break;
2718  case NVPTXISD::Tex1DArrayU32Float:
2719  Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
2720  break;
2721  case NVPTXISD::Tex1DArrayU32FloatLevel:
2722  Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
2723  break;
2724  case NVPTXISD::Tex1DArrayU32FloatGrad:
2725  Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
2726  break;
2727  case NVPTXISD::Tex2DFloatS32:
2728  Opc = NVPTX::TEX_2D_F32_S32;
2729  break;
2730  case NVPTXISD::Tex2DFloatFloat:
2731  Opc = NVPTX::TEX_2D_F32_F32;
2732  break;
2733  case NVPTXISD::Tex2DFloatFloatLevel:
2734  Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
2735  break;
2736  case NVPTXISD::Tex2DFloatFloatGrad:
2737  Opc = NVPTX::TEX_2D_F32_F32_GRAD;
2738  break;
2739  case NVPTXISD::Tex2DS32S32:
2740  Opc = NVPTX::TEX_2D_S32_S32;
2741  break;
2742  case NVPTXISD::Tex2DS32Float:
2743  Opc = NVPTX::TEX_2D_S32_F32;
2744  break;
2745  case NVPTXISD::Tex2DS32FloatLevel:
2746  Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
2747  break;
2748  case NVPTXISD::Tex2DS32FloatGrad:
2749  Opc = NVPTX::TEX_2D_S32_F32_GRAD;
2750  break;
2751  case NVPTXISD::Tex2DU32S32:
2752  Opc = NVPTX::TEX_2D_U32_S32;
2753  break;
2754  case NVPTXISD::Tex2DU32Float:
2755  Opc = NVPTX::TEX_2D_U32_F32;
2756  break;
2757  case NVPTXISD::Tex2DU32FloatLevel:
2758  Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
2759  break;
2760  case NVPTXISD::Tex2DU32FloatGrad:
2761  Opc = NVPTX::TEX_2D_U32_F32_GRAD;
2762  break;
2763  case NVPTXISD::Tex2DArrayFloatS32:
2764  Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
2765  break;
2766  case NVPTXISD::Tex2DArrayFloatFloat:
2767  Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
2768  break;
2769  case NVPTXISD::Tex2DArrayFloatFloatLevel:
2770  Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
2771  break;
2772  case NVPTXISD::Tex2DArrayFloatFloatGrad:
2773  Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
2774  break;
2775  case NVPTXISD::Tex2DArrayS32S32:
2776  Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
2777  break;
2778  case NVPTXISD::Tex2DArrayS32Float:
2779  Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
2780  break;
2781  case NVPTXISD::Tex2DArrayS32FloatLevel:
2782  Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
2783  break;
2784  case NVPTXISD::Tex2DArrayS32FloatGrad:
2785  Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
2786  break;
2787  case NVPTXISD::Tex2DArrayU32S32:
2788  Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
2789  break;
2790  case NVPTXISD::Tex2DArrayU32Float:
2791  Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
2792  break;
2793  case NVPTXISD::Tex2DArrayU32FloatLevel:
2794  Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
2795  break;
2796  case NVPTXISD::Tex2DArrayU32FloatGrad:
2797  Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
2798  break;
2799  case NVPTXISD::Tex3DFloatS32:
2800  Opc = NVPTX::TEX_3D_F32_S32;
2801  break;
2802  case NVPTXISD::Tex3DFloatFloat:
2803  Opc = NVPTX::TEX_3D_F32_F32;
2804  break;
2805  case NVPTXISD::Tex3DFloatFloatLevel:
2806  Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
2807  break;
2808  case NVPTXISD::Tex3DFloatFloatGrad:
2809  Opc = NVPTX::TEX_3D_F32_F32_GRAD;
2810  break;
2811  case NVPTXISD::Tex3DS32S32:
2812  Opc = NVPTX::TEX_3D_S32_S32;
2813  break;
2814  case NVPTXISD::Tex3DS32Float:
2815  Opc = NVPTX::TEX_3D_S32_F32;
2816  break;
2817  case NVPTXISD::Tex3DS32FloatLevel:
2818  Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
2819  break;
2820  case NVPTXISD::Tex3DS32FloatGrad:
2821  Opc = NVPTX::TEX_3D_S32_F32_GRAD;
2822  break;
2823  case NVPTXISD::Tex3DU32S32:
2824  Opc = NVPTX::TEX_3D_U32_S32;
2825  break;
2826  case NVPTXISD::Tex3DU32Float:
2827  Opc = NVPTX::TEX_3D_U32_F32;
2828  break;
2829  case NVPTXISD::Tex3DU32FloatLevel:
2830  Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
2831  break;
2832  case NVPTXISD::Tex3DU32FloatGrad:
2833  Opc = NVPTX::TEX_3D_U32_F32_GRAD;
2834  break;
2835  case NVPTXISD::TexCubeFloatFloat:
2836  Opc = NVPTX::TEX_CUBE_F32_F32;
2837  break;
2838  case NVPTXISD::TexCubeFloatFloatLevel:
2839  Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
2840  break;
2841  case NVPTXISD::TexCubeS32Float:
2842  Opc = NVPTX::TEX_CUBE_S32_F32;
2843  break;
2844  case NVPTXISD::TexCubeS32FloatLevel:
2845  Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
2846  break;
2847  case NVPTXISD::TexCubeU32Float:
2848  Opc = NVPTX::TEX_CUBE_U32_F32;
2849  break;
2850  case NVPTXISD::TexCubeU32FloatLevel:
2851  Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
2852  break;
2853  case NVPTXISD::TexCubeArrayFloatFloat:
2854  Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
2855  break;
2856  case NVPTXISD::TexCubeArrayFloatFloatLevel:
2857  Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
2858  break;
2859  case NVPTXISD::TexCubeArrayS32Float:
2860  Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
2861  break;
2862  case NVPTXISD::TexCubeArrayS32FloatLevel:
2863  Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
2864  break;
2865  case NVPTXISD::TexCubeArrayU32Float:
2866  Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
2867  break;
2868  case NVPTXISD::TexCubeArrayU32FloatLevel:
2869  Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
2870  break;
2871  case NVPTXISD::Tld4R2DFloatFloat:
2872  Opc = NVPTX::TLD4_R_2D_F32_F32;
2873  break;
2874  case NVPTXISD::Tld4G2DFloatFloat:
2875  Opc = NVPTX::TLD4_G_2D_F32_F32;
2876  break;
2877  case NVPTXISD::Tld4B2DFloatFloat:
2878  Opc = NVPTX::TLD4_B_2D_F32_F32;
2879  break;
2880  case NVPTXISD::Tld4A2DFloatFloat:
2881  Opc = NVPTX::TLD4_A_2D_F32_F32;
2882  break;
2883  case NVPTXISD::Tld4R2DS32Float:
2884  Opc = NVPTX::TLD4_R_2D_S32_F32;
2885  break;
2886  case NVPTXISD::Tld4G2DS32Float:
2887  Opc = NVPTX::TLD4_G_2D_S32_F32;
2888  break;
2889  case NVPTXISD::Tld4B2DS32Float:
2890  Opc = NVPTX::TLD4_B_2D_S32_F32;
2891  break;
2892  case NVPTXISD::Tld4A2DS32Float:
2893  Opc = NVPTX::TLD4_A_2D_S32_F32;
2894  break;
2895  case NVPTXISD::Tld4R2DU32Float:
2896  Opc = NVPTX::TLD4_R_2D_U32_F32;
2897  break;
2898  case NVPTXISD::Tld4G2DU32Float:
2899  Opc = NVPTX::TLD4_G_2D_U32_F32;
2900  break;
2901  case NVPTXISD::Tld4B2DU32Float:
2902  Opc = NVPTX::TLD4_B_2D_U32_F32;
2903  break;
2904  case NVPTXISD::Tld4A2DU32Float:
2905  Opc = NVPTX::TLD4_A_2D_U32_F32;
2906  break;
2907  case NVPTXISD::TexUnified1DFloatS32:
2908  Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
2909  break;
2910  case NVPTXISD::TexUnified1DFloatFloat:
2911  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
2912  break;
2913  case NVPTXISD::TexUnified1DFloatFloatLevel:
2914  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
2915  break;
2916  case NVPTXISD::TexUnified1DFloatFloatGrad:
2917  Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
2918  break;
2919  case NVPTXISD::TexUnified1DS32S32:
2920  Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
2921  break;
2922  case NVPTXISD::TexUnified1DS32Float:
2923  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
2924  break;
2925  case NVPTXISD::TexUnified1DS32FloatLevel:
2926  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
2927  break;
2928  case NVPTXISD::TexUnified1DS32FloatGrad:
2929  Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
2930  break;
2931  case NVPTXISD::TexUnified1DU32S32:
2932  Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
2933  break;
2934  case NVPTXISD::TexUnified1DU32Float:
2935  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
2936  break;
2937  case NVPTXISD::TexUnified1DU32FloatLevel:
2938  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
2939  break;
2940  case NVPTXISD::TexUnified1DU32FloatGrad:
2941  Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
2942  break;
2943  case NVPTXISD::TexUnified1DArrayFloatS32:
2944  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
2945  break;
2946  case NVPTXISD::TexUnified1DArrayFloatFloat:
2947  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
2948  break;
2949  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
2950  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
2951  break;
2952  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
2953  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
2954  break;
2955  case NVPTXISD::TexUnified1DArrayS32S32:
2956  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
2957  break;
2958  case NVPTXISD::TexUnified1DArrayS32Float:
2959  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
2960  break;
2961  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
2962  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
2963  break;
2964  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
2965  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
2966  break;
2967  case NVPTXISD::TexUnified1DArrayU32S32:
2968  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
2969  break;
2970  case NVPTXISD::TexUnified1DArrayU32Float:
2971  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
2972  break;
2973  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
2974  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
2975  break;
2976  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
2977  Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
2978  break;
2979  case NVPTXISD::TexUnified2DFloatS32:
2980  Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
2981  break;
2982  case NVPTXISD::TexUnified2DFloatFloat:
2983  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
2984  break;
2985  case NVPTXISD::TexUnified2DFloatFloatLevel:
2986  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
2987  break;
2988  case NVPTXISD::TexUnified2DFloatFloatGrad:
2989  Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
2990  break;
2991  case NVPTXISD::TexUnified2DS32S32:
2992  Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
2993  break;
2994  case NVPTXISD::TexUnified2DS32Float:
2995  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
2996  break;
2997  case NVPTXISD::TexUnified2DS32FloatLevel:
2998  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
2999  break;
3000  case NVPTXISD::TexUnified2DS32FloatGrad:
3001  Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
3002  break;
3003  case NVPTXISD::TexUnified2DU32S32:
3004  Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
3005  break;
3006  case NVPTXISD::TexUnified2DU32Float:
3007  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
3008  break;
3009  case NVPTXISD::TexUnified2DU32FloatLevel:
3010  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
3011  break;
3012  case NVPTXISD::TexUnified2DU32FloatGrad:
3013  Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
3014  break;
3015  case NVPTXISD::TexUnified2DArrayFloatS32:
3016  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
3017  break;
3018  case NVPTXISD::TexUnified2DArrayFloatFloat:
3019  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
3020  break;
3021  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
3022  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
3023  break;
3024  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
3025  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
3026  break;
3027  case NVPTXISD::TexUnified2DArrayS32S32:
3028  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
3029  break;
3030  case NVPTXISD::TexUnified2DArrayS32Float:
3031  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
3032  break;
3033  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
3034  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
3035  break;
3036  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
3037  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
3038  break;
3039  case NVPTXISD::TexUnified2DArrayU32S32:
3040  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
3041  break;
3042  case NVPTXISD::TexUnified2DArrayU32Float:
3043  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
3044  break;
3045  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
3046  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
3047  break;
3048  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
3049  Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
3050  break;
3051  case NVPTXISD::TexUnified3DFloatS32:
3052  Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
3053  break;
3054  case NVPTXISD::TexUnified3DFloatFloat:
3055  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
3056  break;
3057  case NVPTXISD::TexUnified3DFloatFloatLevel:
3058  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
3059  break;
3060  case NVPTXISD::TexUnified3DFloatFloatGrad:
3061  Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
3062  break;
3063  case NVPTXISD::TexUnified3DS32S32:
3064  Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
3065  break;
3066  case NVPTXISD::TexUnified3DS32Float:
3067  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
3068  break;
3069  case NVPTXISD::TexUnified3DS32FloatLevel:
3070  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
3071  break;
3072  case NVPTXISD::TexUnified3DS32FloatGrad:
3073  Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
3074  break;
3075  case NVPTXISD::TexUnified3DU32S32:
3076  Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
3077  break;
3078  case NVPTXISD::TexUnified3DU32Float:
3079  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
3080  break;
3081  case NVPTXISD::TexUnified3DU32FloatLevel:
3082  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
3083  break;
3084  case NVPTXISD::TexUnified3DU32FloatGrad:
3085  Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
3086  break;
3087  case NVPTXISD::TexUnifiedCubeFloatFloat:
3088  Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
3089  break;
3090  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
3091  Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
3092  break;
3093  case NVPTXISD::TexUnifiedCubeS32Float:
3094  Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
3095  break;
3096  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
3097  Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
3098  break;
3099  case NVPTXISD::TexUnifiedCubeU32Float:
3100  Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
3101  break;
3102  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
3103  Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
3104  break;
3105  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
3106  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
3107  break;
3108  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
3109  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
3110  break;
3111  case NVPTXISD::TexUnifiedCubeArrayS32Float:
3112  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
3113  break;
3114  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
3115  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
3116  break;
3117  case NVPTXISD::TexUnifiedCubeArrayU32Float:
3118  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
3119  break;
3120  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
3121  Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
3122  break;
3123  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
3124  Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
3125  break;
3126  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
3127  Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
3128  break;
3129  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
3130  Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
3131  break;
3132  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
3133  Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
3134  break;
3135  case NVPTXISD::Tld4UnifiedR2DS32Float:
3136  Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
3137  break;
3138  case NVPTXISD::Tld4UnifiedG2DS32Float:
3139  Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
3140  break;
3141  case NVPTXISD::Tld4UnifiedB2DS32Float:
3142  Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
3143  break;
3144  case NVPTXISD::Tld4UnifiedA2DS32Float:
3145  Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
3146  break;
3147  case NVPTXISD::Tld4UnifiedR2DU32Float:
3148  Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
3149  break;
3150  case NVPTXISD::Tld4UnifiedG2DU32Float:
3151  Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
3152  break;
3153  case NVPTXISD::Tld4UnifiedB2DU32Float:
3154  Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
3155  break;
3156  case NVPTXISD::Tld4UnifiedA2DU32Float:
3157  Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
3158  break;
3159  }
3160 
3161  // Copy over operands
3162  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
3163  Ops.push_back(N->getOperand(0)); // Move chain to the back.
3164 
3165  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3166  return true;
3167 }
3168 
3169 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
3170  unsigned Opc = 0;
3171  switch (N->getOpcode()) {
3172  default: return false;
3173  case NVPTXISD::Suld1DI8Clamp:
3174  Opc = NVPTX::SULD_1D_I8_CLAMP;
3175  break;
3176  case NVPTXISD::Suld1DI16Clamp:
3177  Opc = NVPTX::SULD_1D_I16_CLAMP;
3178  break;
3179  case NVPTXISD::Suld1DI32Clamp:
3180  Opc = NVPTX::SULD_1D_I32_CLAMP;
3181  break;
3182  case NVPTXISD::Suld1DI64Clamp:
3183  Opc = NVPTX::SULD_1D_I64_CLAMP;
3184  break;
3185  case NVPTXISD::Suld1DV2I8Clamp:
3186  Opc = NVPTX::SULD_1D_V2I8_CLAMP;
3187  break;
3188  case NVPTXISD::Suld1DV2I16Clamp:
3189  Opc = NVPTX::SULD_1D_V2I16_CLAMP;
3190  break;
3191  case NVPTXISD::Suld1DV2I32Clamp:
3192  Opc = NVPTX::SULD_1D_V2I32_CLAMP;
3193  break;
3194  case NVPTXISD::Suld1DV2I64Clamp:
3195  Opc = NVPTX::SULD_1D_V2I64_CLAMP;
3196  break;
3197  case NVPTXISD::Suld1DV4I8Clamp:
3198  Opc = NVPTX::SULD_1D_V4I8_CLAMP;
3199  break;
3200  case NVPTXISD::Suld1DV4I16Clamp:
3201  Opc = NVPTX::SULD_1D_V4I16_CLAMP;
3202  break;
3203  case NVPTXISD::Suld1DV4I32Clamp:
3204  Opc = NVPTX::SULD_1D_V4I32_CLAMP;
3205  break;
3206  case NVPTXISD::Suld1DArrayI8Clamp:
3207  Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
3208  break;
3209  case NVPTXISD::Suld1DArrayI16Clamp:
3210  Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
3211  break;
3212  case NVPTXISD::Suld1DArrayI32Clamp:
3213  Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
3214  break;
3215  case NVPTXISD::Suld1DArrayI64Clamp:
3216  Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
3217  break;
3218  case NVPTXISD::Suld1DArrayV2I8Clamp:
3219  Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
3220  break;
3221  case NVPTXISD::Suld1DArrayV2I16Clamp:
3222  Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
3223  break;
3224  case NVPTXISD::Suld1DArrayV2I32Clamp:
3225  Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
3226  break;
3227  case NVPTXISD::Suld1DArrayV2I64Clamp:
3228  Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
3229  break;
3230  case NVPTXISD::Suld1DArrayV4I8Clamp:
3231  Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
3232  break;
3233  case NVPTXISD::Suld1DArrayV4I16Clamp:
3234  Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
3235  break;
3236  case NVPTXISD::Suld1DArrayV4I32Clamp:
3237  Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
3238  break;
3239  case NVPTXISD::Suld2DI8Clamp:
3240  Opc = NVPTX::SULD_2D_I8_CLAMP;
3241  break;
3242  case NVPTXISD::Suld2DI16Clamp:
3243  Opc = NVPTX::SULD_2D_I16_CLAMP;
3244  break;
3245  case NVPTXISD::Suld2DI32Clamp:
3246  Opc = NVPTX::SULD_2D_I32_CLAMP;
3247  break;
3248  case NVPTXISD::Suld2DI64Clamp:
3249  Opc = NVPTX::SULD_2D_I64_CLAMP;
3250  break;
3251  case NVPTXISD::Suld2DV2I8Clamp:
3252  Opc = NVPTX::SULD_2D_V2I8_CLAMP;
3253  break;
3254  case NVPTXISD::Suld2DV2I16Clamp:
3255  Opc = NVPTX::SULD_2D_V2I16_CLAMP;
3256  break;
3257  case NVPTXISD::Suld2DV2I32Clamp:
3258  Opc = NVPTX::SULD_2D_V2I32_CLAMP;
3259  break;
3260  case NVPTXISD::Suld2DV2I64Clamp:
3261  Opc = NVPTX::SULD_2D_V2I64_CLAMP;
3262  break;
3263  case NVPTXISD::Suld2DV4I8Clamp:
3264  Opc = NVPTX::SULD_2D_V4I8_CLAMP;
3265  break;
3266  case NVPTXISD::Suld2DV4I16Clamp:
3267  Opc = NVPTX::SULD_2D_V4I16_CLAMP;
3268  break;
3269  case NVPTXISD::Suld2DV4I32Clamp:
3270  Opc = NVPTX::SULD_2D_V4I32_CLAMP;
3271  break;
3272  case NVPTXISD::Suld2DArrayI8Clamp:
3273  Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
3274  break;
3275  case NVPTXISD::Suld2DArrayI16Clamp:
3276  Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
3277  break;
3278  case NVPTXISD::Suld2DArrayI32Clamp:
3279  Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
3280  break;
3281  case NVPTXISD::Suld2DArrayI64Clamp:
3282  Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
3283  break;
3284  case NVPTXISD::Suld2DArrayV2I8Clamp:
3285  Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
3286  break;
3287  case NVPTXISD::Suld2DArrayV2I16Clamp:
3288  Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
3289  break;
3290  case NVPTXISD::Suld2DArrayV2I32Clamp:
3291  Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
3292  break;
3293  case NVPTXISD::Suld2DArrayV2I64Clamp:
3294  Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
3295  break;
3296  case NVPTXISD::Suld2DArrayV4I8Clamp:
3297  Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
3298  break;
3299  case NVPTXISD::Suld2DArrayV4I16Clamp:
3300  Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
3301  break;
3302  case NVPTXISD::Suld2DArrayV4I32Clamp:
3303  Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
3304  break;
3305  case NVPTXISD::Suld3DI8Clamp:
3306  Opc = NVPTX::SULD_3D_I8_CLAMP;
3307  break;
3308  case NVPTXISD::Suld3DI16Clamp:
3309  Opc = NVPTX::SULD_3D_I16_CLAMP;
3310  break;
3311  case NVPTXISD::Suld3DI32Clamp:
3312  Opc = NVPTX::SULD_3D_I32_CLAMP;
3313  break;
3314  case NVPTXISD::Suld3DI64Clamp:
3315  Opc = NVPTX::SULD_3D_I64_CLAMP;
3316  break;
3317  case NVPTXISD::Suld3DV2I8Clamp:
3318  Opc = NVPTX::SULD_3D_V2I8_CLAMP;
3319  break;
3320  case NVPTXISD::Suld3DV2I16Clamp:
3321  Opc = NVPTX::SULD_3D_V2I16_CLAMP;
3322  break;
3323  case NVPTXISD::Suld3DV2I32Clamp:
3324  Opc = NVPTX::SULD_3D_V2I32_CLAMP;
3325  break;
3326  case NVPTXISD::Suld3DV2I64Clamp:
3327  Opc = NVPTX::SULD_3D_V2I64_CLAMP;
3328  break;
3329  case NVPTXISD::Suld3DV4I8Clamp:
3330  Opc = NVPTX::SULD_3D_V4I8_CLAMP;
3331  break;
3332  case NVPTXISD::Suld3DV4I16Clamp:
3333  Opc = NVPTX::SULD_3D_V4I16_CLAMP;
3334  break;
3335  case NVPTXISD::Suld3DV4I32Clamp:
3336  Opc = NVPTX::SULD_3D_V4I32_CLAMP;
3337  break;
3338  case NVPTXISD::Suld1DI8Trap:
3339  Opc = NVPTX::SULD_1D_I8_TRAP;
3340  break;
3341  case NVPTXISD::Suld1DI16Trap:
3342  Opc = NVPTX::SULD_1D_I16_TRAP;
3343  break;
3344  case NVPTXISD::Suld1DI32Trap:
3345  Opc = NVPTX::SULD_1D_I32_TRAP;
3346  break;
3347  case NVPTXISD::Suld1DI64Trap:
3348  Opc = NVPTX::SULD_1D_I64_TRAP;
3349  break;
3350  case NVPTXISD::Suld1DV2I8Trap:
3351  Opc = NVPTX::SULD_1D_V2I8_TRAP;
3352  break;
3353  case NVPTXISD::Suld1DV2I16Trap:
3354  Opc = NVPTX::SULD_1D_V2I16_TRAP;
3355  break;
3356  case NVPTXISD::Suld1DV2I32Trap:
3357  Opc = NVPTX::SULD_1D_V2I32_TRAP;
3358  break;
3359  case NVPTXISD::Suld1DV2I64Trap:
3360  Opc = NVPTX::SULD_1D_V2I64_TRAP;
3361  break;
3362  case NVPTXISD::Suld1DV4I8Trap:
3363  Opc = NVPTX::SULD_1D_V4I8_TRAP;
3364  break;
3365  case NVPTXISD::Suld1DV4I16Trap:
3366  Opc = NVPTX::SULD_1D_V4I16_TRAP;
3367  break;
3368  case NVPTXISD::Suld1DV4I32Trap:
3369  Opc = NVPTX::SULD_1D_V4I32_TRAP;
3370  break;
3371  case NVPTXISD::Suld1DArrayI8Trap:
3372  Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
3373  break;
3374  case NVPTXISD::Suld1DArrayI16Trap:
3375  Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
3376  break;
3377  case NVPTXISD::Suld1DArrayI32Trap:
3378  Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
3379  break;
3380  case NVPTXISD::Suld1DArrayI64Trap:
3381  Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
3382  break;
3383  case NVPTXISD::Suld1DArrayV2I8Trap:
3384  Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
3385  break;
3386  case NVPTXISD::Suld1DArrayV2I16Trap:
3387  Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
3388  break;
3389  case NVPTXISD::Suld1DArrayV2I32Trap:
3390  Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
3391  break;
3392  case NVPTXISD::Suld1DArrayV2I64Trap:
3393  Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
3394  break;
3395  case NVPTXISD::Suld1DArrayV4I8Trap:
3396  Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
3397  break;
3398  case NVPTXISD::Suld1DArrayV4I16Trap:
3399  Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
3400  break;
3401  case NVPTXISD::Suld1DArrayV4I32Trap:
3402  Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
3403  break;
3404  case NVPTXISD::Suld2DI8Trap:
3405  Opc = NVPTX::SULD_2D_I8_TRAP;
3406  break;
3407  case NVPTXISD::Suld2DI16Trap:
3408  Opc = NVPTX::SULD_2D_I16_TRAP;
3409  break;
3410  case NVPTXISD::Suld2DI32Trap:
3411  Opc = NVPTX::SULD_2D_I32_TRAP;
3412  break;
3413  case NVPTXISD::Suld2DI64Trap:
3414  Opc = NVPTX::SULD_2D_I64_TRAP;
3415  break;
3416  case NVPTXISD::Suld2DV2I8Trap:
3417  Opc = NVPTX::SULD_2D_V2I8_TRAP;
3418  break;
3419  case NVPTXISD::Suld2DV2I16Trap:
3420  Opc = NVPTX::SULD_2D_V2I16_TRAP;
3421  break;
3422  case NVPTXISD::Suld2DV2I32Trap:
3423  Opc = NVPTX::SULD_2D_V2I32_TRAP;
3424  break;
3425  case NVPTXISD::Suld2DV2I64Trap:
3426  Opc = NVPTX::SULD_2D_V2I64_TRAP;
3427  break;
3428  case NVPTXISD::Suld2DV4I8Trap:
3429  Opc = NVPTX::SULD_2D_V4I8_TRAP;
3430  break;
3431  case NVPTXISD::Suld2DV4I16Trap:
3432  Opc = NVPTX::SULD_2D_V4I16_TRAP;
3433  break;
3434  case NVPTXISD::Suld2DV4I32Trap:
3435  Opc = NVPTX::SULD_2D_V4I32_TRAP;
3436  break;
3437  case NVPTXISD::Suld2DArrayI8Trap:
3438  Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
3439  break;
3440  case NVPTXISD::Suld2DArrayI16Trap:
3441  Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
3442  break;
3443  case NVPTXISD::Suld2DArrayI32Trap:
3444  Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
3445  break;
3446  case NVPTXISD::Suld2DArrayI64Trap:
3447  Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
3448  break;
3449  case NVPTXISD::Suld2DArrayV2I8Trap:
3450  Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
3451  break;
3452  case NVPTXISD::Suld2DArrayV2I16Trap:
3453  Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
3454  break;
3455  case NVPTXISD::Suld2DArrayV2I32Trap:
3456  Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
3457  break;
3458  case NVPTXISD::Suld2DArrayV2I64Trap:
3459  Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
3460  break;
3461  case NVPTXISD::Suld2DArrayV4I8Trap:
3462  Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
3463  break;
3464  case NVPTXISD::Suld2DArrayV4I16Trap:
3465  Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
3466  break;
3467  case NVPTXISD::Suld2DArrayV4I32Trap:
3468  Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
3469  break;
3470  case NVPTXISD::Suld3DI8Trap:
3471  Opc = NVPTX::SULD_3D_I8_TRAP;
3472  break;
3473  case NVPTXISD::Suld3DI16Trap:
3474  Opc = NVPTX::SULD_3D_I16_TRAP;
3475  break;
3476  case NVPTXISD::Suld3DI32Trap:
3477  Opc = NVPTX::SULD_3D_I32_TRAP;
3478  break;
3479  case NVPTXISD::Suld3DI64Trap:
3480  Opc = NVPTX::SULD_3D_I64_TRAP;
3481  break;
3482  case NVPTXISD::Suld3DV2I8Trap:
3483  Opc = NVPTX::SULD_3D_V2I8_TRAP;
3484  break;
3485  case NVPTXISD::Suld3DV2I16Trap:
3486  Opc = NVPTX::SULD_3D_V2I16_TRAP;
3487  break;
3488  case NVPTXISD::Suld3DV2I32Trap:
3489  Opc = NVPTX::SULD_3D_V2I32_TRAP;
3490  break;
3491  case NVPTXISD::Suld3DV2I64Trap:
3492  Opc = NVPTX::SULD_3D_V2I64_TRAP;
3493  break;
3494  case NVPTXISD::Suld3DV4I8Trap:
3495  Opc = NVPTX::SULD_3D_V4I8_TRAP;
3496  break;
3497  case NVPTXISD::Suld3DV4I16Trap:
3498  Opc = NVPTX::SULD_3D_V4I16_TRAP;
3499  break;
3500  case NVPTXISD::Suld3DV4I32Trap:
3501  Opc = NVPTX::SULD_3D_V4I32_TRAP;
3502  break;
3503  case NVPTXISD::Suld1DI8Zero:
3504  Opc = NVPTX::SULD_1D_I8_ZERO;
3505  break;
3506  case NVPTXISD::Suld1DI16Zero:
3507  Opc = NVPTX::SULD_1D_I16_ZERO;
3508  break;
3509  case NVPTXISD::Suld1DI32Zero:
3510  Opc = NVPTX::SULD_1D_I32_ZERO;
3511  break;
3512  case NVPTXISD::Suld1DI64Zero:
3513  Opc = NVPTX::SULD_1D_I64_ZERO;
3514  break;
3515  case NVPTXISD::Suld1DV2I8Zero:
3516  Opc = NVPTX::SULD_1D_V2I8_ZERO;
3517  break;
3518  case NVPTXISD::Suld1DV2I16Zero:
3519  Opc = NVPTX::SULD_1D_V2I16_ZERO;
3520  break;
3521  case NVPTXISD::Suld1DV2I32Zero:
3522  Opc = NVPTX::SULD_1D_V2I32_ZERO;
3523  break;
3524  case NVPTXISD::Suld1DV2I64Zero:
3525  Opc = NVPTX::SULD_1D_V2I64_ZERO;
3526  break;
3527  case NVPTXISD::Suld1DV4I8Zero:
3528  Opc = NVPTX::SULD_1D_V4I8_ZERO;
3529  break;
3530  case NVPTXISD::Suld1DV4I16Zero:
3531  Opc = NVPTX::SULD_1D_V4I16_ZERO;
3532  break;
3533  case NVPTXISD::Suld1DV4I32Zero:
3534  Opc = NVPTX::SULD_1D_V4I32_ZERO;
3535  break;
3536  case NVPTXISD::Suld1DArrayI8Zero:
3537  Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
3538  break;
3539  case NVPTXISD::Suld1DArrayI16Zero:
3540  Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
3541  break;
3542  case NVPTXISD::Suld1DArrayI32Zero:
3543  Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
3544  break;
3545  case NVPTXISD::Suld1DArrayI64Zero:
3546  Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
3547  break;
3548  case NVPTXISD::Suld1DArrayV2I8Zero:
3549  Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
3550  break;
3551  case NVPTXISD::Suld1DArrayV2I16Zero:
3552  Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
3553  break;
3554  case NVPTXISD::Suld1DArrayV2I32Zero:
3555  Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
3556  break;
3557  case NVPTXISD::Suld1DArrayV2I64Zero:
3558  Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
3559  break;
3560  case NVPTXISD::Suld1DArrayV4I8Zero:
3561  Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
3562  break;
3563  case NVPTXISD::Suld1DArrayV4I16Zero:
3564  Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
3565  break;
3566  case NVPTXISD::Suld1DArrayV4I32Zero:
3567  Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
3568  break;
3569  case NVPTXISD::Suld2DI8Zero:
3570  Opc = NVPTX::SULD_2D_I8_ZERO;
3571  break;
3572  case NVPTXISD::Suld2DI16Zero:
3573  Opc = NVPTX::SULD_2D_I16_ZERO;
3574  break;
3575  case NVPTXISD::Suld2DI32Zero:
3576  Opc = NVPTX::SULD_2D_I32_ZERO;
3577  break;
3578  case NVPTXISD::Suld2DI64Zero:
3579  Opc = NVPTX::SULD_2D_I64_ZERO;
3580  break;
3581  case NVPTXISD::Suld2DV2I8Zero:
3582  Opc = NVPTX::SULD_2D_V2I8_ZERO;
3583  break;
3584  case NVPTXISD::Suld2DV2I16Zero:
3585  Opc = NVPTX::SULD_2D_V2I16_ZERO;
3586  break;
3587  case NVPTXISD::Suld2DV2I32Zero:
3588  Opc = NVPTX::SULD_2D_V2I32_ZERO;
3589  break;
3590  case NVPTXISD::Suld2DV2I64Zero:
3591  Opc = NVPTX::SULD_2D_V2I64_ZERO;
3592  break;
3593  case NVPTXISD::Suld2DV4I8Zero:
3594  Opc = NVPTX::SULD_2D_V4I8_ZERO;
3595  break;
3596  case NVPTXISD::Suld2DV4I16Zero:
3597  Opc = NVPTX::SULD_2D_V4I16_ZERO;
3598  break;
3599  case NVPTXISD::Suld2DV4I32Zero:
3600  Opc = NVPTX::SULD_2D_V4I32_ZERO;
3601  break;
3602  case NVPTXISD::Suld2DArrayI8Zero:
3603  Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
3604  break;
3605  case NVPTXISD::Suld2DArrayI16Zero:
3606  Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
3607  break;
3608  case NVPTXISD::Suld2DArrayI32Zero:
3609  Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
3610  break;
3611  case NVPTXISD::Suld2DArrayI64Zero:
3612  Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
3613  break;
3614  case NVPTXISD::Suld2DArrayV2I8Zero:
3615  Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
3616  break;
3617  case NVPTXISD::Suld2DArrayV2I16Zero:
3618  Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
3619  break;
3620  case NVPTXISD::Suld2DArrayV2I32Zero:
3621  Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
3622  break;
3623  case NVPTXISD::Suld2DArrayV2I64Zero:
3624  Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
3625  break;
3626  case NVPTXISD::Suld2DArrayV4I8Zero:
3627  Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
3628  break;
3629  case NVPTXISD::Suld2DArrayV4I16Zero:
3630  Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
3631  break;
3632  case NVPTXISD::Suld2DArrayV4I32Zero:
3633  Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
3634  break;
3635  case NVPTXISD::Suld3DI8Zero:
3636  Opc = NVPTX::SULD_3D_I8_ZERO;
3637  break;
3638  case NVPTXISD::Suld3DI16Zero:
3639  Opc = NVPTX::SULD_3D_I16_ZERO;
3640  break;
3641  case NVPTXISD::Suld3DI32Zero:
3642  Opc = NVPTX::SULD_3D_I32_ZERO;
3643  break;
3644  case NVPTXISD::Suld3DI64Zero:
3645  Opc = NVPTX::SULD_3D_I64_ZERO;
3646  break;
3647  case NVPTXISD::Suld3DV2I8Zero:
3648  Opc = NVPTX::SULD_3D_V2I8_ZERO;
3649  break;
3650  case NVPTXISD::Suld3DV2I16Zero:
3651  Opc = NVPTX::SULD_3D_V2I16_ZERO;
3652  break;
3653  case NVPTXISD::Suld3DV2I32Zero:
3654  Opc = NVPTX::SULD_3D_V2I32_ZERO;
3655  break;
3656  case NVPTXISD::Suld3DV2I64Zero:
3657  Opc = NVPTX::SULD_3D_V2I64_ZERO;
3658  break;
3659  case NVPTXISD::Suld3DV4I8Zero:
3660  Opc = NVPTX::SULD_3D_V4I8_ZERO;
3661  break;
3662  case NVPTXISD::Suld3DV4I16Zero:
3663  Opc = NVPTX::SULD_3D_V4I16_ZERO;
3664  break;
3665  case NVPTXISD::Suld3DV4I32Zero:
3666  Opc = NVPTX::SULD_3D_V4I32_ZERO;
3667  break;
3668  }
3669 
3670  // Copy over operands
3671  SmallVector<SDValue, 8> Ops(N->op_begin() + 1, N->op_end());
3672  Ops.push_back(N->getOperand(0)); // Move chain to the back.
3673 
3674  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3675  return true;
3676 }
3677 
3678 
3679 /// tryBFE - Look for instruction sequences that can be made more efficient
3680 /// by using the 'bfe' (bit-field extract) PTX instruction.
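/// For example, (and (srl %x, 3), 255) extracts the 8 bits of %x starting at
/// bit 3 and can come out as a single PTX instruction:
///   bfe.u32 %r1, %x, 3, 8;   // operands: value, start bit, field length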
3681 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3682  SDLoc DL(N);
3683  SDValue LHS = N->getOperand(0);
3684  SDValue RHS = N->getOperand(1);
3685  SDValue Len;
3686  SDValue Start;
3687  SDValue Val;
3688  bool IsSigned = false;
3689 
3690  if (N->getOpcode() == ISD::AND) {
3691  // Canonicalize the operands
3692  // We want 'and %val, %mask'
3693  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3694  std::swap(LHS, RHS);
3695  }
3696 
3697  ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3698  if (!Mask) {
3699  // We need a constant mask on the RHS of the AND
3700  return false;
3701  }
3702 
3703  // Extract the mask bits
3704  uint64_t MaskVal = Mask->getZExtValue();
3705  if (!isMask_64(MaskVal)) {
3706  // We *could* handle shifted masks here, but doing so would require an
3707  // 'and' operation to fix up the low-order bits, so we would trade
3708  // shr+and for bfe+and, which has the same throughput.
3709  return false;
3710  }
3711 
3712  // How many bits are in our mask?
3713  uint64_t NumBits = countTrailingOnes(MaskVal);
3714  Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3715 
3716  if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3717  // We have a 'srl/and' pair, extract the effective start bit and length
3718  Val = LHS.getNode()->getOperand(0);
3719  Start = LHS.getNode()->getOperand(1);
3720  ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3721  if (StartConst) {
3722  uint64_t StartVal = StartConst->getZExtValue();
3723  // How many "good" bits do we have left? "good" is defined here as bits
3724  // that exist in the original value, not shifted in.
3725  uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3726  if (NumBits > GoodBits) {
3727  // Do not handle the case where bits have been shifted in. In theory
3728  // we could handle this, but the cost is likely higher than just
3729  // emitting the srl/and pair.
3730  return false;
3731  }
3732  Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3733  } else {
3734  // Do not handle the case where the shift amount (can be zero if no srl
3735  // was found) is not constant. We could handle this case, but it would
3736  // require run-time logic that would be more expensive than just
3737  // emitting the srl/and pair.
3738  return false;
3739  }
3740  } else {
3741  // Do not handle the case where the LHS of the and is not a shift. While
3742  // it would be trivial to handle this case, it would just transform
3743  // 'and' -> 'bfe', but 'and' has higher throughput.
3744  return false;
3745  }
3746  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3747  if (LHS->getOpcode() == ISD::AND) {
3748  ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3749  if (!ShiftCnst) {
3750  // Shift amount must be constant
3751  return false;
3752  }
3753 
3754  uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3755 
3756  SDValue AndLHS = LHS->getOperand(0);
3757  SDValue AndRHS = LHS->getOperand(1);
3758 
3759  // Canonicalize the AND to have the mask on the RHS
3760  if (isa<ConstantSDNode>(AndLHS)) {
3761  std::swap(AndLHS, AndRHS);
3762  }
3763 
3764  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3765  if (!MaskCnst) {
3766  // Mask must be constant
3767  return false;
3768  }
3769 
3770  uint64_t MaskVal = MaskCnst->getZExtValue();
3771  uint64_t NumZeros;
3772  uint64_t NumBits;
3773  if (isMask_64(MaskVal)) {
3774  NumZeros = 0;
3775  // The number of bits in the result bitfield will be the number of
3776  // trailing ones (the AND) minus the number of bits we shift off
3777  NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
3778  } else if (isShiftedMask_64(MaskVal)) {
3779  NumZeros = countTrailingZeros(MaskVal);
3780  unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
3781  // The number of bits in the result bitfield will be the number of
3782  // trailing zeros plus the number of set bits in the mask minus the
3783  // number of bits we shift off
3784  NumBits = NumZeros + NumOnes - ShiftAmt;
3785  } else {
3786  // This is not a mask we can handle
3787  return false;
3788  }
3789 
3790  if (ShiftAmt < NumZeros) {
3791  // Handling this case would require extra logic that would make this
3792  // transformation non-profitable
3793  return false;
3794  }
3795 
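// Worked example (illustrative): for (srl (and %x, 0xF0), 4) the mask 0xF0
// is a shifted mask with NumZeros = 4 and NumOnes = 4, and ShiftAmt = 4, so
// NumBits = 4 + 4 - 4 = 4: extract a 4-bit field starting at bit 4.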
3796  Val = AndLHS;
3797  Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3798  Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3799  } else if (LHS->getOpcode() == ISD::SHL) {
3800  // Here, we have a pattern like:
3801  //
3802  // (sra (shl val, NN), MM)
3803  // or
3804  // (srl (shl val, NN), MM)
3805  //
3806  // If MM >= NN, we can efficiently optimize this with bfe
3807  Val = LHS->getOperand(0);
3808 
3809  SDValue ShlRHS = LHS->getOperand(1);
3810  ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3811  if (!ShlCnst) {
3812  // Shift amount must be constant
3813  return false;
3814  }
3815  uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3816 
3817  SDValue ShrRHS = RHS;
3818  ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3819  if (!ShrCnst) {
3820  // Shift amount must be constant
3821  return false;
3822  }
3823  uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3824 
3825  // To avoid extra codegen and be profitable, we need Outer >= Inner
3826  if (OuterShiftAmt < InnerShiftAmt) {
3827  return false;
3828  }
3829 
3830  // If the outer shift is more than the type size, we have no bitfield to
3831  // extract (and since we already checked that the inner shift is <= the
3832  // outer shift, the inner shift is also < the type size).
3833  if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3834  return false;
3835  }
3836 
3837  Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3838  MVT::i32);
3839  Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3840  DL, MVT::i32);
3841 
3842  if (N->getOpcode() == ISD::SRA) {
3843  // If we have an arithmetic right shift, we need to use the signed bfe
3844  // variant
3845  IsSigned = true;
3846  }
3847  } else {
3848  // No can do...
3849  return false;
3850  }
3851  } else {
3852  // No can do...
3853  return false;
3854  }
3855 
3856 
3857  unsigned Opc;
3858  // For the BFE operations we form here from "and" and "srl", always use the
3859  // unsigned variants; only the sra+shl pattern above sets IsSigned.
3860  if (Val.getValueType() == MVT::i32) {
3861  if (IsSigned) {
3862  Opc = NVPTX::BFE_S32rii;
3863  } else {
3864  Opc = NVPTX::BFE_U32rii;
3865  }
3866  } else if (Val.getValueType() == MVT::i64) {
3867  if (IsSigned) {
3868  Opc = NVPTX::BFE_S64rii;
3869  } else {
3870  Opc = NVPTX::BFE_U64rii;
3871  }
3872  } else {
3873  // We cannot handle this type
3874  return false;
3875  }
3876 
3877  SDValue Ops[] = {
3878  Val, Start, Len
3879  };
3880 
3881  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3882  return true;
3883 }
3884 
3885 // SelectDirectAddr - Match a direct address for DAG.
3886 // A direct address could be a globaladdress or externalsymbol.
3887 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3888  // Return true if TGA or ES.
3889  if (N.getOpcode() == ISD::TargetGlobalAddress ||
3891  Address = N;
3892  return true;
3893  }
3894  if (N.getOpcode() == NVPTXISD::Wrapper) {
3895  Address = N.getOperand(0);
3896  return true;
3897  }
3898  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3899  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3900  if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3901  CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3902  CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3903  return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3904  }
3905  return false;
3906 }
3907 
3908 // symbol+offset
3909 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3910  SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3911  if (Addr.getOpcode() == ISD::ADD) {
3912  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3913  SDValue base = Addr.getOperand(0);
3914  if (SelectDirectAddr(base, Base)) {
3915  Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3916  mvt);
3917  return true;
3918  }
3919  }
3920  }
3921  return false;
3922 }
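// For instance, an address of the form (add tglobaladdr:@sym, 16) splits here
// into Base = @sym and Offset = 16, which later prints as [sym+16].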
3923 
3924 // symbol+offset
3925 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3926  SDValue &Base, SDValue &Offset) {
3927  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3928 }
3929 
3930 // symbol+offset
3931 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3932  SDValue &Base, SDValue &Offset) {
3933  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3934 }
3935 
3936 // register+offset
3937 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3938  SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3939  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3940  Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3941  Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3942  return true;
3943  }
3944  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3945  Addr.getOpcode() == ISD::TargetGlobalAddress)
3946  return false; // direct calls.
3947 
3948  if (Addr.getOpcode() == ISD::ADD) {
3949  if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3950  return false;
3951  }
3952  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3953  if (FrameIndexSDNode *FIN =
3954  dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3955  // Constant offset from frame ref.
3956  Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3957  else
3958  Base = Addr.getOperand(0);
3959  Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3960  mvt);
3961  return true;
3962  }
3963  }
3964  return false;
3965 }
3966 
3967 // register+offset
3968 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3969  SDValue &Base, SDValue &Offset) {
3970  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3971 }
3972 
3973 // register+offset
3974 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3975  SDValue &Base, SDValue &Offset) {
3976  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3977 }
3978 
3979 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3980  unsigned int spN) const {
3981  const Value *Src = nullptr;
3982  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3983  if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3984  return true;
3985  Src = mN->getMemOperand()->getValue();
3986  }
3987  if (!Src)
3988  return false;
3989  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3990  return (PT->getAddressSpace() == spN);
3991  return false;
3992 }
3993 
3994 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3995 /// inline asm expressions.
3996 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3997  const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
3998  SDValue Op0, Op1;
3999  switch (ConstraintID) {
4000  default:
4001  return true;
4002  case InlineAsm::Constraint_m: // memory
4003  if (SelectDirectAddr(Op, Op0)) {
4004  OutOps.push_back(Op0);
4005  OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
4006  return false;
4007  }
4008  if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
4009  OutOps.push_back(Op0);
4010  OutOps.push_back(Op1);
4011  return false;
4012  }
4013  break;
4014  }
4015  return true;
4016 }
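// Note (editorial): in the SelectionDAG inline-asm protocol this hook
// returns false on success and true on failure. A direct symbol is lowered
// to the pair {symbol, 0}; any other address is retried through the
// register+offset path, so only unselectable "m" operands are rejected.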
4017 
4018 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
4019 /// conversion from \p SrcTy to \p DestTy.
4020 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
4021  bool IsSigned) {
4022  switch (SrcTy.SimpleTy) {
4023  default:
4024  llvm_unreachable("Unhandled source type");
4025  case MVT::i8:
4026  switch (DestTy.SimpleTy) {
4027  default:
4028  llvm_unreachable("Unhandled dest type");
4029  case MVT::i16:
4030  return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
4031  case MVT::i32:
4032  return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
4033  case MVT::i64:
4034  return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
4035  }
4036  case MVT::i16:
4037  switch (DestTy.SimpleTy) {
4038  default:
4039  llvm_unreachable("Unhandled dest type");
4040  case MVT::i8:
4041  return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
4042  case MVT::i32:
4043  return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
4044  case MVT::i64:
4045  return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
4046  }
4047  case MVT::i32:
4048  switch (DestTy.SimpleTy) {
4049  default:
4050  llvm_unreachable("Unhandled dest type");
4051  case MVT::i8:
4052  return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
4053  case MVT::i16:
4054  return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
4055  case MVT::i64:
4056  return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
4057  }
4058  case MVT::i64:
4059  switch (DestTy.SimpleTy) {
4060  default:
4061  llvm_unreachable("Unhandled dest type");
4062  case MVT::i8:
4063  return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
4064  case MVT::i16:
4065  return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
4066  case MVT::i32:
4067  return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
4068  }
4069  }
4070 }
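// Example (illustrative): GetConvertOpcode(MVT::i32, MVT::i16, /*IsSigned=*/true)
// returns NVPTX::CVT_s32_s16, a sign-extending PTX "cvt.s32.s16"; with
// IsSigned == false it returns NVPTX::CVT_u32_u16, which zero-extends.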
4071 
4072 bool NVPTXDAGToDAGISel::tryWMMA_LDST(SDNode *N) {
4073  SDValue Chain = N->getOperand(0);
4074  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
4075  SDValue Op1 = N->getOperand(2);
4076  SDValue Addr, Offset, Base;
4077  Optional<unsigned> Opcode;
4078  SDLoc DL(N);
4079  MemSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
4080  WmmaVariant Variant;
4081  SmallVector<SDValue, 12> Ops;
4082  bool isStore = N->getNumValues() == 1; // Store ops only return a chain.
4083 
4084  if (SelectDirectAddr(Op1, Addr)) {
4085  Variant = WMMA_VARIANT_AVAR;
4086  Ops.push_back(Addr);
4087  } else if (SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) ||
4088  SelectADDRri64(Op1.getNode(), Op1, Base, Offset)) {
4089  Variant = WMMA_VARIANT_ARI64;
4090  Ops.push_back(Base);
4091  Ops.push_back(Offset);
4092  } else {
4093  Variant = WMMA_VARIANT_AVAR;
4094  Ops.push_back(Op1);
4095  }
4096  unsigned NumOps = N->getNumOperands();
4097  // Pass through the rest of the operands to the machine node.
4098  for (unsigned i = 3; i < NumOps; ++i)
4099  Ops.push_back(N->getOperand(i));
4100  Ops.push_back(Chain);
4101 
4102  Opcode = getWmmaLdStOpcode(IID, Variant);
4103  if (!Opcode) {
4104  llvm::errs() << "tryWMMALD - no Opcode.\n";
4105  return false;
4106  }
4107 
4108  EVT MemVT = MemSD->getMemoryVT();
4109  assert(MemVT.isVector() && "Expected vector return type.");
4110 
4111  SDNode *MN;
4112  if (isStore) {
4113  MN = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
4114  } else {
4115  SmallVector<EVT, 9> InstVTs(MemVT.getVectorNumElements(),
4116  MemSD->getValueType(0));
4117  InstVTs.push_back(MVT::Other);
4118  MN = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTs, Ops);
4119  }
4120 
4121  ReplaceNode(N, MN);
4122  return true;
4123 }
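// Note (editorial): the three arms above choose the WMMA addressing variant:
// a direct symbol selects WMMA_VARIANT_AVAR, a 64-bit base+offset pair
// (symbol or register) selects WMMA_VARIANT_ARI64, and anything else falls
// back to passing the whole address as an AVAR operand. Loads define one
// result per vector element plus a chain; stores define only the chain,
// which is why isStore is inferred from getNumValues() == 1.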
4124 
4125 bool NVPTXDAGToDAGISel::tryWMMA_MMA(SDNode *N) {
4126  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
4127  SDLoc DL(N);
4128  unsigned Opc;
4129 
4130  switch (IID) {
4131  default:
4132  return false;
4133  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16:
4134  Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f16;
4135  break;
4136  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f16_satfinite:
4137  Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f16_satfinite;
4138  break;
4139  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32:
4140  Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f32;
4141  break;
4142  case Intrinsic::nvvm_wmma_mma_sync_col_col_f16_f32_satfinite:
4143  Opc = NVPTX::INT_WMMA_MMA_col_col_f16_f32_satfinite;
4144  break;
4145  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16:
4146  Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f16;
4147  break;
4148  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f16_satfinite:
4149  Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f16_satfinite;
4150  break;
4151  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32:
4152  Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f32;
4153  break;
4154  case Intrinsic::nvvm_wmma_mma_sync_col_col_f32_f32_satfinite:
4155  Opc = NVPTX::INT_WMMA_MMA_col_col_f32_f32_satfinite;
4156  break;
4157  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16:
4158  Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f16;
4159  break;
4160  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f16_satfinite:
4161  Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f16_satfinite;
4162  break;
4163  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32:
4164  Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f32;
4165  break;
4166  case Intrinsic::nvvm_wmma_mma_sync_col_row_f16_f32_satfinite:
4167  Opc = NVPTX::INT_WMMA_MMA_col_row_f16_f32_satfinite;
4168  break;
4169  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16:
4170  Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f16;
4171  break;
4172  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f16_satfinite:
4173  Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f16_satfinite;
4174  break;
4175  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32:
4176  Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f32;
4177  break;
4178  case Intrinsic::nvvm_wmma_mma_sync_col_row_f32_f32_satfinite:
4179  Opc = NVPTX::INT_WMMA_MMA_col_row_f32_f32_satfinite;
4180  break;
4181  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16:
4182  Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f16;
4183  break;
4184  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f16_satfinite:
4185  Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f16_satfinite;
4186  break;
4187  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32:
4188  Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f32;
4189  break;
4190  case Intrinsic::nvvm_wmma_mma_sync_row_col_f16_f32_satfinite:
4191  Opc = NVPTX::INT_WMMA_MMA_row_col_f16_f32_satfinite;
4192  break;
4193  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16:
4194  Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f16;
4195  break;
4196  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16_satfinite:
4197  Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f16_satfinite;
4198  break;
4199  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32:
4200  Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f32;
4201  break;
4202  case Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f32_satfinite:
4203  Opc = NVPTX::INT_WMMA_MMA_row_col_f32_f32_satfinite;
4204  break;
4205  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16:
4206  Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f16;
4207  break;
4208  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f16_satfinite:
4209  Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f16_satfinite;
4210  break;
4211  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32:
4212  Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f32;
4213  break;
4214  case Intrinsic::nvvm_wmma_mma_sync_row_row_f16_f32_satfinite:
4215  Opc = NVPTX::INT_WMMA_MMA_row_row_f16_f32_satfinite;
4216  break;
4217  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16:
4218  Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f16;
4219  break;
4220  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f16_satfinite:
4221  Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f16_satfinite;
4222  break;
4223  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32:
4224  Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f32;
4225  break;
4226  case Intrinsic::nvvm_wmma_mma_sync_row_row_f32_f32_satfinite:
4227  Opc = NVPTX::INT_WMMA_MMA_row_row_f32_f32_satfinite;
4228  break;
4229  }
4230 
4231  SmallVector<SDValue, 24> Ops;
4232  // Pass through operands and return value types to the machine node.
4233  for (unsigned i = 1; i < N->getNumOperands(); ++i)
4234  Ops.push_back(N->getOperand(i));
4235  SmallVector<EVT, 8> InstVTs(N->getNumValues(), N->getValueType(0));
4236  SDNode *MN = CurDAG->getMachineNode(Opc, DL, InstVTs, Ops);
4237  ReplaceNode(N, MN);
4238  return true;
4239 }
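// Example mapping (illustrative; the PTX mnemonic is an assumption, not
// taken from this file): Intrinsic::nvvm_wmma_mma_sync_row_col_f32_f16
// selects NVPTX::INT_WMMA_MMA_row_col_f32_f16, which should print as
// something like "wmma.mma.sync.row.col.m16n16k16.f32.f16". All fragment
// operands after the intrinsic ID are forwarded unchanged to the machine
// node, and every result shares the node's first value type.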