1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "NVPTXISelLowering.h"
16 #include "MCTargetDesc/NVPTXBaseInfo.h"
17 #include "NVPTX.h"
18 #include "NVPTXSubtarget.h"
19 #include "NVPTXTargetMachine.h"
20 #include "NVPTXTargetObjectFile.h"
21 #include "NVPTXUtilities.h"
22 #include "llvm/ADT/APInt.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/CodeGen/Analysis.h"
33 #include "llvm/IR/Argument.h"
34 #include "llvm/IR/Attributes.h"
35 #include "llvm/IR/CallSite.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/DataLayout.h"
38 #include "llvm/IR/DerivedTypes.h"
39 #include "llvm/IR/Function.h"
40 #include "llvm/IR/GlobalValue.h"
41 #include "llvm/IR/Instruction.h"
42 #include "llvm/IR/Instructions.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/Type.h"
45 #include "llvm/IR/Value.h"
46 #include "llvm/Support/Casting.h"
47 #include "llvm/Support/CodeGen.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <iterator>
59 #include <sstream>
60 #include <string>
61 #include <utility>
62 #include <vector>
63 
64 #define DEBUG_TYPE "nvptx-lower"
65 
66 using namespace llvm;
67 
68 static unsigned int uniqueCallSite = 0;
69 
70 static cl::opt<bool> sched4reg(
71  "nvptx-sched4reg",
72  cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
73 
74 static cl::opt<unsigned>
75  FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
76  cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
77  " 1: do it, 2: do it aggressively)"),
78  cl::init(2));
79 
80 static cl::opt<int> UsePrecDivF32(
81  "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
82  cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
83  " IEEE Compliant F32 div.rnd if available."),
84  cl::init(2));
85 
86 static cl::opt<bool> UsePrecSqrtF32(
87  "nvptx-prec-sqrtf32", cl::Hidden,
88  cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
89  cl::init(true));
90 
91 static cl::opt<bool> FtzEnabled(
92  "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
93  cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
94  cl::init(false));
95 
96 int NVPTXTargetLowering::getDivF32Level() const {
97  if (UsePrecDivF32.getNumOccurrences() > 0) {
98  // If nvptx-prec-divf32=N is used on the command-line, always honor it
99  return UsePrecDivF32;
100  } else {
101  // Otherwise, use div.approx if fast math is enabled
102  if (getTargetMachine().Options.UnsafeFPMath)
103  return 0;
104  else
105  return 2;
106  }
107 }
108 
109 bool NVPTXTargetLowering::usePrecSqrtF32() const {
110  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
111  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
112  return UsePrecSqrtF32;
113  } else {
114  // Otherwise, use sqrt.approx if fast math is enabled
115  return !getTargetMachine().Options.UnsafeFPMath;
116  }
117 }
118 
119 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
120  // TODO: Get rid of this flag; there can be only one way to do this.
121  if (FtzEnabled.getNumOccurrences() > 0) {
122  // If nvptx-f32ftz is used on the command-line, always honor it
123  return FtzEnabled;
124  } else {
125  const Function &F = MF.getFunction();
126  // Otherwise, check for an nvptx-f32ftz attribute on the function
127  if (F.hasFnAttribute("nvptx-f32ftz"))
128  return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
129  else
130  return false;
131  }
132 }
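// Illustrative usage (not part of the original source): the cl::opt flags
// above and the per-function "nvptx-f32ftz" attribute can be exercised from
// the llc command line, e.g.
//   llc -march=nvptx64 -mcpu=sm_35 -nvptx-prec-divf32=0 \
//       -nvptx-prec-sqrtf32=0 -nvptx-f32ftz=1 input.ll
// which selects div.approx/sqrt.approx for f32 and flushes f32 subnormals
// to zero.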
133 
134 static bool IsPTXVectorType(MVT VT) {
135  switch (VT.SimpleTy) {
136  default:
137  return false;
138  case MVT::v2i1:
139  case MVT::v4i1:
140  case MVT::v2i8:
141  case MVT::v4i8:
142  case MVT::v2i16:
143  case MVT::v4i16:
144  case MVT::v2i32:
145  case MVT::v4i32:
146  case MVT::v2i64:
147  case MVT::v2f16:
148  case MVT::v4f16:
149  case MVT::v8f16: // <4 x f16x2>
150  case MVT::v2f32:
151  case MVT::v4f32:
152  case MVT::v2f64:
153  return true;
154  }
155 }
156 
157 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
158 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
159 /// into their primitive components.
160 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
161 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
162 /// LowerCall, and LowerReturn.
163 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
164  Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
165  SmallVectorImpl<uint64_t> *Offsets = nullptr,
166  uint64_t StartingOffset = 0) {
167  SmallVector<EVT, 16> TempVTs;
168  SmallVector<uint64_t, 16> TempOffsets;
169 
170  // Special case for i128 - decompose to (i64, i64)
171  if (Ty->isIntegerTy(128)) {
172  ValueVTs.push_back(EVT(MVT::i64));
173  ValueVTs.push_back(EVT(MVT::i64));
174 
175  if (Offsets) {
176  Offsets->push_back(StartingOffset + 0);
177  Offsets->push_back(StartingOffset + 8);
178  }
179 
180  return;
181  }
182 
183  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
184  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
185  EVT VT = TempVTs[i];
186  uint64_t Off = TempOffsets[i];
187  // Split vectors into individual elements, except for v2f16, which
188  // we will pass as a single scalar.
189  if (VT.isVector()) {
190  unsigned NumElts = VT.getVectorNumElements();
191  EVT EltVT = VT.getVectorElementType();
192  // Vectors with an even number of f16 elements will be passed to
193  // us as an array of v2f16 elements. We must match this so we
194  // stay in sync with Ins/Outs.
195  if (EltVT == MVT::f16 && NumElts % 2 == 0) {
196  EltVT = MVT::v2f16;
197  NumElts /= 2;
198  }
199  for (unsigned j = 0; j != NumElts; ++j) {
200  ValueVTs.push_back(EltVT);
201  if (Offsets)
202  Offsets->push_back(Off + j * EltVT.getStoreSize());
203  }
204  } else {
205  ValueVTs.push_back(VT);
206  if (Offsets)
207  Offsets->push_back(Off);
208  }
209  }
210 }
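// Illustrative examples of the flattening performed above (not part of the
// original source), assuming a typical nvptx64 DataLayout:
//   i128        -> { i64, i64 }           at offsets { 0, 8 }
//   <4 x float> -> { f32, f32, f32, f32 } at offsets { 0, 4, 8, 12 }
//   <4 x half>  -> { v2f16, v2f16 }       at offsets { 0, 4 }
// The last case is why this helper must stay in sync with Ins/Outs: an even
// number of f16 elements travels as packed v2f16 pieces, not as scalars.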
211 
212 // Check whether we can merge loads/stores of some of the pieces of a
213 // flattened function parameter or return value into a single vector
214 // load/store.
215 //
216 // The flattened parameter is represented as a list of EVTs and
217 // offsets, and the whole structure is aligned to ParamAlignment. This
218 // function determines whether we can load/store pieces of the
219 // parameter starting at index Idx using a single vectorized op of
220 // size AccessSize. If so, it returns the number of param pieces
221 // covered by the vector op. Otherwise, it returns 1.
222 static unsigned CanMergeParamLoadStoresStartingAt(
223  unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
224  const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
225  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
226 
227  // Can't vectorize if param alignment is not sufficient.
228  if (AccessSize > ParamAlignment)
229  return 1;
230  // Can't vectorize if offset is not aligned.
231  if (Offsets[Idx] & (AccessSize - 1))
232  return 1;
233 
234  EVT EltVT = ValueVTs[Idx];
235  unsigned EltSize = EltVT.getStoreSize();
236 
237  // Element is too large to vectorize.
238  if (EltSize >= AccessSize)
239  return 1;
240 
241  unsigned NumElts = AccessSize / EltSize;
242  // Can't vectorize if AccessSize is not a multiple of EltSize.
243  if (AccessSize != EltSize * NumElts)
244  return 1;
245 
246  // We don't have enough elements to vectorize.
247  if (Idx + NumElts > ValueVTs.size())
248  return 1;
249 
250  // PTX ISA can only deal with 2- and 4-element vector ops.
251  if (NumElts != 4 && NumElts != 2)
252  return 1;
253 
254  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
255  // Types do not match.
256  if (ValueVTs[j] != EltVT)
257  return 1;
258 
259  // Elements are not contiguous.
260  if (Offsets[j] - Offsets[j - 1] != EltSize)
261  return 1;
262  }
263  // OK. We can vectorize ValueVTs[i..i+NumElts)
264  return NumElts;
265 }
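// Worked example (illustrative only): for a parameter flattened to
// ValueVTs = { f32, f32, f32, f32 } at Offsets = { 0, 4, 8, 12 } with
// ParamAlignment = 16, calling this with Idx = 0 and AccessSize = 16
// returns 4 (a single v4.f32 access covers all four pieces). With
// ParamAlignment = 8 the same query returns 1, while AccessSize = 8
// returns 2 (two v2.f32 accesses).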
266 
267 // Flags for tracking per-element vectorization state of loads/stores
268 // of a flattened function parameter or return value.
269 enum ParamVectorizationFlags {
270  PVF_INNER = 0x0, // Middle elements of a vector.
271  PVF_FIRST = 0x1, // First element of the vector.
272  PVF_LAST = 0x2, // Last element of the vector.
273  // Scalar is effectively a 1-element vector.
274  PVF_SCALAR = PVF_FIRST | PVF_LAST,
275 };
276 
277 // Computes whether and how we can vectorize the loads/stores of a
278 // flattened function parameter or return value.
279 //
280 // The flattened parameter is represented as the list of ValueVTs and
281 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
282 // of the same size as ValueVTs indicating how each piece should be
283 // loaded/stored (i.e. as a scalar, or as part of a vector
284 // load/store).
285 static SmallVector<ParamVectorizationFlags, 16>
286 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
287  const SmallVectorImpl<uint64_t> &Offsets,
288  unsigned ParamAlignment) {
289  // Set vector size to match ValueVTs and mark all elements as
290  // scalars by default.
291  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
292  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
293 
294  // Check what we can vectorize using 128/64/32-bit accesses.
295  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
296  // Skip elements we've already processed.
297  assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
298  for (unsigned AccessSize : {16, 8, 4, 2}) {
299  unsigned NumElts = CanMergeParamLoadStoresStartingAt(
300  I, AccessSize, ValueVTs, Offsets, ParamAlignment);
301  // Mark vectorized elements.
302  switch (NumElts) {
303  default:
304  llvm_unreachable("Unexpected return value");
305  case 1:
306  // Can't vectorize using this size, try next smaller size.
307  continue;
308  case 2:
309  assert(I + 1 < E && "Not enough elements.");
310  VectorInfo[I] = PVF_FIRST;
311  VectorInfo[I + 1] = PVF_LAST;
312  I += 1;
313  break;
314  case 4:
315  assert(I + 3 < E && "Not enough elements.");
316  VectorInfo[I] = PVF_FIRST;
317  VectorInfo[I + 1] = PVF_INNER;
318  VectorInfo[I + 2] = PVF_INNER;
319  VectorInfo[I + 3] = PVF_LAST;
320  I += 3;
321  break;
322  }
323  // Break out of the inner loop because we've already succeeded
324  // using largest possible AccessSize.
325  break;
326  }
327  }
328  return VectorInfo;
329 }
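// Illustrative result (not part of the original source): with
// ValueVTs = { f32, f32, f32, f32 }, Offsets = { 0, 4, 8, 12 } and
// ParamAlignment = 16, this returns { PVF_FIRST, PVF_INNER, PVF_INNER,
// PVF_LAST } (one v4 access); with ParamAlignment = 8 it returns
// { PVF_FIRST, PVF_LAST, PVF_FIRST, PVF_LAST } (two v2 accesses).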
330 
331 // NVPTXTargetLowering Constructor.
332 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
333  const NVPTXSubtarget &STI)
334  : TargetLowering(TM), nvTM(&TM), STI(STI) {
335  // Always lower memset, memcpy, and memmove intrinsics to load/store
336  // instructions, rather than generating calls to memset, memcpy, or
337  // memmove.
338  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
339  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
340  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
341 
344 
345  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
346  // condition branches.
347  setJumpIsExpensive(true);
348 
349  // Wide divides are _very_ slow. Try to reduce the width of the divide if
350  // possible.
351  addBypassSlowDiv(64, 32);
352 
353  // By default, use the Source scheduling
354  if (sched4reg)
355  setSchedulingPreference(Sched::RegPressure);
356  else
357  setSchedulingPreference(Sched::Source);
358 
359  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
360  LegalizeAction NoF16Action) {
361  setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
362  };
363 
364  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
365  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
366  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
367  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
368  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
369  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
370  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
371  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
372 
373  // Conversion to/from FP16/FP16x2 is always legal.
379 
380  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
381  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
382 
383  // Operations not directly supported by NVPTX.
388  }
389 
390  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
391  // For others we will expand to a SHL/SRA pair.
397 
404 
407 
408  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
409  // that don't have h/w rotation we lower them to multi-instruction assembly.
410  // See ROT*_sw in NVPTXIntrInfo.td
415 
423 
424  // Indirect branch is not supported.
425  // This also disables Jump Table creation.
428 
431 
432  // We want to legalize constant related memmove and memcpy
433  // intrinsics.
435 
436  // Turn FP extload into load/fpextend
446  // Turn FP truncstore into trunc + store.
447  // FIXME: vector types should also be expanded
451 
452  // PTX does not support load / store predicate registers
455 
456  for (MVT VT : MVT::integer_valuetypes()) {
460  }
461 
462  // This is legal in NVPTX
466 
467  // TRAP can be lowered to PTX trap
469 
470  // Register custom handling for vector loads/stores
471  for (MVT VT : MVT::vector_valuetypes()) {
472  if (IsPTXVectorType(VT)) {
476  }
477  }
478 
479  // Custom handling for i8 intrinsics
481 
482  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
488 
491  }
492 
496 
497  // PTX does not directly support SELP of i1, so promote to i32 first
499 
500  // PTX cannot multiply two i64s in a single instruction.
503 
504  // We have some custom DAG combine patterns for these nodes
512 
513  // setcc for f16x2 needs special handling to prevent legalizer's
514  // attempt to scalarize it due to v2i1 not being legal.
515  if (STI.allowFP16Math())
517 
518  // Promote fp16 arithmetic if fp16 hardware isn't available or the
519  // user passed --nvptx-no-fp16-math. The flag is useful because,
520  // although sm_53+ GPUs have some sort of FP16 support in
521  // hardware, only sm_53 and sm_60 have full implementation. Others
522  // only have token amount of hardware and are likely to run faster
523  // by using fp32 units instead.
524  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
525  setFP16OperationAction(Op, MVT::f16, Legal, Promote);
526  setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
527  }
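  // Illustrative consequence (not part of the original source): when fp16
  // hardware support is not allowed, an f16 fadd is Promoted, i.e. lowered
  // roughly as cvt.f32.f16 on the inputs, add.rn.f32, then cvt.rn.f16.f32 on
  // the result, trading extra conversions for correct results on fp32 units.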
528 
529  // There's no neg.f16 instruction. Expand to (0-x).
532 
533  // (would be) Library functions.
534 
535  // These map to conversion instructions for scalar FP types.
536  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
542  }
543 
544  // 'Expand' implements FCOPYSIGN without calling an external library.
549 
550  // These map to corresponding instructions for f32/f64. f16 must be
551  // promoted to f32. v2f16 is expanded to f16, which is then promoted
552  // to f32.
553  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
559  }
564 
565  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
566  // No FPOW or FREM in PTX.
567 
568  // Now deduce the information based on the above mentioned
569  // actions
571 }
572 
573 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
574  switch ((NVPTXISD::NodeType)Opcode) {
575  case NVPTXISD::FIRST_NUMBER:
576  break;
577  case NVPTXISD::CALL:
578  return "NVPTXISD::CALL";
579  case NVPTXISD::RET_FLAG:
580  return "NVPTXISD::RET_FLAG";
581  case NVPTXISD::LOAD_PARAM:
582  return "NVPTXISD::LOAD_PARAM";
583  case NVPTXISD::Wrapper:
584  return "NVPTXISD::Wrapper";
585  case NVPTXISD::DeclareParam:
586  return "NVPTXISD::DeclareParam";
587  case NVPTXISD::DeclareScalarParam:
588  return "NVPTXISD::DeclareScalarParam";
589  case NVPTXISD::DeclareRet:
590  return "NVPTXISD::DeclareRet";
591  case NVPTXISD::DeclareScalarRet:
592  return "NVPTXISD::DeclareScalarRet";
593  case NVPTXISD::DeclareRetParam:
594  return "NVPTXISD::DeclareRetParam";
595  case NVPTXISD::PrintCall:
596  return "NVPTXISD::PrintCall";
597  case NVPTXISD::PrintConvergentCall:
598  return "NVPTXISD::PrintConvergentCall";
599  case NVPTXISD::PrintCallUni:
600  return "NVPTXISD::PrintCallUni";
601  case NVPTXISD::PrintConvergentCallUni:
602  return "NVPTXISD::PrintConvergentCallUni";
603  case NVPTXISD::LoadParam:
604  return "NVPTXISD::LoadParam";
605  case NVPTXISD::LoadParamV2:
606  return "NVPTXISD::LoadParamV2";
607  case NVPTXISD::LoadParamV4:
608  return "NVPTXISD::LoadParamV4";
609  case NVPTXISD::StoreParam:
610  return "NVPTXISD::StoreParam";
611  case NVPTXISD::StoreParamV2:
612  return "NVPTXISD::StoreParamV2";
613  case NVPTXISD::StoreParamV4:
614  return "NVPTXISD::StoreParamV4";
615  case NVPTXISD::StoreParamS32:
616  return "NVPTXISD::StoreParamS32";
617  case NVPTXISD::StoreParamU32:
618  return "NVPTXISD::StoreParamU32";
619  case NVPTXISD::CallArgBegin:
620  return "NVPTXISD::CallArgBegin";
621  case NVPTXISD::CallArg:
622  return "NVPTXISD::CallArg";
623  case NVPTXISD::LastCallArg:
624  return "NVPTXISD::LastCallArg";
625  case NVPTXISD::CallArgEnd:
626  return "NVPTXISD::CallArgEnd";
627  case NVPTXISD::CallVoid:
628  return "NVPTXISD::CallVoid";
629  case NVPTXISD::CallVal:
630  return "NVPTXISD::CallVal";
631  case NVPTXISD::CallSymbol:
632  return "NVPTXISD::CallSymbol";
633  case NVPTXISD::Prototype:
634  return "NVPTXISD::Prototype";
635  case NVPTXISD::MoveParam:
636  return "NVPTXISD::MoveParam";
637  case NVPTXISD::StoreRetval:
638  return "NVPTXISD::StoreRetval";
639  case NVPTXISD::StoreRetvalV2:
640  return "NVPTXISD::StoreRetvalV2";
641  case NVPTXISD::StoreRetvalV4:
642  return "NVPTXISD::StoreRetvalV4";
643  case NVPTXISD::PseudoUseParam:
644  return "NVPTXISD::PseudoUseParam";
645  case NVPTXISD::RETURN:
646  return "NVPTXISD::RETURN";
647  case NVPTXISD::CallSeqBegin:
648  return "NVPTXISD::CallSeqBegin";
649  case NVPTXISD::CallSeqEnd:
650  return "NVPTXISD::CallSeqEnd";
651  case NVPTXISD::CallPrototype:
652  return "NVPTXISD::CallPrototype";
653  case NVPTXISD::LoadV2:
654  return "NVPTXISD::LoadV2";
655  case NVPTXISD::LoadV4:
656  return "NVPTXISD::LoadV4";
657  case NVPTXISD::LDGV2:
658  return "NVPTXISD::LDGV2";
659  case NVPTXISD::LDGV4:
660  return "NVPTXISD::LDGV4";
661  case NVPTXISD::LDUV2:
662  return "NVPTXISD::LDUV2";
663  case NVPTXISD::LDUV4:
664  return "NVPTXISD::LDUV4";
665  case NVPTXISD::StoreV2:
666  return "NVPTXISD::StoreV2";
667  case NVPTXISD::StoreV4:
668  return "NVPTXISD::StoreV4";
669  case NVPTXISD::FUN_SHFL_CLAMP:
670  return "NVPTXISD::FUN_SHFL_CLAMP";
671  case NVPTXISD::FUN_SHFR_CLAMP:
672  return "NVPTXISD::FUN_SHFR_CLAMP";
673  case NVPTXISD::IMAD:
674  return "NVPTXISD::IMAD";
675  case NVPTXISD::SETP_F16X2:
676  return "NVPTXISD::SETP_F16X2";
677  case NVPTXISD::Dummy:
678  return "NVPTXISD::Dummy";
679  case NVPTXISD::MUL_WIDE_SIGNED:
680  return "NVPTXISD::MUL_WIDE_SIGNED";
681  case NVPTXISD::MUL_WIDE_UNSIGNED:
682  return "NVPTXISD::MUL_WIDE_UNSIGNED";
683  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
684  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
686  return "NVPTXISD::Tex1DFloatFloatLevel";
688  return "NVPTXISD::Tex1DFloatFloatGrad";
689  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
690  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
692  return "NVPTXISD::Tex1DS32FloatLevel";
694  return "NVPTXISD::Tex1DS32FloatGrad";
695  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
696  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
698  return "NVPTXISD::Tex1DU32FloatLevel";
700  return "NVPTXISD::Tex1DU32FloatGrad";
701  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
702  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
704  return "NVPTXISD::Tex1DArrayFloatFloatLevel";
706  return "NVPTXISD::Tex1DArrayFloatFloatGrad";
707  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
708  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
710  return "NVPTXISD::Tex1DArrayS32FloatLevel";
712  return "NVPTXISD::Tex1DArrayS32FloatGrad";
713  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
714  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
716  return "NVPTXISD::Tex1DArrayU32FloatLevel";
718  return "NVPTXISD::Tex1DArrayU32FloatGrad";
719  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
720  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
722  return "NVPTXISD::Tex2DFloatFloatLevel";
724  return "NVPTXISD::Tex2DFloatFloatGrad";
725  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
726  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
728  return "NVPTXISD::Tex2DS32FloatLevel";
730  return "NVPTXISD::Tex2DS32FloatGrad";
731  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
732  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
734  return "NVPTXISD::Tex2DU32FloatLevel";
736  return "NVPTXISD::Tex2DU32FloatGrad";
737  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
738  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
740  return "NVPTXISD::Tex2DArrayFloatFloatLevel";
742  return "NVPTXISD::Tex2DArrayFloatFloatGrad";
743  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
744  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
746  return "NVPTXISD::Tex2DArrayS32FloatLevel";
748  return "NVPTXISD::Tex2DArrayS32FloatGrad";
749  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
750  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
752  return "NVPTXISD::Tex2DArrayU32FloatLevel";
754  return "NVPTXISD::Tex2DArrayU32FloatGrad";
755  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
756  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
758  return "NVPTXISD::Tex3DFloatFloatLevel";
760  return "NVPTXISD::Tex3DFloatFloatGrad";
761  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
762  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
764  return "NVPTXISD::Tex3DS32FloatLevel";
766  return "NVPTXISD::Tex3DS32FloatGrad";
767  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
768  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
770  return "NVPTXISD::Tex3DU32FloatLevel";
772  return "NVPTXISD::Tex3DU32FloatGrad";
773  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
775  return "NVPTXISD::TexCubeFloatFloatLevel";
776  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
778  return "NVPTXISD::TexCubeS32FloatLevel";
779  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
781  return "NVPTXISD::TexCubeU32FloatLevel";
783  return "NVPTXISD::TexCubeArrayFloatFloat";
785  return "NVPTXISD::TexCubeArrayFloatFloatLevel";
787  return "NVPTXISD::TexCubeArrayS32Float";
789  return "NVPTXISD::TexCubeArrayS32FloatLevel";
791  return "NVPTXISD::TexCubeArrayU32Float";
793  return "NVPTXISD::TexCubeArrayU32FloatLevel";
795  return "NVPTXISD::Tld4R2DFloatFloat";
797  return "NVPTXISD::Tld4G2DFloatFloat";
799  return "NVPTXISD::Tld4B2DFloatFloat";
801  return "NVPTXISD::Tld4A2DFloatFloat";
803  return "NVPTXISD::Tld4R2DS64Float";
805  return "NVPTXISD::Tld4G2DS64Float";
807  return "NVPTXISD::Tld4B2DS64Float";
809  return "NVPTXISD::Tld4A2DS64Float";
811  return "NVPTXISD::Tld4R2DU64Float";
813  return "NVPTXISD::Tld4G2DU64Float";
815  return "NVPTXISD::Tld4B2DU64Float";
817  return "NVPTXISD::Tld4A2DU64Float";
818 
820  return "NVPTXISD::TexUnified1DFloatS32";
822  return "NVPTXISD::TexUnified1DFloatFloat";
824  return "NVPTXISD::TexUnified1DFloatFloatLevel";
826  return "NVPTXISD::TexUnified1DFloatFloatGrad";
828  return "NVPTXISD::TexUnified1DS32S32";
830  return "NVPTXISD::TexUnified1DS32Float";
832  return "NVPTXISD::TexUnified1DS32FloatLevel";
834  return "NVPTXISD::TexUnified1DS32FloatGrad";
836  return "NVPTXISD::TexUnified1DU32S32";
838  return "NVPTXISD::TexUnified1DU32Float";
840  return "NVPTXISD::TexUnified1DU32FloatLevel";
842  return "NVPTXISD::TexUnified1DU32FloatGrad";
844  return "NVPTXISD::TexUnified1DArrayFloatS32";
846  return "NVPTXISD::TexUnified1DArrayFloatFloat";
848  return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
850  return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
852  return "NVPTXISD::TexUnified1DArrayS32S32";
854  return "NVPTXISD::TexUnified1DArrayS32Float";
856  return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
858  return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
860  return "NVPTXISD::TexUnified1DArrayU32S32";
862  return "NVPTXISD::TexUnified1DArrayU32Float";
864  return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
866  return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
868  return "NVPTXISD::TexUnified2DFloatS32";
870  return "NVPTXISD::TexUnified2DFloatFloat";
872  return "NVPTXISD::TexUnified2DFloatFloatLevel";
874  return "NVPTXISD::TexUnified2DFloatFloatGrad";
876  return "NVPTXISD::TexUnified2DS32S32";
878  return "NVPTXISD::TexUnified2DS32Float";
880  return "NVPTXISD::TexUnified2DS32FloatLevel";
882  return "NVPTXISD::TexUnified2DS32FloatGrad";
884  return "NVPTXISD::TexUnified2DU32S32";
886  return "NVPTXISD::TexUnified2DU32Float";
888  return "NVPTXISD::TexUnified2DU32FloatLevel";
890  return "NVPTXISD::TexUnified2DU32FloatGrad";
892  return "NVPTXISD::TexUnified2DArrayFloatS32";
894  return "NVPTXISD::TexUnified2DArrayFloatFloat";
896  return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
898  return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
900  return "NVPTXISD::TexUnified2DArrayS32S32";
902  return "NVPTXISD::TexUnified2DArrayS32Float";
904  return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
906  return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
908  return "NVPTXISD::TexUnified2DArrayU32S32";
910  return "NVPTXISD::TexUnified2DArrayU32Float";
912  return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
914  return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
916  return "NVPTXISD::TexUnified3DFloatS32";
918  return "NVPTXISD::TexUnified3DFloatFloat";
920  return "NVPTXISD::TexUnified3DFloatFloatLevel";
922  return "NVPTXISD::TexUnified3DFloatFloatGrad";
924  return "NVPTXISD::TexUnified3DS32S32";
926  return "NVPTXISD::TexUnified3DS32Float";
928  return "NVPTXISD::TexUnified3DS32FloatLevel";
930  return "NVPTXISD::TexUnified3DS32FloatGrad";
932  return "NVPTXISD::TexUnified3DU32S32";
934  return "NVPTXISD::TexUnified3DU32Float";
936  return "NVPTXISD::TexUnified3DU32FloatLevel";
938  return "NVPTXISD::TexUnified3DU32FloatGrad";
940  return "NVPTXISD::TexUnifiedCubeFloatFloat";
942  return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
944  return "NVPTXISD::TexUnifiedCubeS32Float";
946  return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
948  return "NVPTXISD::TexUnifiedCubeU32Float";
950  return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
952  return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
954  return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
956  return "NVPTXISD::TexUnifiedCubeArrayS32Float";
958  return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
960  return "NVPTXISD::TexUnifiedCubeArrayU32Float";
962  return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
964  return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
966  return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
968  return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
970  return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
972  return "NVPTXISD::Tld4UnifiedR2DS64Float";
974  return "NVPTXISD::Tld4UnifiedG2DS64Float";
976  return "NVPTXISD::Tld4UnifiedB2DS64Float";
978  return "NVPTXISD::Tld4UnifiedA2DS64Float";
980  return "NVPTXISD::Tld4UnifiedR2DU64Float";
982  return "NVPTXISD::Tld4UnifiedG2DU64Float";
984  return "NVPTXISD::Tld4UnifiedB2DU64Float";
986  return "NVPTXISD::Tld4UnifiedA2DU64Float";
987 
988  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
989  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
990  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
991  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
992  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
993  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
994  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
995  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
996  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
997  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
998  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
999 
1000  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1001  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1002  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1003  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1004  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1005  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1006  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1007  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1008  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1009  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1010  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1011 
1012  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1013  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1014  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1015  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1016  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1017  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1018  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1019  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1020  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1021  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1022  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1023 
1024  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1025  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1026  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1027  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1028  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1029  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1030  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1031  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1032  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1033  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1034  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1035 
1036  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1037  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1038  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1039  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1040  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1041  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1042  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1043  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1044  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1045  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1046  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1047 
1048  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1049  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1050  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1051  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1052  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1053  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1054  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1055  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1056  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1057  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1058  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1059 
1060  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1061  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1062  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1063  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1064  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1065  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1066  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1067  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1068  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1069  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1070  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1071 
1072  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1073  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1074  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1075  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1076  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1077  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1078  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1079  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1080  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1081  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1082  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1083 
1084  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1085  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1086  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1087  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1088  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1089  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1090  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1091  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1092  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1093  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1094  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1095 
1096  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1097  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1098  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1099  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1100  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1101  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1102  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1103  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1104  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1105  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1106  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1107 
1108  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1109  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1110  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1111  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1112  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1113  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1114  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1115  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1116  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1117  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1118  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1119 
1120  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1121  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1122  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1123  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1124  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1125  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1126  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1127  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1128  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1129  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1130  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1131 
1132  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1133  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1134  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1135  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1136  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1137  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1138  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1139  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1140  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1141  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1142  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1143 
1144  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1145  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1146  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1147  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1148  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1149  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1150  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1151  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1152  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1153  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1154  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1155 
1156  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1157  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1158  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1159  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1160  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1161  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1162  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1163  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1164  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1165  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1166  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1167  }
1168  return nullptr;
1169 }
1170 
1171 TargetLoweringBase::LegalizeTypeAction
1172 NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
1173  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
1174  return TypeSplitVector;
1175  if (VT == MVT::v2f16)
1176  return TypeLegal;
1177  return TargetLoweringBase::getPreferredVectorAction(VT);
1178 }
1179 
1180 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1181  int Enabled, int &ExtraSteps,
1182  bool &UseOneConst,
1183  bool Reciprocal) const {
1184  if (!(Enabled == ReciprocalEstimate::Enabled ||
1185  (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1186  return SDValue();
1187 
1188  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1189  ExtraSteps = 0;
1190 
1191  SDLoc DL(Operand);
1192  EVT VT = Operand.getValueType();
1193  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1194 
1195  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1196  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1197  DAG.getConstant(IID, DL, MVT::i32), Operand);
1198  };
1199 
1200  // The sqrt and rsqrt refinement processes assume we always start out with an
1201  // approximation of the rsqrt. Therefore, if we're going to do any refinement
1202  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1203  // any refinement, we must return a regular sqrt.
1204  if (Reciprocal || ExtraSteps > 0) {
1205  if (VT == MVT::f32)
1206  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1207  : Intrinsic::nvvm_rsqrt_approx_f);
1208  else if (VT == MVT::f64)
1209  return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1210  else
1211  return SDValue();
1212  } else {
1213  if (VT == MVT::f32)
1214  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1215  : Intrinsic::nvvm_sqrt_approx_f);
1216  else {
1217  // There's no sqrt.approx.f64 instruction, so we emit
1218  // reciprocal(rsqrt(x)). This is faster than
1219  // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1220  // x * rsqrt(x).)
1221  return DAG.getNode(
1222  ISD::INTRINSIC_WO_CHAIN, DL, VT,
1223  DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1224  MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1225  }
1226  }
1227 }
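// Illustrative mapping (not part of the original source): the NVVM
// intrinsics chosen above correspond to PTX instructions, e.g.
// nvvm_sqrt_approx_f -> sqrt.approx.f32 (sqrt.approx.ftz.f32 in ftz mode),
// nvvm_rsqrt_approx_f/_d -> rsqrt.approx.f32/.f64, and the approximate f64
// square root is synthesized as rcp.approx.ftz.f64(rsqrt.approx.f64(x)).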
1228 
1229 SDValue
1230 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1231  SDLoc dl(Op);
1232  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1233  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1234  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1235  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1236 }
1237 
1238 std::string NVPTXTargetLowering::getPrototype(
1239  const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1240  const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
1241  ImmutableCallSite CS) const {
1242  auto PtrVT = getPointerTy(DL);
1243 
1244  bool isABI = (STI.getSmVersion() >= 20);
1245  assert(isABI && "Non-ABI compilation is not supported");
1246  if (!isABI)
1247  return "";
1248 
1249  std::stringstream O;
1250  O << "prototype_" << uniqueCallSite << " : .callprototype ";
1251 
1252  if (retTy->getTypeID() == Type::VoidTyID) {
1253  O << "()";
1254  } else {
1255  O << "(";
1256  if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1257  unsigned size = 0;
1258  if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1259  size = ITy->getBitWidth();
1260  } else {
1261  assert(retTy->isFloatingPointTy() &&
1262  "Floating point type expected here");
1263  size = retTy->getPrimitiveSizeInBits();
1264  }
1265  // PTX ABI requires all scalar return values to be at least 32
1266  // bits in size. fp16 normally uses .b16 as its storage type in
1267  // PTX, so its size must be adjusted here, too.
1268  if (size < 32)
1269  size = 32;
1270 
1271  O << ".param .b" << size << " _";
1272  } else if (isa<PointerType>(retTy)) {
1273  O << ".param .b" << PtrVT.getSizeInBits() << " _";
1274  } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
1275  auto &DL = CS.getCalledFunction()->getParent()->getDataLayout();
1276  O << ".param .align " << retAlignment << " .b8 _["
1277  << DL.getTypeAllocSize(retTy) << "]";
1278  } else {
1279  llvm_unreachable("Unknown return type");
1280  }
1281  O << ") ";
1282  }
1283  O << "_ (";
1284 
1285  bool first = true;
1286 
1287  unsigned OIdx = 0;
1288  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1289  Type *Ty = Args[i].Ty;
1290  if (!first) {
1291  O << ", ";
1292  }
1293  first = false;
1294 
1295  if (!Outs[OIdx].Flags.isByVal()) {
1296  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1297  unsigned align = 0;
1298  const CallInst *CallI = cast<CallInst>(CS.getInstruction());
1299  // +1 because index 0 is reserved for return type alignment
1300  if (!getAlign(*CallI, i + 1, align))
1301  align = DL.getABITypeAlignment(Ty);
1302  unsigned sz = DL.getTypeAllocSize(Ty);
1303  O << ".param .align " << align << " .b8 ";
1304  O << "_";
1305  O << "[" << sz << "]";
1306  // update the index for Outs
1307  SmallVector<EVT, 16> vtparts;
1308  ComputeValueVTs(*this, DL, Ty, vtparts);
1309  if (unsigned len = vtparts.size())
1310  OIdx += len - 1;
1311  continue;
1312  }
1313  // i8 types in IR will be i16 types in SDAG
1314  assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1315  (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1316  "type mismatch between callee prototype and arguments");
1317  // scalar type
1318  unsigned sz = 0;
1319  if (isa<IntegerType>(Ty)) {
1320  sz = cast<IntegerType>(Ty)->getBitWidth();
1321  if (sz < 32)
1322  sz = 32;
1323  } else if (isa<PointerType>(Ty)) {
1324  sz = PtrVT.getSizeInBits();
1325  } else if (Ty->isHalfTy())
1326  // PTX ABI requires all scalar parameters to be at least 32
1327  // bits in size. fp16 normally uses .b16 as its storage type
1328  // in PTX, so its size must be adjusted here, too.
1329  sz = 32;
1330  else
1331  sz = Ty->getPrimitiveSizeInBits();
1332  O << ".param .b" << sz << " ";
1333  O << "_";
1334  continue;
1335  }
1336  auto *PTy = dyn_cast<PointerType>(Ty);
1337  assert(PTy && "Param with byval attribute should be a pointer type");
1338  Type *ETy = PTy->getElementType();
1339 
1340  unsigned align = Outs[OIdx].Flags.getByValAlign();
1341  unsigned sz = DL.getTypeAllocSize(ETy);
1342  O << ".param .align " << align << " .b8 ";
1343  O << "_";
1344  O << "[" << sz << "]";
1345  }
1346  O << ");";
1347  return O.str();
1348 }
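// Example output (illustrative only): for an indirect call with C signature
// `float f(float, float)`, getPrototype produces a string of the form
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);
// where the index comes from uniqueCallSite and each scalar is widened to at
// least .b32 as required by the PTX ABI.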
1349 
1350 unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1351  ImmutableCallSite CS,
1352  Type *Ty, unsigned Idx,
1353  const DataLayout &DL) const {
1354  if (!CS) {
1355  // CallSite is zero, fall back to ABI type alignment
1356  return DL.getABITypeAlignment(Ty);
1357  }
1358 
1359  unsigned Align = 0;
1360  const Value *DirectCallee = CS.getCalledFunction();
1361 
1362  if (!DirectCallee) {
1363  // We don't have a direct function symbol, but that may be because of
1364  // constant cast instructions in the call.
1365  const Instruction *CalleeI = CS.getInstruction();
1366  assert(CalleeI && "Call target is not a function or derived value?");
1367 
1368  // With bitcast'd call targets, the instruction will be the call
1369  if (isa<CallInst>(CalleeI)) {
1370  // Check if we have call alignment metadata
1371  if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
1372  return Align;
1373 
1374  const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
1375  // Ignore any bitcast instructions
1376  while (isa<ConstantExpr>(CalleeV)) {
1377  const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1378  if (!CE->isCast())
1379  break;
1380  // Look through the bitcast
1381  CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1382  }
1383 
1384  // We have now looked past all of the bitcasts. Do we finally have a
1385  // Function?
1386  if (isa<Function>(CalleeV))
1387  DirectCallee = CalleeV;
1388  }
1389  }
1390 
1391  // Check for function alignment information if we found that the
1392  // ultimate target is a Function
1393  if (DirectCallee)
1394  if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
1395  return Align;
1396 
1397  // Call is indirect or alignment information is not available, fall back to
1398  // the ABI type alignment
1399  return DL.getABITypeAlignment(Ty);
1400 }
1401 
1402 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1403  SmallVectorImpl<SDValue> &InVals) const {
1404  SelectionDAG &DAG = CLI.DAG;
1405  SDLoc dl = CLI.DL;
1406  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1407  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1408  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1409  SDValue Chain = CLI.Chain;
1410  SDValue Callee = CLI.Callee;
1411  bool &isTailCall = CLI.IsTailCall;
1412  ArgListTy &Args = CLI.getArgs();
1413  Type *RetTy = CLI.RetTy;
1414  ImmutableCallSite CS = CLI.CS;
1415  const DataLayout &DL = DAG.getDataLayout();
1416 
1417  bool isABI = (STI.getSmVersion() >= 20);
1418  assert(isABI && "Non-ABI compilation is not supported");
1419  if (!isABI)
1420  return Chain;
1421 
1422  SDValue tempChain = Chain;
1423  Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
1424  SDValue InFlag = Chain.getValue(1);
1425 
1426  unsigned paramCount = 0;
1427  // Args.size() and Outs.size() need not match.
1428  // Outs.size() will be larger
1429  // * if there is an aggregate argument with multiple fields (each field
1430  // showing up separately in Outs)
1431  // * if there is a vector argument with more than typical vector-length
1432  // elements (generally if more than 4) where each vector element is
1433  // individually present in Outs.
1434  // So a different index should be used for indexing into Outs/OutVals.
1435  // See similar issue in LowerFormalArguments.
1436  unsigned OIdx = 0;
1437  // Declare the .param or .reg spaces needed to pass values
1438  // to the function
1439  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1440  EVT VT = Outs[OIdx].VT;
1441  Type *Ty = Args[i].Ty;
1442 
1443  if (!Outs[OIdx].Flags.isByVal()) {
1444  SmallVector<EVT, 16> VTs;
1445  SmallVector<uint64_t, 16> Offsets;
1446  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
1447  unsigned ArgAlign =
1448  getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
1449  unsigned AllocSize = DL.getTypeAllocSize(Ty);
1450  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1451  bool NeedAlign; // Does argument declaration specify alignment?
1452  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1453  // declare .param .align <align> .b8 .param<n>[<size>];
1454  SDValue DeclareParamOps[] = {
1455  Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1456  DAG.getConstant(paramCount, dl, MVT::i32),
1457  DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
1458  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1459  DeclareParamOps);
1460  NeedAlign = true;
1461  } else {
1462  // declare .param .b<size> .param<n>;
1463  if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
1464  // PTX ABI requires integral types to be at least 32 bits in
1465  // size. FP16 is loaded/stored using i16, so it's handled
1466  // here as well.
1467  AllocSize = 4;
1468  }
1469  SDValue DeclareScalarParamOps[] = {
1470  Chain, DAG.getConstant(paramCount, dl, MVT::i32),
1471  DAG.getConstant(AllocSize * 8, dl, MVT::i32),
1472  DAG.getConstant(0, dl, MVT::i32), InFlag};
1473  Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1474  DeclareScalarParamOps);
1475  NeedAlign = false;
1476  }
1477  InFlag = Chain.getValue(1);
1478 
1479  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1480  // than 32-bits are sign extended or zero extended, depending on
1481  // whether they are signed or unsigned types. This case applies
1482  // only to scalar parameters and not to aggregate values.
1483  bool ExtendIntegerParam =
1484  Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1485 
1486  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
1487  SmallVector<SDValue, 6> StoreOperands;
1488  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1489  // New store.
1490  if (VectorInfo[j] & PVF_FIRST) {
1491  assert(StoreOperands.empty() && "Unfinished preceding store.");
1492  StoreOperands.push_back(Chain);
1493  StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1494  StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
1495  }
1496 
1497  EVT EltVT = VTs[j];
1498  SDValue StVal = OutVals[OIdx];
1499  if (ExtendIntegerParam) {
1500  assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1501  // zext/sext to i32
1502  StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1503  : ISD::ZERO_EXTEND,
1504  dl, MVT::i32, StVal);
1505  } else if (EltVT.getSizeInBits() < 16) {
1506  // Use 16-bit registers for small stores as it's the
1507  // smallest general purpose register size supported by NVPTX.
1508  StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1509  }
1510 
1511  // Record the value to store.
1512  StoreOperands.push_back(StVal);
1513 
1514  if (VectorInfo[j] & PVF_LAST) {
1515  unsigned NumElts = StoreOperands.size() - 3;
1516  NVPTXISD::NodeType Op;
1517  switch (NumElts) {
1518  case 1:
1519  Op = NVPTXISD::StoreParam;
1520  break;
1521  case 2:
1522  Op = NVPTXISD::StoreParamV2;
1523  break;
1524  case 4:
1525  Op = NVPTXISD::StoreParamV4;
1526  break;
1527  default:
1528  llvm_unreachable("Invalid vector info.");
1529  }
1530 
1531  StoreOperands.push_back(InFlag);
1532 
1533  // Adjust type of the store op if we've extended the scalar
1534  // return value.
1535  EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
1536  unsigned EltAlign =
1537  NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
1538 
1539  Chain = DAG.getMemIntrinsicNode(
1540  Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1541  TheStoreType, MachinePointerInfo(), EltAlign,
1542  MachineMemOperand::MOStore);
1543  InFlag = Chain.getValue(1);
1544 
1545  // Cleanup.
1546  StoreOperands.clear();
1547  }
1548  ++OIdx;
1549  }
1550  assert(StoreOperands.empty() && "Unfinished parameter store.");
1551  if (VTs.size() > 0)
1552  --OIdx;
1553  ++paramCount;
1554  continue;
1555  }
1556 
1557  // ByVal arguments
1558  SmallVector<EVT, 16> VTs;
1559  SmallVector<uint64_t, 16> Offsets;
1560  auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1561  assert(PTy && "Type of a byval parameter should be pointer");
1562  ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1563 
1564  // declare .param .align <align> .b8 .param<n>[<size>];
1565  unsigned sz = Outs[OIdx].Flags.getByValSize();
1566  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1567  unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1568  // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1569  // so we don't need to worry about natural alignment or not.
1570  // See TargetLowering::LowerCallTo().
1571 
1572  // Enforce minimum alignment of 4 to work around ptxas miscompile
1573  // for sm_50+. See corresponding alignment adjustment in
1574  // emitFunctionParamList() for details.
1575  if (ArgAlign < 4)
1576  ArgAlign = 4;
1577  SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1578  DAG.getConstant(paramCount, dl, MVT::i32),
1579  DAG.getConstant(sz, dl, MVT::i32), InFlag};
1580  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1581  DeclareParamOps);
1582  InFlag = Chain.getValue(1);
1583  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1584  EVT elemtype = VTs[j];
1585  int curOffset = Offsets[j];
1586  unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1587  auto PtrVT = getPointerTy(DL);
1588  SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1589  DAG.getConstant(curOffset, dl, PtrVT));
1590  SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1591  MachinePointerInfo(), PartAlign);
1592  if (elemtype.getSizeInBits() < 16) {
1593  theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1594  }
1595  SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1596  SDValue CopyParamOps[] = { Chain,
1597  DAG.getConstant(paramCount, dl, MVT::i32),
1598  DAG.getConstant(curOffset, dl, MVT::i32),
1599  theVal, InFlag };
1600  Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1601  CopyParamOps, elemtype,
1602  MachinePointerInfo(), /* Align */ 0,
1603  MachineMemOperand::MOStore);
1604 
1605  InFlag = Chain.getValue(1);
1606  }
1607  ++paramCount;
1608  }
1609 
1610  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1611  unsigned retAlignment = 0;
1612 
1613  // Handle Result
1614  if (Ins.size() > 0) {
1615  SmallVector<EVT, 16> resvtparts;
1616  ComputeValueVTs(*this, DL, RetTy, resvtparts);
1617 
1618  // Declare
1619  // .param .align 16 .b8 retval0[<size-in-bytes>], or
1620  // .param .b<size-in-bits> retval0
1621  unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1622  // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1623  // these three types to match the logic in
1624  // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1625  // Plus, this behavior is consistent with nvcc's.
1626  if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1627  (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1628  // Scalar needs to be at least 32bit wide
1629  if (resultsz < 32)
1630  resultsz = 32;
1631  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1632  SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1633  DAG.getConstant(resultsz, dl, MVT::i32),
1634  DAG.getConstant(0, dl, MVT::i32), InFlag };
1635  Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1636  DeclareRetOps);
1637  InFlag = Chain.getValue(1);
1638  } else {
1639  retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1640  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1641  SDValue DeclareRetOps[] = { Chain,
1642  DAG.getConstant(retAlignment, dl, MVT::i32),
1643  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1644  DAG.getConstant(0, dl, MVT::i32), InFlag };
1645  Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1646  DeclareRetOps);
1647  InFlag = Chain.getValue(1);
1648  }
1649  }
1650 
1651  if (!Func) {
1652  // This is the indirect function call case: PTX requires a prototype of the
1653  // form
1654  // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1655  // to be emitted, and the label has to be used as the last arg of the call
1656  // instruction.
1657  // The prototype is embedded in a string and passed as the operand of a
1658  // CallPrototype SDNode, which prints out as the value of the string.
1659  SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1660  std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
1661  const char *ProtoStr =
1662  nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1663  SDValue ProtoOps[] = {
1664  Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1665  };
1666  Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1667  InFlag = Chain.getValue(1);
1668  }
1669  // Op to just print "call"
1670  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1671  SDValue PrintCallOps[] = {
1672  Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1673  };
1674  // We model convergent calls as separate opcodes.
1675  unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
1676  if (CLI.IsConvergent)
1677  Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1678  : NVPTXISD::PrintConvergentCall;
1679  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1680  InFlag = Chain.getValue(1);
1681 
1682  // Ops to print out the function name
1683  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1684  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1685  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1686  InFlag = Chain.getValue(1);
1687 
1688  // Ops to print out the param list
1689  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1690  SDValue CallArgBeginOps[] = { Chain, InFlag };
1691  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1692  CallArgBeginOps);
1693  InFlag = Chain.getValue(1);
1694 
1695  for (unsigned i = 0, e = paramCount; i != e; ++i) {
1696  unsigned opcode;
1697  if (i == (e - 1))
1698  opcode = NVPTXISD::LastCallArg;
1699  else
1700  opcode = NVPTXISD::CallArg;
1701  SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1702  SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1703  DAG.getConstant(i, dl, MVT::i32), InFlag };
1704  Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1705  InFlag = Chain.getValue(1);
1706  }
1707  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1708  SDValue CallArgEndOps[] = { Chain,
1709  DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
1710  InFlag };
1711  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1712  InFlag = Chain.getValue(1);
1713 
1714  if (!Func) {
1715  SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1716  SDValue PrototypeOps[] = { Chain,
1717  DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1718  InFlag };
1719  Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1720  InFlag = Chain.getValue(1);
1721  }
1722 
1723  // Generate loads from param memory/moves from registers for result
1724  if (Ins.size() > 0) {
1725  SmallVector<EVT, 16> VTs;
1726  SmallVector<uint64_t, 16> Offsets;
1727  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1728  assert(VTs.size() == Ins.size() && "Bad value decomposition");
1729 
1730  unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1731  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1732 
1733  SmallVector<EVT, 6> LoadVTs;
1734  int VecIdx = -1; // Index of the first element of the vector.
1735 
1736  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1737  // 32-bits are sign extended or zero extended, depending on whether
1738  // they are signed or unsigned types.
1739  bool ExtendIntegerRetVal =
1740  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1741 
1742  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1743  bool needTruncate = false;
1744  EVT TheLoadType = VTs[i];
1745  EVT EltType = Ins[i].VT;
1746  unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1747  if (ExtendIntegerRetVal) {
1748  TheLoadType = MVT::i32;
1749  EltType = MVT::i32;
1750  needTruncate = true;
1751  } else if (TheLoadType.getSizeInBits() < 16) {
1752  if (VTs[i].isInteger())
1753  needTruncate = true;
1754  EltType = MVT::i16;
1755  }
1756 
1757  // Record index of the very first element of the vector.
1758  if (VectorInfo[i] & PVF_FIRST) {
1759  assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1760  VecIdx = i;
1761  }
1762 
1763  LoadVTs.push_back(EltType);
1764 
1765  if (VectorInfo[i] & PVF_LAST) {
1766  unsigned NumElts = LoadVTs.size();
1767  LoadVTs.push_back(MVT::Other);
1768  LoadVTs.push_back(MVT::Glue);
1769  NVPTXISD::NodeType Op;
1770  switch (NumElts) {
1771  case 1:
1772  Op = NVPTXISD::LoadParam;
1773  break;
1774  case 2:
1775  Op = NVPTXISD::LoadParamV2;
1776  break;
1777  case 4:
1778  Op = NVPTXISD::LoadParamV4;
1779  break;
1780  default:
1781  llvm_unreachable("Invalid vector info.");
1782  }
1783 
1784  SDValue LoadOperands[] = {
1785  Chain, DAG.getConstant(1, dl, MVT::i32),
1786  DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1787  SDValue RetVal = DAG.getMemIntrinsicNode(
1788  Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1789  MachinePointerInfo(), EltAlign,
1790  MachineMemOperand::MOLoad);
1791 
1792  for (unsigned j = 0; j < NumElts; ++j) {
1793  SDValue Ret = RetVal.getValue(j);
1794  if (needTruncate)
1795  Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret);
1796  InVals.push_back(Ret);
1797  }
1798  Chain = RetVal.getValue(NumElts);
1799  InFlag = RetVal.getValue(NumElts + 1);
1800 
1801  // Cleanup
1802  VecIdx = -1;
1803  LoadVTs.clear();
1804  }
1805  }
1806  }
1807 
1808  Chain = DAG.getCALLSEQ_END(Chain,
1809  DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1810  DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1811  true),
1812  InFlag, dl);
1813  uniqueCallSite++;
1814 
1815  // set isTailCall to false for now, until we figure out how to express
1816  // tail call optimization in PTX
1817  isTailCall = false;
1818  return Chain;
1819 }
1820 
1821 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1822 // (see LegalizeDAG.cpp). This is slow and uses local memory.
1823 // We use extract/insert/build-vector instead, just as LegalizeOp() did in LLVM 2.5.
1824 SDValue
1825 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1826  SDNode *Node = Op.getNode();
1827  SDLoc dl(Node);
1828  SmallVector<SDValue, 8> Ops;
1829  unsigned NumOperands = Node->getNumOperands();
1830  for (unsigned i = 0; i < NumOperands; ++i) {
1831  SDValue SubOp = Node->getOperand(i);
1832  EVT VVT = SubOp.getNode()->getValueType(0);
1833  EVT EltVT = VVT.getVectorElementType();
1834  unsigned NumSubElem = VVT.getVectorNumElements();
1835  for (unsigned j = 0; j < NumSubElem; ++j) {
1836  Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1837  DAG.getIntPtrConstant(j, dl)));
1838  }
1839  }
1840  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1841 }
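// Illustrative sketch (not part of the original source): for a hypothetical
// concat_vectors of two v2f16 operands A and B, the loop above builds
//   BUILD_VECTOR (EXTRACT_VECTOR_ELT A, 0), (EXTRACT_VECTOR_ELT A, 1),
//                (EXTRACT_VECTOR_ELT B, 0), (EXTRACT_VECTOR_ELT B, 1)
// i.e. a single v4f16 BUILD_VECTOR, instead of spilling both operands to
// local memory and reloading the concatenated result.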
1842 
1843 // We can initialize a constant f16x2 with a single .b32 move. Normally it
1844 // would get lowered as two constant loads and a vector-packing move.
1845 // mov.b16 %h1, 0x4000;
1846 // mov.b16 %h2, 0x3C00;
1847 // mov.b32 %hh2, {%h2, %h1};
1848 // Instead we want just a constant move:
1849 // mov.b32 %hh2, 0x40003C00
1850 //
1851 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1852 // generates good SASS in both cases.
1853 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1854  SelectionDAG &DAG) const {
1855  //return Op;
1856  if (!(Op->getValueType(0) == MVT::v2f16 &&
1857  isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1858  isa<ConstantFPSDNode>(Op->getOperand(1))))
1859  return Op;
1860 
1861  APInt E0 =
1862  cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1863  APInt E1 =
1864  cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
1865  SDValue Const =
1866  DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1867  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1868 }
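// Worked example (illustrative, assuming IEEE-754 half encodings): for the
// constant vector <2 x half> <1.0, 2.0>, E0 = 0x3C00 (1.0) and E1 = 0x4000
// (2.0), so the packed immediate is (0x4000 << 16) | 0x3C00 = 0x40003C00,
// which is exactly the constant shown in the single mov.b32 above.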
1869 
1870 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1871  SelectionDAG &DAG) const {
1872  SDValue Index = Op->getOperand(1);
1873  // Constant index will be matched by tablegen.
1874  if (isa<ConstantSDNode>(Index.getNode()))
1875  return Op;
1876 
1877  // Extract individual elements and select one of them.
1878  SDValue Vector = Op->getOperand(0);
1879  EVT VectorVT = Vector.getValueType();
1880  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1881  EVT EltVT = VectorVT.getVectorElementType();
1882 
1883  SDLoc dl(Op.getNode());
1884  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1885  DAG.getIntPtrConstant(0, dl));
1886  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1887  DAG.getIntPtrConstant(1, dl));
1888  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
1889  ISD::SETEQ);
1890 }
1891 
1892 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
1893 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
1894 /// amount, or
1895 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
1896 /// amount.
1897 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1898  SelectionDAG &DAG) const {
1899  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1900  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1901 
1902  EVT VT = Op.getValueType();
1903  unsigned VTBits = VT.getSizeInBits();
1904  SDLoc dl(Op);
1905  SDValue ShOpLo = Op.getOperand(0);
1906  SDValue ShOpHi = Op.getOperand(1);
1907  SDValue ShAmt = Op.getOperand(2);
1908  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1909 
1910  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1911  // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf' instruction.
1912  // {dHi, dLo} = {aHi, aLo} >> Amt
1913  // dHi = aHi >> Amt
1914  // dLo = shf.r.clamp aLo, aHi, Amt
1915 
1916  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1917  SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1918  ShAmt);
1919 
1920  SDValue Ops[2] = { Lo, Hi };
1921  return DAG.getMergeValues(Ops, dl);
1922  }
1923  else {
1924  // {dHi, dLo} = {aHi, aLo} >> Amt
1925  // - if (Amt>=size) then
1926  // dLo = aHi >> (Amt-size)
1927  // dHi = aHi >> Amt (this is either all 0 or all 1)
1928  // else
1929  // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1930  // dHi = aHi >> Amt
1931 
1932  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1933  DAG.getConstant(VTBits, dl, MVT::i32),
1934  ShAmt);
1935  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
1936  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1937  DAG.getConstant(VTBits, dl, MVT::i32));
1938  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
1939  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1940  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
1941 
1942  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
1943  DAG.getConstant(VTBits, dl, MVT::i32),
1944  ISD::SETGE);
1945  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1946  SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
1947 
1948  SDValue Ops[2] = { Lo, Hi };
1949  return DAG.getMergeValues(Ops, dl);
1950  }
1951 }
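// Worked example (illustrative): splitting a 64-bit logical right shift into
// 32-bit halves with Amt = 8 takes the generic path's "else" branch:
//   dLo = (aLo >> 8) | (aHi << 24)
//   dHi =  aHi >> 8
// With Amt = 40 (>= 32) the "then" branch applies instead:
//   dLo = aHi >> (40 - 32) = aHi >> 8
//   dHi = aHi >> 40 (all zeros for SRL, all sign bits for SRA)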
1952 
1953 /// LowerShiftLeftParts - Lower SHL_PARTS, which
1954 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
1955 /// amount, or
1956 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
1957 /// amount.
1958 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
1959  SelectionDAG &DAG) const {
1960  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1961  assert(Op.getOpcode() == ISD::SHL_PARTS);
1962 
1963  EVT VT = Op.getValueType();
1964  unsigned VTBits = VT.getSizeInBits();
1965  SDLoc dl(Op);
1966  SDValue ShOpLo = Op.getOperand(0);
1967  SDValue ShOpHi = Op.getOperand(1);
1968  SDValue ShAmt = Op.getOperand(2);
1969 
1970  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1971  // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf' instruction.
1972  // {dHi, dLo} = {aHi, aLo} << Amt
1973  // dHi = shf.l.clamp aLo, aHi, Amt
1974  // dLo = aLo << Amt
1975 
1976  SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
1977  ShAmt);
1978  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
1979 
1980  SDValue Ops[2] = { Lo, Hi };
1981  return DAG.getMergeValues(Ops, dl);
1982  }
1983  else {
1984  // {dHi, dLo} = {aHi, aLo} << Amt
1985  // - if (Amt>=size) then
1986  // dLo = aLo << Amt (all 0)
1987  // dHi = aLo << (Amt-size)
1988  // else
1989  // dLo = aLo << Amt
1990  // dHi = (aHi << Amt) | (aLo >> (size-Amt))
1991 
1992  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1993  DAG.getConstant(VTBits, dl, MVT::i32),
1994  ShAmt);
1995  SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
1996  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1997  DAG.getConstant(VTBits, dl, MVT::i32));
1998  SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
1999  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2000  SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2001 
2002  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2003  DAG.getConstant(VTBits, dl, MVT::i32),
2004  ISD::SETGE);
2005  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2006  SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2007 
2008  SDValue Ops[2] = { Lo, Hi };
2009  return DAG.getMergeValues(Ops, dl);
2010  }
2011 }
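// Illustrative sketch (assumed PTX, for orientation only): on sm_35 and later
// the funnel-shift path above corresponds to a two-instruction sequence such as
//   shf.l.clamp.b32 %hi, %lo_in, %hi_in, %amt;
//   shl.b32         %lo, %lo_in, %amt;
// matching the FUN_SHFL_CLAMP + ISD::SHL pair built in LowerShiftLeftParts
// (and, symmetrically, FUN_SHFR_CLAMP + SRL/SRA in LowerShiftRightParts).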
2012 
2013 SDValue
2014 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2015  switch (Op.getOpcode()) {
2016  case ISD::RETURNADDR:
2017  return SDValue();
2018  case ISD::FRAMEADDR:
2019  return SDValue();
2020  case ISD::GlobalAddress:
2021  return LowerGlobalAddress(Op, DAG);
2022  case ISD::INTRINSIC_W_CHAIN:
2023  return Op;
2024  case ISD::BUILD_VECTOR:
2025  return LowerBUILD_VECTOR(Op, DAG);
2026  case ISD::EXTRACT_SUBVECTOR:
2027  return Op;
2028  case ISD::EXTRACT_VECTOR_ELT:
2029  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2030  case ISD::CONCAT_VECTORS:
2031  return LowerCONCAT_VECTORS(Op, DAG);
2032  case ISD::STORE:
2033  return LowerSTORE(Op, DAG);
2034  case ISD::LOAD:
2035  return LowerLOAD(Op, DAG);
2036  case ISD::SHL_PARTS:
2037  return LowerShiftLeftParts(Op, DAG);
2038  case ISD::SRA_PARTS:
2039  case ISD::SRL_PARTS:
2040  return LowerShiftRightParts(Op, DAG);
2041  case ISD::SELECT:
2042  return LowerSelect(Op, DAG);
2043  default:
2044  llvm_unreachable("Custom lowering not defined for operation");
2045  }
2046 }
2047 
2048 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2049  SDValue Op0 = Op->getOperand(0);
2050  SDValue Op1 = Op->getOperand(1);
2051  SDValue Op2 = Op->getOperand(2);
2052  SDLoc DL(Op.getNode());
2053 
2054  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2055 
2056  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2057  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2058  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2059  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2060 
2061  return Trunc;
2062 }
2063 
2064 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2065  if (Op.getValueType() == MVT::i1)
2066  return LowerLOADi1(Op, DAG);
2067 
2068  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
2069  // loads and have to handle them here.
2070  if (Op.getValueType() == MVT::v2f16) {
2071  LoadSDNode *Load = cast<LoadSDNode>(Op);
2072  EVT MemVT = Load->getMemoryVT();
2073  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2074  Load->getAddressSpace(), Load->getAlignment())) {
2075  SDValue Ops[2];
2076  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2077  return DAG.getMergeValues(Ops, SDLoc(Op));
2078  }
2079  }
2080 
2081  return SDValue();
2082 }
2083 
2084 // v = ld i1* addr
2085 // =>
2086 // v1 = ld i8* addr (-> i16)
2087 // v = trunc i16 to i1
2088 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2089  SDNode *Node = Op.getNode();
2090  LoadSDNode *LD = cast<LoadSDNode>(Node);
2091  SDLoc dl(Node);
2092  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2093  assert(Node->getValueType(0) == MVT::i1 &&
2094  "Custom lowering for i1 load only");
2095  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2096  LD->getPointerInfo(), LD->getAlignment(),
2097  LD->getMemOperand()->getFlags());
2098  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2099  // The legalizer (the caller) is expecting two values from the legalized
2100  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2101  // in LegalizeDAG.cpp which also uses MergeValues.
2102  SDValue Ops[] = { result, LD->getChain() };
2103  return DAG.getMergeValues(Ops, dl);
2104 }
2105 
2106 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2107  StoreSDNode *Store = cast<StoreSDNode>(Op);
2108  EVT VT = Store->getMemoryVT();
2109 
2110  if (VT == MVT::i1)
2111  return LowerSTOREi1(Op, DAG);
2112 
2113  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
2114  // stores and have to handle them here.
2115  if (VT == MVT::v2f16 &&
2116  !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
2117  Store->getAddressSpace(), Store->getAlignment()))
2118  return expandUnalignedStore(Store, DAG);
2119 
2120  if (VT.isVector())
2121  return LowerSTOREVector(Op, DAG);
2122 
2123  return SDValue();
2124 }
2125 
2126 SDValue
2127 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2128  SDNode *N = Op.getNode();
2129  SDValue Val = N->getOperand(1);
2130  SDLoc DL(N);
2131  EVT ValVT = Val.getValueType();
2132 
2133  if (ValVT.isVector()) {
2134  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2135  // legal. We can (and should) split that into 2 stores of <2 x double> here
2136  // but I'm leaving that as a TODO for now.
2137  if (!ValVT.isSimple())
2138  return SDValue();
2139  switch (ValVT.getSimpleVT().SimpleTy) {
2140  default:
2141  return SDValue();
2142  case MVT::v2i8:
2143  case MVT::v2i16:
2144  case MVT::v2i32:
2145  case MVT::v2i64:
2146  case MVT::v2f16:
2147  case MVT::v2f32:
2148  case MVT::v2f64:
2149  case MVT::v4i8:
2150  case MVT::v4i16:
2151  case MVT::v4i32:
2152  case MVT::v4f16:
2153  case MVT::v4f32:
2154  case MVT::v8f16: // <4 x f16x2>
2155  // This is a "native" vector type
2156  break;
2157  }
2158 
2159  MemSDNode *MemSD = cast<MemSDNode>(N);
2160  const DataLayout &TD = DAG.getDataLayout();
2161 
2162  unsigned Align = MemSD->getAlignment();
2163  unsigned PrefAlign =
2164  TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
2165  if (Align < PrefAlign) {
2166  // This store is not sufficiently aligned, so bail out and let this vector
2167  // store be scalarized. Note that we may still be able to emit smaller
2168  // vector stores. For example, if we are storing a <4 x float> with an
2169  // alignment of 8, this check will fail but the legalizer will try again
2170  // with 2 x <2 x float>, which will succeed with an alignment of 8.
2171  return SDValue();
2172  }
2173 
2174  unsigned Opcode = 0;
2175  EVT EltVT = ValVT.getVectorElementType();
2176  unsigned NumElts = ValVT.getVectorNumElements();
2177 
2178  // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2179  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2180  // stored type to i16 and propagate the "real" type as the memory type.
2181  bool NeedExt = false;
2182  if (EltVT.getSizeInBits() < 16)
2183  NeedExt = true;
2184 
2185  bool StoreF16x2 = false;
2186  switch (NumElts) {
2187  default:
2188  return SDValue();
2189  case 2:
2190  Opcode = NVPTXISD::StoreV2;
2191  break;
2192  case 4:
2193  Opcode = NVPTXISD::StoreV4;
2194  break;
2195  case 8:
2196  // v8f16 is a special case. PTX doesn't have an st.v8.f16
2197  // instruction. Instead, we split the vector into v2f16 chunks and
2198  // store them with st.v4.b32.
2199  assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2200  Opcode = NVPTXISD::StoreV4;
2201  StoreF16x2 = true;
2202  break;
2203  }
2204 
2205  SmallVector<SDValue, 8> Ops;
2206 
2207  // First is the chain
2208  Ops.push_back(N->getOperand(0));
2209 
2210  if (StoreF16x2) {
2211  // Combine f16,f16 -> v2f16
2212  NumElts /= 2;
2213  for (unsigned i = 0; i < NumElts; ++i) {
2214  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2215  DAG.getIntPtrConstant(i * 2, DL));
2216  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2217  DAG.getIntPtrConstant(i * 2 + 1, DL));
2218  SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2219  Ops.push_back(V2);
2220  }
2221  } else {
2222  // Then the split values
2223  for (unsigned i = 0; i < NumElts; ++i) {
2224  SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2225  DAG.getIntPtrConstant(i, DL));
2226  if (NeedExt)
2227  ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2228  Ops.push_back(ExtVal);
2229  }
2230  }
2231 
2232  // Then any remaining arguments
2233  Ops.append(N->op_begin() + 2, N->op_end());
2234 
2235  SDValue NewSt =
2236  DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2237  MemSD->getMemoryVT(), MemSD->getMemOperand());
2238 
2239  // return DCI.CombineTo(N, NewSt, true);
2240  return NewSt;
2241  }
2242 
2243  return SDValue();
2244 }
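// Illustrative sketch (assumed PTX, for orientation only): with the v8f16
// special case above, a sufficiently aligned store of <8 x half> is emitted
// as one vector store of four packed .b32 registers, e.g.
//   st.v4.b32 [%rd1], {%hh0, %hh1, %hh2, %hh3};
// where each %hh register holds one of the v2f16 chunks built by the
// StoreF16x2 path.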
2245 
2246 // st i1 v, addr
2247 // =>
2248 // v1 = zxt v to i16
2249 // st.u8 i16, addr
2250 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2251  SDNode *Node = Op.getNode();
2252  SDLoc dl(Node);
2253  StoreSDNode *ST = cast<StoreSDNode>(Node);
2254  SDValue Tmp1 = ST->getChain();
2255  SDValue Tmp2 = ST->getBasePtr();
2256  SDValue Tmp3 = ST->getValue();
2257  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2258  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2259  SDValue Result =
2260  DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2261  ST->getAlignment(), ST->getMemOperand()->getFlags());
2262  return Result;
2263 }
2264 
2265 SDValue
2266 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2267  std::string ParamSym;
2268  raw_string_ostream ParamStr(ParamSym);
2269 
2270  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2271  ParamStr.flush();
2272 
2273  std::string *SavedStr =
2274  nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2275  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2276 }
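// Usage note (illustrative): for a machine function named "foo",
// getParamSymbol(DAG, 2, PtrVT) yields the external symbol "foo_param_2",
// which is assumed to line up with the .param names the NVPTX AsmPrinter
// emits for the function's formal parameters.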
2277 
2278 // Check to see if the kernel argument is image*_t or sampler_t
2279 
2280 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2281  static const char *const specialTypes[] = { "struct._image2d_t",
2282  "struct._image3d_t",
2283  "struct._sampler_t" };
2284 
2285  Type *Ty = arg->getType();
2286  auto *PTy = dyn_cast<PointerType>(Ty);
2287 
2288  if (!PTy)
2289  return false;
2290 
2291  if (!context)
2292  return false;
2293 
2294  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2295  if (!STy || STy->isLiteral())
2296  return false;
2297 
2298  return std::find(std::begin(specialTypes), std::end(specialTypes),
2299  STy->getName()) != std::end(specialTypes);
2300 }
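// Example (illustrative): an OpenCL kernel parameter declared as image2d_t
// typically reaches the IR as a pointer to the named struct
// %struct._image2d_t, which this helper recognizes by comparing the struct's
// name against the specialTypes table above.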
2301 
2302 SDValue NVPTXTargetLowering::LowerFormalArguments(
2303  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2304  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2305  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2306  MachineFunction &MF = DAG.getMachineFunction();
2307  const DataLayout &DL = DAG.getDataLayout();
2308  auto PtrVT = getPointerTy(DAG.getDataLayout());
2309 
2310  const Function *F = &MF.getFunction();
2311  const AttributeList &PAL = F->getAttributes();
2312  const TargetLowering *TLI = STI.getTargetLowering();
2313 
2314  SDValue Root = DAG.getRoot();
2315  std::vector<SDValue> OutChains;
2316 
2317  bool isABI = (STI.getSmVersion() >= 20);
2318  assert(isABI && "Non-ABI compilation is not supported");
2319  if (!isABI)
2320  return Chain;
2321 
2322  std::vector<Type *> argTypes;
2323  std::vector<const Argument *> theArgs;
2324  for (const Argument &I : F->args()) {
2325  theArgs.push_back(&I);
2326  argTypes.push_back(I.getType());
2327  }
2328  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2329  // Ins.size() will be larger
2330  // * if there is an aggregate argument with multiple fields (each field
2331  // showing up separately in Ins)
2332  // * if there is a vector argument with more than typical vector-length
2333  // elements (generally if more than 4) where each vector element is
2334  // individually present in Ins.
2335  // So a different index should be used for indexing into Ins.
2336  // See similar issue in LowerCall.
2337  unsigned InsIdx = 0;
2338 
2339  int idx = 0;
2340  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2341  Type *Ty = argTypes[i];
2342 
2343  // If the kernel argument is image*_t or sampler_t, convert it to
2344  // an i32 constant holding the parameter position. This can later be
2345  // matched in the AsmPrinter to output the correct mangled name.
2346  if (isImageOrSamplerVal(
2347  theArgs[i],
2348  (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2349  : nullptr))) {
2350  assert(isKernelFunction(*F) &&
2351  "Only kernels can have image/sampler params");
2352  InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2353  continue;
2354  }
2355 
2356  if (theArgs[i]->use_empty()) {
2357  // argument is dead
2358  if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2359  SmallVector<EVT, 16> vtparts;
2360 
2361  ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2362  assert(vtparts.size() > 0 && "empty aggregate type not expected");
2363  for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2364  ++parti) {
2365  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2366  ++InsIdx;
2367  }
2368  if (vtparts.size() > 0)
2369  --InsIdx;
2370  continue;
2371  }
2372  if (Ty->isVectorTy()) {
2373  EVT ObjectVT = getValueType(DL, Ty);
2374  unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2375  for (unsigned parti = 0; parti < NumRegs; ++parti) {
2376  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2377  ++InsIdx;
2378  }
2379  if (NumRegs > 0)
2380  --InsIdx;
2381  continue;
2382  }
2383  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2384  continue;
2385  }
2386 
2387  // In the following cases, assign a node order of "idx+1"
2388  // to newly created nodes. The SDNodes for params have to
2389  // appear in the same order as their order of appearance
2390  // in the original function. "idx+1" holds that order.
2391  if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2392  bool aggregateIsPacked = false;
2393  if (StructType *STy = dyn_cast<StructType>(Ty))
2394  aggregateIsPacked = STy->isPacked();
2395 
2396  SmallVector<EVT, 16> VTs;
2397  SmallVector<uint64_t, 16> Offsets;
2398  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2399  assert(VTs.size() > 0 && "Unexpected empty type.");
2400  auto VectorInfo =
2401  VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
2402 
2403  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2404  int VecIdx = -1; // Index of the first element of the current vector.
2405  for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2406  if (VectorInfo[parti] & PVF_FIRST) {
2407  assert(VecIdx == -1 && "Orphaned vector.");
2408  VecIdx = parti;
2409  }
2410 
2411  // That's the last element of this load op.
2412  if (VectorInfo[parti] & PVF_LAST) {
2413  unsigned NumElts = parti - VecIdx + 1;
2414  EVT EltVT = VTs[parti];
2415  // i1 is loaded/stored as i8.
2416  EVT LoadVT = EltVT;
2417  if (EltVT == MVT::i1)
2418  LoadVT = MVT::i8;
2419  else if (EltVT == MVT::v2f16)
2420  // getLoad needs a vector type, but it can't handle
2421  // vectors which contain v2f16 elements. So we must load
2422  // using i32 here and then bitcast back.
2423  LoadVT = MVT::i32;
2424 
2425  EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2426  SDValue VecAddr =
2427  DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2428  DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2429  Value *srcValue = Constant::getNullValue(PointerType::get(
2430  EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2431  SDValue P =
2432  DAG.getLoad(VecVT, dl, Root, VecAddr,
2433  MachinePointerInfo(srcValue), aggregateIsPacked,
2434  MachineMemOperand::MODereferenceable |
2435  MachineMemOperand::MOInvariant);
2436  if (P.getNode())
2437  P.getNode()->setIROrder(idx + 1);
2438  for (unsigned j = 0; j < NumElts; ++j) {
2439  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2440  DAG.getIntPtrConstant(j, dl));
2441  // We've loaded i1 as an i8 and now must truncate it back to i1
2442  if (EltVT == MVT::i1)
2443  Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2444  // v2f16 was loaded as an i32. Now we must bitcast it back.
2445  else if (EltVT == MVT::v2f16)
2446  Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2447  // Extend the element if necessary (e.g. an i8 is loaded
2448  // into an i16 register)
2449  if (Ins[InsIdx].VT.isInteger() &&
2450  Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
2451  unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2452  : ISD::ZERO_EXTEND;
2453  Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2454  }
2455  InVals.push_back(Elt);
2456  }
2457 
2458  // Reset vector tracking state.
2459  VecIdx = -1;
2460  }
2461  ++InsIdx;
2462  }
2463  if (VTs.size() > 0)
2464  --InsIdx;
2465  continue;
2466  }
2467 
2468  // Param has the ByVal attribute.
2469  // Return MoveParam(param symbol).
2470  // Ideally, the param symbol could be returned directly,
2471  // but when the SDNode builder decides to use it in a CopyToReg(),
2472  // the machine instruction fails because TargetExternalSymbol
2473  // (not lowered) is target dependent, and CopyToReg assumes
2474  // the source is lowered.
2475  EVT ObjectVT = getValueType(DL, Ty);
2476  assert(ObjectVT == Ins[InsIdx].VT &&
2477  "Ins type did not match function type");
2478  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2479  SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2480  if (p.getNode())
2481  p.getNode()->setIROrder(idx + 1);
2482  InVals.push_back(p);
2483  }
2484 
2485  // Clang will check explicit varargs and issue an error if any are present.
2486  // However, Clang will let code with
2487  // an implicit vararg like f() pass. See bug 617733.
2488  // We treat this case as if the arg list is empty.
2489  // if (F.isVarArg()) {
2490  // assert(0 && "VarArg not supported yet!");
2491  //}
2492 
2493  if (!OutChains.empty())
2494  DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2495 
2496  return Chain;
2497 }
2498 
2499 SDValue
2500 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2501  bool isVarArg,
2502  const SmallVectorImpl<ISD::OutputArg> &Outs,
2503  const SmallVectorImpl<SDValue> &OutVals,
2504  const SDLoc &dl, SelectionDAG &DAG) const {
2505  MachineFunction &MF = DAG.getMachineFunction();
2506  Type *RetTy = MF.getFunction().getReturnType();
2507 
2508  bool isABI = (STI.getSmVersion() >= 20);
2509  assert(isABI && "Non-ABI compilation is not supported");
2510  if (!isABI)
2511  return Chain;
2512 
2513  const DataLayout DL = DAG.getDataLayout();
2514  SmallVector<EVT, 16> VTs;
2515  SmallVector<uint64_t, 16> Offsets;
2516  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2517  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2518 
2519  auto VectorInfo = VectorizePTXValueVTs(
2520  VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
2521 
2522  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2523  // 32-bits are sign extended or zero extended, depending on whether
2524  // they are signed or unsigned types.
2525  bool ExtendIntegerRetVal =
2526  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2527 
2528  SmallVector<SDValue, 6> StoreOperands;
2529  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2530  // New load/store. Record chain and offset operands.
2531  if (VectorInfo[i] & PVF_FIRST) {
2532  assert(StoreOperands.empty() && "Orphaned operand list.");
2533  StoreOperands.push_back(Chain);
2534  StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2535  }
2536 
2537  SDValue RetVal = OutVals[i];
2538  if (ExtendIntegerRetVal) {
2539  RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2540  : ISD::ZERO_EXTEND,
2541  dl, MVT::i32, RetVal);
2542  } else if (RetVal.getValueSizeInBits() < 16) {
2543  // Use 16-bit registers for small load-stores as it's the
2544  // smallest general purpose register size supported by NVPTX.
2545  RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2546  }
2547 
2548  // Record the value to return.
2549  StoreOperands.push_back(RetVal);
2550 
2551  // That's the last element of this store op.
2552  if (VectorInfo[i] & PVF_LAST) {
2553  NVPTXISD::NodeType Op;
2554  unsigned NumElts = StoreOperands.size() - 2;
2555  switch (NumElts) {
2556  case 1:
2557  Op = NVPTXISD::StoreRetval;
2558  break;
2559  case 2:
2560  Op = NVPTXISD::StoreRetvalV2;
2561  break;
2562  case 4:
2563  Op = NVPTXISD::StoreRetvalV4;
2564  break;
2565  default:
2566  llvm_unreachable("Invalid vector info.");
2567  }
2568 
2569  // Adjust type of load/store op if we've extended the scalar
2570  // return value.
2571  EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2572  Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
2573  StoreOperands, TheStoreType,
2574  MachinePointerInfo(), /* Align */ 1,
2575  MachineMemOperand::MOStore);
2576  // Cleanup vector state.
2577  StoreOperands.clear();
2578  }
2579  }
2580 
2581  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2582 }
2583 
2584 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2585  SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2586  SelectionDAG &DAG) const {
2587  if (Constraint.length() > 1)
2588  return;
2589  else
2590  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2591 }
2592 
2593 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2594  switch (Intrinsic) {
2595  default:
2596  return 0;
2597 
2598  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2599  return NVPTXISD::Tex1DFloatS32;
2600  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2602  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2604  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2606  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2607  return NVPTXISD::Tex1DS32S32;
2608  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2609  return NVPTXISD::Tex1DS32Float;
2610  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2612  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2614  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2615  return NVPTXISD::Tex1DU32S32;
2616  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2617  return NVPTXISD::Tex1DU32Float;
2618  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2620  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2622 
2623  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2625  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2627  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2629  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2631  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2633  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2635  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2637  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2639  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2641  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2643  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2645  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2647 
2648  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2649  return NVPTXISD::Tex2DFloatS32;
2650  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2652  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2654  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2656  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2657  return NVPTXISD::Tex2DS32S32;
2658  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2659  return NVPTXISD::Tex2DS32Float;
2660  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2662  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2664  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2665  return NVPTXISD::Tex2DU32S32;
2666  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2667  return NVPTXISD::Tex2DU32Float;
2668  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2670  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2672 
2673  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2675  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2677  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2679  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2681  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2683  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2685  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2687  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2689  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2691  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2693  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2695  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2697 
2698  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2699  return NVPTXISD::Tex3DFloatS32;
2700  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2702  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2704  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2706  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2707  return NVPTXISD::Tex3DS32S32;
2708  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2709  return NVPTXISD::Tex3DS32Float;
2710  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2712  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2714  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2715  return NVPTXISD::Tex3DU32S32;
2716  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2717  return NVPTXISD::Tex3DU32Float;
2718  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2720  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2722 
2723  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2725  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2727  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2729  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2731  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2733  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2735 
2736  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2738  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2740  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2742  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2744  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2746  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2748 
2749  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2751  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2753  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2755  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2757  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2759  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2761  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2763  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2765  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2767  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2769  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2771  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2773 
2774  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2776  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2778  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2780  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2782  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2784  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2786  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2788  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2790  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2792  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2794  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2796  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2798 
2799  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2801  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2803  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2805  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2807  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2809  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2811  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2813  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2815  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2817  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2819  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2821  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2823 
2824  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2826  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2828  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2830  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2832  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2834  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2836  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2838  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2840  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
2842  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
2844  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
2846  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
2848 
2849  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
2851  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
2853  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
2855  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
2857  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
2859  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
2861  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
2863  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
2865  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
2867  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
2869  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
2871  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
2873 
2874  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
2876  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
2878  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
2880  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
2882  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
2884  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
2886  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
2888  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
2890  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
2892  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
2894  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
2896  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
2898 
2899  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
2901  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
2903  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
2905  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
2907  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
2909  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
2911 
2912  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
2914  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
2916  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
2918  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
2920  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
2922  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
2924 
2925  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
2927  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
2929  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
2931  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
2933  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
2935  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
2937  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
2939  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
2941  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
2943  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
2945  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
2947  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
2949  }
2950 }
2951 
2952 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
2953  switch (Intrinsic) {
2954  default:
2955  return 0;
2956  case Intrinsic::nvvm_suld_1d_i8_clamp:
2957  return NVPTXISD::Suld1DI8Clamp;
2958  case Intrinsic::nvvm_suld_1d_i16_clamp:
2959  return NVPTXISD::Suld1DI16Clamp;
2960  case Intrinsic::nvvm_suld_1d_i32_clamp:
2961  return NVPTXISD::Suld1DI32Clamp;
2962  case Intrinsic::nvvm_suld_1d_i64_clamp:
2963  return NVPTXISD::Suld1DI64Clamp;
2964  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
2966  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
2968  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
2970  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
2972  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
2974  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
2976  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
2978  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
2980  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
2982  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
2984  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
2986  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
2988  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
2990  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
2992  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
2994  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
2996  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
2998  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3000  case Intrinsic::nvvm_suld_2d_i8_clamp:
3001  return NVPTXISD::Suld2DI8Clamp;
3002  case Intrinsic::nvvm_suld_2d_i16_clamp:
3003  return NVPTXISD::Suld2DI16Clamp;
3004  case Intrinsic::nvvm_suld_2d_i32_clamp:
3005  return NVPTXISD::Suld2DI32Clamp;
3006  case Intrinsic::nvvm_suld_2d_i64_clamp:
3007  return NVPTXISD::Suld2DI64Clamp;
3008  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3010  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3012  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3014  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3016  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3018  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3020  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3022  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3024  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3026  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3028  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3030  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3032  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3034  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3036  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3038  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3040  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3042  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3044  case Intrinsic::nvvm_suld_3d_i8_clamp:
3045  return NVPTXISD::Suld3DI8Clamp;
3046  case Intrinsic::nvvm_suld_3d_i16_clamp:
3047  return NVPTXISD::Suld3DI16Clamp;
3048  case Intrinsic::nvvm_suld_3d_i32_clamp:
3049  return NVPTXISD::Suld3DI32Clamp;
3050  case Intrinsic::nvvm_suld_3d_i64_clamp:
3051  return NVPTXISD::Suld3DI64Clamp;
3052  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3054  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3056  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3058  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3060  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3062  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3064  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3066  case Intrinsic::nvvm_suld_1d_i8_trap:
3067  return NVPTXISD::Suld1DI8Trap;
3068  case Intrinsic::nvvm_suld_1d_i16_trap:
3069  return NVPTXISD::Suld1DI16Trap;
3070  case Intrinsic::nvvm_suld_1d_i32_trap:
3071  return NVPTXISD::Suld1DI32Trap;
3072  case Intrinsic::nvvm_suld_1d_i64_trap:
3073  return NVPTXISD::Suld1DI64Trap;
3074  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3075  return NVPTXISD::Suld1DV2I8Trap;
3076  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3078  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3080  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3082  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3083  return NVPTXISD::Suld1DV4I8Trap;
3084  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3086  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3088  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3090  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3092  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3094  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3096  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3098  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3100  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3102  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3104  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3106  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3108  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3110  case Intrinsic::nvvm_suld_2d_i8_trap:
3111  return NVPTXISD::Suld2DI8Trap;
3112  case Intrinsic::nvvm_suld_2d_i16_trap:
3113  return NVPTXISD::Suld2DI16Trap;
3114  case Intrinsic::nvvm_suld_2d_i32_trap:
3115  return NVPTXISD::Suld2DI32Trap;
3116  case Intrinsic::nvvm_suld_2d_i64_trap:
3117  return NVPTXISD::Suld2DI64Trap;
3118  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3119  return NVPTXISD::Suld2DV2I8Trap;
3120  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3122  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3124  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3126  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3127  return NVPTXISD::Suld2DV4I8Trap;
3128  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3130  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3132  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3134  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3136  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3138  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3140  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3142  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3144  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3146  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3148  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3150  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3152  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3154  case Intrinsic::nvvm_suld_3d_i8_trap:
3155  return NVPTXISD::Suld3DI8Trap;
3156  case Intrinsic::nvvm_suld_3d_i16_trap:
3157  return NVPTXISD::Suld3DI16Trap;
3158  case Intrinsic::nvvm_suld_3d_i32_trap:
3159  return NVPTXISD::Suld3DI32Trap;
3160  case Intrinsic::nvvm_suld_3d_i64_trap:
3161  return NVPTXISD::Suld3DI64Trap;
3162  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3163  return NVPTXISD::Suld3DV2I8Trap;
3164  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3166  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3168  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3170  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3171  return NVPTXISD::Suld3DV4I8Trap;
3172  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3174  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3176  case Intrinsic::nvvm_suld_1d_i8_zero:
3177  return NVPTXISD::Suld1DI8Zero;
3178  case Intrinsic::nvvm_suld_1d_i16_zero:
3179  return NVPTXISD::Suld1DI16Zero;
3180  case Intrinsic::nvvm_suld_1d_i32_zero:
3181  return NVPTXISD::Suld1DI32Zero;
3182  case Intrinsic::nvvm_suld_1d_i64_zero:
3183  return NVPTXISD::Suld1DI64Zero;
3184  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3185  return NVPTXISD::Suld1DV2I8Zero;
3186  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3188  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3190  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3192  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3193  return NVPTXISD::Suld1DV4I8Zero;
3194  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3196  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3198  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3200  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3202  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3204  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3206  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3208  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3210  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3212  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3214  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3216  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3218  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3220  case Intrinsic::nvvm_suld_2d_i8_zero:
3221  return NVPTXISD::Suld2DI8Zero;
3222  case Intrinsic::nvvm_suld_2d_i16_zero:
3223  return NVPTXISD::Suld2DI16Zero;
3224  case Intrinsic::nvvm_suld_2d_i32_zero:
3225  return NVPTXISD::Suld2DI32Zero;
3226  case Intrinsic::nvvm_suld_2d_i64_zero:
3227  return NVPTXISD::Suld2DI64Zero;
3228  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3229  return NVPTXISD::Suld2DV2I8Zero;
3230  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3232  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3234  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3236  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3237  return NVPTXISD::Suld2DV4I8Zero;
3238  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3240  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3242  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3244  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3246  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3248  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3250  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3252  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3254  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3256  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3258  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3260  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3262  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3264  case Intrinsic::nvvm_suld_3d_i8_zero:
3265  return NVPTXISD::Suld3DI8Zero;
3266  case Intrinsic::nvvm_suld_3d_i16_zero:
3267  return NVPTXISD::Suld3DI16Zero;
3268  case Intrinsic::nvvm_suld_3d_i32_zero:
3269  return NVPTXISD::Suld3DI32Zero;
3270  case Intrinsic::nvvm_suld_3d_i64_zero:
3271  return NVPTXISD::Suld3DI64Zero;
3272  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3273  return NVPTXISD::Suld3DV2I8Zero;
3274  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3276  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3278  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3280  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3281  return NVPTXISD::Suld3DV4I8Zero;
3282  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3284  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3286  }
3287 }
3288 
3289 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3290 // TgtMemIntrinsic
3291 // because we need the information that is only available in the "Value" type
3292 // of the destination
3293 // pointer. In particular, the address space information.
3294 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3295  IntrinsicInfo &Info, const CallInst &I,
3296  MachineFunction &MF, unsigned Intrinsic) const {
3297  switch (Intrinsic) {
3298  default:
3299  return false;
3300  case Intrinsic::nvvm_match_all_sync_i32p:
3301  case Intrinsic::nvvm_match_all_sync_i64p:
3302  Info.opc = ISD::INTRINSIC_W_CHAIN;
3303  // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3304  // in order to model data exchange with other threads, but perform no real
3305  // memory accesses.
3306  Info.memVT = MVT::i1;
3307 
3308  // Our result depends on both our and other thread's arguments.
3309  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3310  return true;
3311  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3312  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3313  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3314  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3315  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3316  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3317  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3318  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3319  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3320  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3321  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3322  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3323  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3324  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3325  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3326  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3327  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3328  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3329  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3330  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3331  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3332  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3333  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3334  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3335  Info.opc = ISD::INTRINSIC_W_CHAIN;
3336  Info.memVT = MVT::v8f16;
3337  Info.ptrVal = I.getArgOperand(0);
3338  Info.offset = 0;
3339  Info.flags = MachineMemOperand::MOLoad;
3340  Info.align = 16;
3341  return true;
3342  }
3343 
3344  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3345  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3346  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3347  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3348  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3349  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3350  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3351  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3352  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3353  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3354  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3355  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3356  Info.opc = ISD::INTRINSIC_W_CHAIN;
3357  Info.memVT = MVT::v4f16;
3358  Info.ptrVal = I.getArgOperand(0);
3359  Info.offset = 0;
3360  Info.flags = MachineMemOperand::MOLoad;
3361  Info.align = 16;
3362  return true;
3363  }
3364 
3365  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3366  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3367  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3368  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3369  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3370  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3371  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3372  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3373  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3374  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3375  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3376  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
3377  Info.opc = ISD::INTRINSIC_W_CHAIN;
3378  Info.memVT = MVT::v8f32;
3379  Info.ptrVal = I.getArgOperand(0);
3380  Info.offset = 0;
3381  Info.flags = MachineMemOperand::MOLoad;
3382  Info.align = 16;
3383  return true;
3384  }
3385 
3386  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3387  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3388  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3389  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3390  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3391  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3392  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3393  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3394  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3395  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3396  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3397  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3398  Info.opc = ISD::INTRINSIC_VOID;
3399  Info.memVT = MVT::v4f16;
3400  Info.ptrVal = I.getArgOperand(0);
3401  Info.offset = 0;
3402  Info.flags = MachineMemOperand::MOStore;
3403  Info.align = 16;
3404  return true;
3405  }
3406 
3407  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3408  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3409  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3410  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3411  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3412  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3413  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3414  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3415  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3416  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3417  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3418  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
3419  Info.opc = ISD::INTRINSIC_VOID;
3420  Info.memVT = MVT::v8f32;
3421  Info.ptrVal = I.getArgOperand(0);
3422  Info.offset = 0;
3423  Info.flags = MachineMemOperand::MOStore;
3424  Info.align = 16;
3425  return true;
3426  }
3427 
3428  case Intrinsic::nvvm_atomic_load_add_f32:
3429  case Intrinsic::nvvm_atomic_load_add_f64:
3430  case Intrinsic::nvvm_atomic_load_inc_32:
3431  case Intrinsic::nvvm_atomic_load_dec_32:
3432 
3433  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3434  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3435  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3436  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3437  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3438  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3439  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3440  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3441  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3442  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3443  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3444  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3445  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3446  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3447  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3448  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3449  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3450  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3451  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3452  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3453  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3454  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
3455  auto &DL = I.getModule()->getDataLayout();
3456  Info.opc = ISD::INTRINSIC_W_CHAIN;
3457  Info.memVT = getValueType(DL, I.getType());
3458  Info.ptrVal = I.getArgOperand(0);
3459  Info.offset = 0;
3460  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3461  Info.align = 0;
3462  return true;
3463  }
3464 
3465  case Intrinsic::nvvm_ldu_global_i:
3466  case Intrinsic::nvvm_ldu_global_f:
3467  case Intrinsic::nvvm_ldu_global_p: {
3468  auto &DL = I.getModule()->getDataLayout();
3469  Info.opc = ISD::INTRINSIC_W_CHAIN;
3470  if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3471  Info.memVT = getValueType(DL, I.getType());
3472  else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3473  Info.memVT = getPointerTy(DL);
3474  else
3475  Info.memVT = getValueType(DL, I.getType());
3476  Info.ptrVal = I.getArgOperand(0);
3477  Info.offset = 0;
3478  Info.flags = MachineMemOperand::MOLoad;
3479  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3480 
3481  return true;
3482  }
3483  case Intrinsic::nvvm_ldg_global_i:
3484  case Intrinsic::nvvm_ldg_global_f:
3485  case Intrinsic::nvvm_ldg_global_p: {
3486  auto &DL = I.getModule()->getDataLayout();
3487 
3488  Info.opc = ISD::INTRINSIC_W_CHAIN;
3489  if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3490  Info.memVT = getValueType(DL, I.getType());
3491  else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3492  Info.memVT = getPointerTy(DL);
3493  else
3494  Info.memVT = getValueType(DL, I.getType());
3495  Info.ptrVal = I.getArgOperand(0);
3496  Info.offset = 0;
3497  Info.flags = MachineMemOperand::MOLoad;
3498  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3499 
3500  return true;
3501  }
3502 
3503  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3504  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3505  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3506  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3507  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3508  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3509  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3510  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3511  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3512  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3513  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3514  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3515  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3516  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3517  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3518  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3519  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3520  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3521  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3522  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3523  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3524  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3525  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3526  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3527  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3528  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3529  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3530  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3531  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3532  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3533  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3534  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3535  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3536  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3537  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3538  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3539  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3540  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3541  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3542  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3543  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3544  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3545  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3546  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3547  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3548  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3549  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3550  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3551  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3552  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3553  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3554  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3555  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3556  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3557  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3558  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3559  Info.opc = getOpcForTextureInstr(Intrinsic);
3560  Info.memVT = MVT::v4f32;
3561  Info.ptrVal = nullptr;
3562  Info.offset = 0;
3563  Info.flags = MachineMemOperand::MOLoad;
3564  Info.align = 16;
3565  return true;
3566 
3567  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3568  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3569  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3570  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3571  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3572  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3573  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3574  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3575  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3576  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3577  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3578  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3579  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3580  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3581  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3582  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3583  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3584  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3585  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3586  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3587  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3588  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3589  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3590  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3591  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3592  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3593  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3594  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3595  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3596  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3597  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3598  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3599  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3600  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3601  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3602  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3603  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3604  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3605  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3606  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3607  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3608  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3609  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3610  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3611  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3612  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3613  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3614  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3615  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3616  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3617  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3618  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3619  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3620  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3621  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3622  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3623  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3624  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3625  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3626  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3627  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3628  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3629  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3630  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3631  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3632  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3633  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3634  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3635  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3636  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3637  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3638  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3639  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3640  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3641  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3642  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3643  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3644  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3645  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3646  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3647  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3648  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3649  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3650  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3651  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3652  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3653  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3654  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3655  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3656  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3657  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3658  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3659  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3660  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3661  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3662  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3663  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3664  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3665  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3666  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3667  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3668  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3669  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3670  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3671  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3672  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3673  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3674  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3675  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3676  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3677  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3678  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3679  Info.opc = getOpcForTextureInstr(Intrinsic);
3680  Info.memVT = MVT::v4i32;
3681  Info.ptrVal = nullptr;
3682  Info.offset = 0;
3683  Info.flags = MachineMemOperand::MOLoad;
3684  Info.align = 16;
3685  return true;
3686 
3687  case Intrinsic::nvvm_suld_1d_i8_clamp:
3688  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3689  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3690  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3691  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3692  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3693  case Intrinsic::nvvm_suld_2d_i8_clamp:
3694  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3695  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3696  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3697  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3698  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3699  case Intrinsic::nvvm_suld_3d_i8_clamp:
3700  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3701  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3702  case Intrinsic::nvvm_suld_1d_i8_trap:
3703  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3704  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3705  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3706  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3707  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3708  case Intrinsic::nvvm_suld_2d_i8_trap:
3709  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3710  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3711  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3712  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3713  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3714  case Intrinsic::nvvm_suld_3d_i8_trap:
3715  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3716  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3717  case Intrinsic::nvvm_suld_1d_i8_zero:
3718  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3719  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3720  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3721  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3722  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3723  case Intrinsic::nvvm_suld_2d_i8_zero:
3724  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3725  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3726  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3727  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3728  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3729  case Intrinsic::nvvm_suld_3d_i8_zero:
3730  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3731  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3732  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3733  Info.memVT = MVT::i8;
3734  Info.ptrVal = nullptr;
3735  Info.offset = 0;
3736  Info.flags = MachineMemOperand::MOLoad;
3737  Info.align = 16;
3738  return true;
3739 
3740  case Intrinsic::nvvm_suld_1d_i16_clamp:
3741  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3742  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3743  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3744  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3745  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3746  case Intrinsic::nvvm_suld_2d_i16_clamp:
3747  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3748  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3749  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3750  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3751  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3752  case Intrinsic::nvvm_suld_3d_i16_clamp:
3753  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3754  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3755  case Intrinsic::nvvm_suld_1d_i16_trap:
3756  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3757  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3758  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3759  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3760  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3761  case Intrinsic::nvvm_suld_2d_i16_trap:
3762  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3763  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3764  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3765  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3766  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3767  case Intrinsic::nvvm_suld_3d_i16_trap:
3768  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3769  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3770  case Intrinsic::nvvm_suld_1d_i16_zero:
3771  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3772  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3773  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3774  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3775  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3776  case Intrinsic::nvvm_suld_2d_i16_zero:
3777  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3778  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3779  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3780  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3781  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3782  case Intrinsic::nvvm_suld_3d_i16_zero:
3783  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3784  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3785  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3786  Info.memVT = MVT::i16;
3787  Info.ptrVal = nullptr;
3788  Info.offset = 0;
3789  Info.flags = MachineMemOperand::MOLoad;
3790  Info.align = 16;
3791  return true;
3792 
3793  case Intrinsic::nvvm_suld_1d_i32_clamp:
3794  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3795  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3796  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3797  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3798  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3799  case Intrinsic::nvvm_suld_2d_i32_clamp:
3800  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3801  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3802  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3803  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3804  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3805  case Intrinsic::nvvm_suld_3d_i32_clamp:
3806  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3807  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3808  case Intrinsic::nvvm_suld_1d_i32_trap:
3809  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3810  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3811  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3812  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3813  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3814  case Intrinsic::nvvm_suld_2d_i32_trap:
3815  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3816  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3817  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3818  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3819  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3820  case Intrinsic::nvvm_suld_3d_i32_trap:
3821  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3822  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3823  case Intrinsic::nvvm_suld_1d_i32_zero:
3824  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3825  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3826  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3827  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3828  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3829  case Intrinsic::nvvm_suld_2d_i32_zero:
3830  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3831  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3832  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3833  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3834  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3835  case Intrinsic::nvvm_suld_3d_i32_zero:
3836  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3837  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3838  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3839  Info.memVT = MVT::i32;
3840  Info.ptrVal = nullptr;
3841  Info.offset = 0;
3842  Info.flags = MachineMemOperand::MOLoad;
3843  Info.align = 16;
3844  return true;
3845 
3846  case Intrinsic::nvvm_suld_1d_i64_clamp:
3847  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3848  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3849  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3850  case Intrinsic::nvvm_suld_2d_i64_clamp:
3851  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3852  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3853  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3854  case Intrinsic::nvvm_suld_3d_i64_clamp:
3855  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3856  case Intrinsic::nvvm_suld_1d_i64_trap:
3857  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3858  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3859  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3860  case Intrinsic::nvvm_suld_2d_i64_trap:
3861  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3862  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3863  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3864  case Intrinsic::nvvm_suld_3d_i64_trap:
3865  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3866  case Intrinsic::nvvm_suld_1d_i64_zero:
3867  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3868  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3869  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3870  case Intrinsic::nvvm_suld_2d_i64_zero:
3871  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3872  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3873  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3874  case Intrinsic::nvvm_suld_3d_i64_zero:
3875  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3876  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3877  Info.memVT = MVT::i64;
3878  Info.ptrVal = nullptr;
3879  Info.offset = 0;
3880  Info.flags = MachineMemOperand::MOLoad;
3881  Info.align = 16;
3882  return true;
3883  }
3884  return false;
3885 }
3886 
3887 /// isLegalAddressingMode - Return true if the addressing mode represented
3888 /// by AM is legal for this target, for a load/store of the specified type.
3889 /// Used to guide target specific optimizations, like loop strength reduction
3890 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
3891 /// (CodeGenPrepare.cpp)
3892 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
3893  const AddrMode &AM, Type *Ty,
3894  unsigned AS, Instruction *I) const {
3895  // AddrMode - This represents an addressing mode of:
3896  // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
3897  //
3898  // The legal address modes are
3899  // - [avar]
3900  // - [areg]
3901  // - [areg+immoff]
3902  // - [immAddr]
3903 
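 // For example (illustrative only; %r1/%r2 are hypothetical registers):
 //   [globalvar]      BaseGV only                      -> accepted
 //   [%r1+8]          base register + immediate offset -> accepted
 //   [%r1+%r2]        Scale==1 with a base register    -> rejected
 //   [%r1+4*%r2]      Scale>1                          -> rejected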
3904  if (AM.BaseGV) {
3905  return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
3906  }
3907 
3908  switch (AM.Scale) {
3909  case 0: // "r", "r+i" or "i" is allowed
3910  break;
3911  case 1:
3912  if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
3913  return false;
3914  // Otherwise we have r+i.
3915  break;
3916  default:
3917  // No scale > 1 is allowed
3918  return false;
3919  }
3920  return true;
3921 }
3922 
3923 //===----------------------------------------------------------------------===//
3924 // NVPTX Inline Assembly Support
3925 //===----------------------------------------------------------------------===//
3926 
3927 /// getConstraintType - Given a constraint letter, return the type of
3928 /// constraint it is for this target.
3929 NVPTXTargetLowering::ConstraintType
3930 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
3931  if (Constraint.size() == 1) {
3932  switch (Constraint[0]) {
3933  default:
3934  break;
3935  case 'b':
3936  case 'r':
3937  case 'h':
3938  case 'c':
3939  case 'l':
3940  case 'f':
3941  case 'd':
3942  case '0':
3943  case 'N':
3944  return C_RegisterClass;
3945  }
3946  }
3947  return TargetLowering::getConstraintType(Constraint);
3948 }
3949 
3950 std::pair<unsigned, const TargetRegisterClass *>
3951 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
3952  StringRef Constraint,
3953  MVT VT) const {
3954  if (Constraint.size() == 1) {
3955  switch (Constraint[0]) {
3956  case 'b':
3957  return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
3958  case 'c':
3959  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
3960  case 'h':
3961  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
3962  case 'r':
3963  return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
3964  case 'l':
3965  case 'N':
3966  return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
3967  case 'f':
3968  return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
3969  case 'd':
3970  return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
3971  }
3972  }
3973  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3974 }
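 // Usage example (illustrative only; a, b, c are hypothetical CUDA variables):
 // the 'r' constraint below maps to Int32RegsRegClass above, 'l' to a 64-bit
 // register, 'f' to a 32-bit float register, and 'd' to a double register.
 //   asm("add.s32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b));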
3975 
3976 //===----------------------------------------------------------------------===//
3977 // NVPTX DAG Combining
3978 //===----------------------------------------------------------------------===//
3979 
3980 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
3981  CodeGenOpt::Level OptLevel) const {
3982  // Always honor command-line argument
3983  if (FMAContractLevelOpt.getNumOccurrences() > 0)
3984  return FMAContractLevelOpt > 0;
3985 
3986  // Do not contract if we're not optimizing the code.
3987  if (OptLevel == 0)
3988  return false;
3989 
3990  // Honor TargetOptions flags that explicitly say fusion is okay.
3991  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
3992  return true;
3993 
3994  return allowUnsafeFPMath(MF);
3995 }
3996 
3997 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
3998  // Honor TargetOptions flags that explicitly say unsafe math is okay.
3999  if (MF.getTarget().Options.UnsafeFPMath)
4000  return true;
4001 
4002  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4003  const Function &F = MF.getFunction();
4004  if (F.hasFnAttribute("unsafe-fp-math")) {
4005  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
4006  StringRef Val = Attr.getValueAsString();
4007  if (Val == "true")
4008  return true;
4009  }
4010 
4011  return false;
4012 }
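 // Illustrative IR (assumed, not from this file) that satisfies the attribute
 // check above:
 //   define float @foo(float %x, float %y) #0 { ... }
 //   attributes #0 = { "unsafe-fp-math"="true" }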
4013 
4014 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4015 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4016 /// called with the default operands, and if that fails, with commuted
4017 /// operands.
4018 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4019  TargetLowering::DAGCombinerInfo &DCI,
4020  const NVPTXSubtarget &Subtarget,
4021  CodeGenOpt::Level OptLevel) {
4022  SelectionDAG &DAG = DCI.DAG;
4023  // Skip non-integer, non-scalar case
4024  EVT VT=N0.getValueType();
4025  if (VT.isVector())
4026  return SDValue();
4027 
4028  // fold (add (mul a, b), c) -> (mad a, b, c)
4029  //
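 // For example (illustrative only): for i32 this typically selects to a single
 // PTX instruction of the form  mad.lo.s32 %r3, %r0, %r1, %r2;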
4030  if (N0.getOpcode() == ISD::MUL) {
4031  assert (VT.isInteger());
4032  // For integer:
4033  // Since integer multiply-add costs the same as integer multiply
4034  // but is more costly than integer add, do the fusion only when
4035  // the mul is only used in the add.
4036  if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4037  !N0.getNode()->hasOneUse())
4038  return SDValue();
4039 
4040  // Do the folding
4041  return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4042  N0.getOperand(0), N0.getOperand(1), N1);
4043  }
4044  else if (N0.getOpcode() == ISD::FMUL) {
4045  if (VT == MVT::f32 || VT == MVT::f64) {
4046  const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4047  &DAG.getTargetLoweringInfo());
4048  if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4049  return SDValue();
4050 
4051  // For floating point:
4052  // Do the fusion only when the mul has fewer than 5 uses and all of
4053  // them are adds.
4054  // The heuristic is that if a use is not an add, then that use
4055  // cannot be fused into an fma, so the mul is still needed anyway.
4056  // If there are more than 4 uses, even if they are all adds, fusing
4057  // them will increase register pressure.
4058  //
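 // For example (illustrative only): if t = fmul %a, %b feeds three fadd uses,
 // every add can become an fma and the fmul disappears; if one use is, say, an
 // fdiv, the fmul must stay live anyway and fusing may only add pressure.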
4059  int numUses = 0;
4060  int nonAddCount = 0;
4061  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4062  UE = N0.getNode()->use_end();
4063  UI != UE; ++UI) {
4064  numUses++;
4065  SDNode *User = *UI;
4066  if (User->getOpcode() != ISD::FADD)
4067  ++nonAddCount;
4068  }
4069  if (numUses >= 5)
4070  return SDValue();
4071  if (nonAddCount) {
4072  int orderNo = N->getIROrder();
4073  int orderNo2 = N0.getNode()->getIROrder();
4074  // Simple heuristic for estimating potential register pressure: the
4075  // difference in IR order approximates the distance between the def and
4076  // the use; the longer that distance, the more likely the fusion will
4077  // increase register pressure.
4078  if (orderNo - orderNo2 < 500)
4079  return SDValue();
4080 
4081  // Now, check if at least one of the FMUL's operands is live beyond the node N,
4082  // which guarantees that the FMA will not increase register pressure at node N.
4083  bool opIsLive = false;
4084  const SDNode *left = N0.getOperand(0).getNode();
4085  const SDNode *right = N0.getOperand(1).getNode();
4086 
4087  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4088  opIsLive = true;
4089 
4090  if (!opIsLive)
4091  for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
4092  SDNode *User = *UI;
4093  int orderNo3 = User->getIROrder();
4094  if (orderNo3 > orderNo) {
4095  opIsLive = true;
4096  break;
4097  }
4098  }
4099 
4100  if (!opIsLive)
4101  for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
4102  SDNode *User = *UI;
4103  int orderNo3 = User->getIROrder();
4104  if (orderNo3 > orderNo) {
4105  opIsLive = true;
4106  break;
4107  }
4108  }
4109 
4110  if (!opIsLive)
4111  return SDValue();
4112  }
4113 
4114  return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4115  N0.getOperand(0), N0.getOperand(1), N1);
4116  }
4117  }
4118 
4119  return SDValue();
4120 }
4121 
4122 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4123 ///
4124 static SDValue PerformADDCombine(SDNode *N,
4125  TargetLowering::DAGCombinerInfo &DCI,
4126  const NVPTXSubtarget &Subtarget,
4127  CodeGenOpt::Level OptLevel) {
4128  SDValue N0 = N->getOperand(0);
4129  SDValue N1 = N->getOperand(1);
4130 
4131  // First try with the default operand order.
4132  if (SDValue Result =
4133  PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4134  return Result;
4135 
4136  // If that didn't work, try again with the operands commuted.
4137  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4138 }
4139 
4140 static SDValue PerformANDCombine(SDNode *N,
4141  TargetLowering::DAGCombinerInfo &DCI) {
4142  // The type legalizer turns a vector load of i8 values into a zextload to i16
4143  // registers, optionally ANY_EXTENDs it (if target type is integer),
4144  // and ANDs off the high 8 bits. Since we turn this load into a
4145  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4146  // nodes. Do that here.
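 // For example (illustrative only), the redundant pattern looks like
 //   (and (any_extend (NVPTXISD::LoadV2 ..., zextload from v2i8)), 0xff)
 // where the AND can be dropped because the load already zero-extends each lane.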
4147  SDValue Val = N->getOperand(0);
4148  SDValue Mask = N->getOperand(1);
4149 
4150  if (isa<ConstantSDNode>(Val)) {
4151  std::swap(Val, Mask);
4152  }
4153 
4154  SDValue AExt;
4155  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4156  if (Val.getOpcode() == ISD::ANY_EXTEND) {
4157  AExt = Val;
4158  Val = Val->getOperand(0);
4159  }
4160 
4161  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4162  Val = Val->getOperand(0);
4163  }
4164 
4165  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4166  Val->getOpcode() == NVPTXISD::LoadV4) {
4167  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4168  if (!MaskCnst) {
4169  // Not an AND with a constant
4170  return SDValue();
4171  }
4172 
4173  uint64_t MaskVal = MaskCnst->getZExtValue();
4174  if (MaskVal != 0xff) {
4175  // Not an AND that chops off top 8 bits
4176  return SDValue();
4177  }
4178 
4179  MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4180  if (!Mem) {
4181  // Not a MemSDNode?!?
4182  return SDValue();
4183  }
4184 
4185  EVT MemVT = Mem->getMemoryVT();
4186  if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4187  // We only handle the i8 case
4188  return SDValue();
4189  }
4190 
4191  unsigned ExtType =
4192  cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4193  getZExtValue();
4194  if (ExtType == ISD::SEXTLOAD) {
4195  // If for some reason the load is a sextload, the and is needed to zero
4196  // out the high 8 bits
4197  return SDValue();
4198  }
4199 
4200  bool AddTo = false;
4201  if (AExt.getNode() != nullptr) {
4202  // Re-insert the ext as a zext.
4203  Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4204  AExt.getValueType(), Val);
4205  AddTo = true;
4206  }
4207 
4208  // If we get here, the AND is unnecessary. Just replace it with the load
4209  DCI.CombineTo(N, Val, AddTo);
4210  }
4211 
4212  return SDValue();
4213 }
4214 
4215 static SDValue PerformREMCombine(SDNode *N,
4216  TargetLowering::DAGCombinerInfo &DCI,
4217  CodeGenOpt::Level OptLevel) {
4218  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4219 
4220  // Don't do anything at less than -O2.
4221  if (OptLevel < CodeGenOpt::Default)
4222  return SDValue();
4223 
4224  SelectionDAG &DAG = DCI.DAG;
4225  SDLoc DL(N);
4226  EVT VT = N->getValueType(0);
4227  bool IsSigned = N->getOpcode() == ISD::SREM;
4228  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4229 
4230  const SDValue &Num = N->getOperand(0);
4231  const SDValue &Den = N->getOperand(1);
4232 
4233  for (const SDNode *U : Num->uses()) {
4234  if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4235  U->getOperand(1) == Den) {
4236  // Num % Den -> Num - (Num / Den) * Den
4237  return DAG.getNode(ISD::SUB, DL, VT, Num,
4238  DAG.getNode(ISD::MUL, DL, VT,
4239  DAG.getNode(DivOpc, DL, VT, Num, Den),
4240  Den));
4241  }
4242  }
4243  return SDValue();
4244 }
4245 
4246 enum OperandSignedness {
4247  Signed = 0,
4248  Unsigned,
4249  Unknown
4250 };
4251 
4252 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4253 /// that can be demoted to \p OptSize bits without loss of information. The
4254 /// signedness of the operand, if determinable, is placed in \p S.
4255 static bool IsMulWideOperandDemotable(SDValue Op,
4256  unsigned OptSize,
4257  OperandSignedness &S) {
4258  S = Unknown;
4259 
4260  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4261  Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4262  EVT OrigVT = Op.getOperand(0).getValueType();
4263  if (OrigVT.getSizeInBits() <= OptSize) {
4264  S = Signed;
4265  return true;
4266  }
4267  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4268  EVT OrigVT = Op.getOperand(0).getValueType();
4269  if (OrigVT.getSizeInBits() <= OptSize) {
4270  S = Unsigned;
4271  return true;
4272  }
4273  }
4274 
4275  return false;
4276 }
4277 
4278 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4279 /// be demoted to \p OptSize bits without loss of information. If the operands
4280 /// contain a constant, it should appear as the RHS operand. The signedness of
4281 /// the operands is placed in \p IsSigned.
4282 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4283  unsigned OptSize,
4284  bool &IsSigned) {
4285  OperandSignedness LHSSign;
4286 
4287  // The LHS operand must be a demotable op
4288  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4289  return false;
4290 
4291  // We should have been able to determine the signedness from the LHS
4292  if (LHSSign == Unknown)
4293  return false;
4294 
4295  IsSigned = (LHSSign == Signed);
4296 
4297  // The RHS can be a demotable op or a constant
4298  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4299  const APInt &Val = CI->getAPIntValue();
4300  if (LHSSign == Unsigned) {
4301  return Val.isIntN(OptSize);
4302  } else {
4303  return Val.isSignedIntN(OptSize);
4304  }
4305  } else {
4306  OperandSignedness RHSSign;
4307  if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4308  return false;
4309 
4310  return LHSSign == RHSSign;
4311  }
4312 }
4313 
4314 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4315 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4316 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4317 /// amount.
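/// For example (illustrative only):
///   (mul i32 (sext i16 %a), (sext i16 %b))  ->  mul.wide.s16
///   (shl i64 (zext i32 %a), 3)              ->  mul.wide.u32 %a, 8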
4318 static SDValue TryMULWIDECombine(SDNode *N,
4319  TargetLowering::DAGCombinerInfo &DCI) {
4320  EVT MulType = N->getValueType(0);
4321  if (MulType != MVT::i32 && MulType != MVT::i64) {
4322  return SDValue();
4323  }
4324 
4325  SDLoc DL(N);
4326  unsigned OptSize = MulType.getSizeInBits() >> 1;
4327  SDValue LHS = N->getOperand(0);
4328  SDValue RHS = N->getOperand(1);
4329 
4330  // Canonicalize the multiply so the constant (if any) is on the right
4331  if (N->getOpcode() == ISD::MUL) {
4332  if (isa<ConstantSDNode>(LHS)) {
4333  std::swap(LHS, RHS);
4334  }
4335  }
4336 
4337  // If we have a SHL, determine the actual multiply amount
4338  if (N->getOpcode() == ISD::SHL) {
4339  ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4340  if (!ShlRHS) {
4341  return SDValue();
4342  }
4343 
4344  APInt ShiftAmt = ShlRHS->getAPIntValue();
4345  unsigned BitWidth = MulType.getSizeInBits();
4346  if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4347  APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4348  RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4349  } else {
4350  return SDValue();
4351  }
4352  }
4353 
4354  bool Signed;
4355  // Verify that our operands are demotable
4356  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4357  return SDValue();
4358  }
4359 
4360  EVT DemotedVT;
4361  if (MulType == MVT::i32) {
4362  DemotedVT = MVT::i16;
4363  } else {
4364  DemotedVT = MVT::i32;
4365  }
4366 
4367  // Truncate the operands to the correct size. Note that these are just for
4368  // type consistency and will (likely) be eliminated in later phases.
4369  SDValue TruncLHS =
4370  DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4371  SDValue TruncRHS =
4372  DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4373 
4374  unsigned Opc;
4375  if (Signed) {
4376  Opc = NVPTXISD::MUL_WIDE_SIGNED;
4377  } else {
4378  Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4379  }
4380 
4381  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4382 }
4383 
4384 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4385 static SDValue PerformMULCombine(SDNode *N,
4386  TargetLowering::DAGCombinerInfo &DCI,
4387  CodeGenOpt::Level OptLevel) {
4388  if (OptLevel > 0) {
4389  // Try mul.wide combining at OptLevel > 0
4390  if (SDValue Ret = TryMULWIDECombine(N, DCI))
4391  return Ret;
4392  }
4393 
4394  return SDValue();
4395 }
4396 
4397 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4398 static SDValue PerformSHLCombine(SDNode *N,
4399  TargetLowering::DAGCombinerInfo &DCI,
4400  CodeGenOpt::Level OptLevel) {
4401  if (OptLevel > 0) {
4402  // Try mul.wide combining at OptLevel > 0
4403  if (SDValue Ret = TryMULWIDECombine(N, DCI))
4404  return Ret;
4405  }
4406 
4407  return SDValue();
4408 }
4409 
4410 static SDValue PerformSETCCCombine(SDNode *N,
4411  TargetLowering::DAGCombinerInfo &DCI) {
4412  EVT CCType = N->getValueType(0);
4413  SDValue A = N->getOperand(0);
4414  SDValue B = N->getOperand(1);
4415 
4416  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
4417  return SDValue();
4418 
4419  SDLoc DL(N);
4420  // setp.f16x2 returns two scalar predicates, which we need to
4421  // convert back to v2i1. The returned result will be scalarized by
4422  // the legalizer, but the comparison will remain a single vector
4423  // instruction.
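 // For example (illustrative only): (setcc v2f16 %a, %b, setolt) becomes one
 // setp.lt.f16x2 producing two i1 results that are rebuilt into the v2i1 value.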
4424  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
4425  DCI.DAG.getVTList(MVT::i1, MVT::i1),
4426  {A, B, N->getOperand(2)});
4427  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
4428  CCNode.getValue(1));
4429 }
4430 
4431 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4432  DAGCombinerInfo &DCI) const {
4433  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4434  switch (N->getOpcode()) {
4435  default: break;
4436  case ISD::ADD:
4437  case ISD::FADD:
4438  return PerformADDCombine(N, DCI, STI, OptLevel);
4439  case ISD::MUL:
4440  return PerformMULCombine(N, DCI, OptLevel);
4441  case ISD::SHL:
4442  return PerformSHLCombine(N, DCI, OptLevel);
4443  case ISD::AND:
4444  return PerformANDCombine(N, DCI);
4445  case ISD::UREM:
4446  case ISD::SREM:
4447  return PerformREMCombine(N, DCI, OptLevel);
4448  case ISD::SETCC:
4449  return PerformSETCCCombine(N, DCI);
4450  }
4451  return SDValue();
4452 }
4453 
4454 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
4455 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4456  SmallVectorImpl<SDValue> &Results) {
4457  EVT ResVT = N->getValueType(0);
4458  SDLoc DL(N);
4459 
4460  assert(ResVT.isVector() && "Vector load must have vector type");
4461 
4462  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4463  // legal. We can (and should) split that into 2 loads of <2 x double> here
4464  // but I'm leaving that as a TODO for now.
4465  assert(ResVT.isSimple() && "Can only handle simple types");
4466  switch (ResVT.getSimpleVT().SimpleTy) {
4467  default:
4468  return;
4469  case MVT::v2i8:
4470  case MVT::v2i16:
4471  case MVT::v2i32:
4472  case MVT::v2i64:
4473  case MVT::v2f16:
4474  case MVT::v2f32:
4475  case MVT::v2f64:
4476  case MVT::v4i8:
4477  case MVT::v4i16:
4478  case MVT::v4i32:
4479  case MVT::v4f16:
4480  case MVT::v4f32:
4481  case MVT::v8f16: // <4 x f16x2>
4482  // This is a "native" vector type
4483  break;
4484  }
4485 
4486  LoadSDNode *LD = cast<LoadSDNode>(N);
4487 
4488  unsigned Align = LD->getAlignment();
4489  auto &TD = DAG.getDataLayout();
4490  unsigned PrefAlign =
4491  TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
4492  if (Align < PrefAlign) {
4493  // This load is not sufficiently aligned, so bail out and let this vector
4494  // load be scalarized. Note that we may still be able to emit smaller
4495  // vector loads. For example, if we are loading a <4 x float> with an
4496  // alignment of 8, this check will fail but the legalizer will try again
4497  // with 2 x <2 x float>, which will succeed with an alignment of 8.
4498  return;
4499  }
4500 
4501  EVT EltVT = ResVT.getVectorElementType();
4502  unsigned NumElts = ResVT.getVectorNumElements();
4503 
4504  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
4505  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
4506  // loaded type to i16 and propagate the "real" type as the memory type.
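 // For example (illustrative only): a <4 x i8> load is emitted with i16 result
 // values and v4i8 as the memory type; each lane is truncated back to i8 below.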
4507  bool NeedTrunc = false;
4508  if (EltVT.getSizeInBits() < 16) {
4509  EltVT = MVT::i16;
4510  NeedTrunc = true;
4511  }
4512 
4513  unsigned Opcode = 0;
4514  SDVTList LdResVTs;
4515  bool LoadF16x2 = false;
4516 
4517  switch (NumElts) {
4518  default:
4519  return;
4520  case 2:
4521  Opcode = NVPTXISD::LoadV2;
4522  LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4523  break;
4524  case 4: {
4525  Opcode = NVPTXISD::LoadV4;
4526  EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4527  LdResVTs = DAG.getVTList(ListVTs);
4528  break;
4529  }
4530  case 8: {
4531  // v8f16 is a special case. PTX doesn't have ld.v8.f16
4532  // instruction. Instead, we split the vector into v2f16 chunks and
4533  // load them with ld.v4.b32.
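 // For example (illustrative only, hypothetical registers and address): a
 // <8 x half> load becomes  ld.v4.b32 {%r0, %r1, %r2, %r3}, [addr];  where each
 // 32-bit register holds one packed f16x2 pair.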
4534  assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
4535  LoadF16x2 = true;
4536  Opcode = NVPTXISD::LoadV4;
4537  EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
4538  MVT::Other};
4539  LdResVTs = DAG.getVTList(ListVTs);
4540  break;
4541  }
4542  }
4543 
4544  // Copy regular operands
4545  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
4546 
4547  // The select routine does not have access to the LoadSDNode instance, so
4548  // pass along the extension information
4549  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
4550 
4551  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4552  LD->getMemoryVT(),
4553  LD->getMemOperand());
4554 
4555  SmallVector<SDValue, 8> ScalarRes;
4556  if (LoadF16x2) {
4557  // Split v2f16 subvectors back into individual elements.
4558  NumElts /= 2;
4559  for (unsigned i = 0; i < NumElts; ++i) {
4560  SDValue SubVector = NewLD.getValue(i);
4561  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4562  DAG.getIntPtrConstant(0, DL));
4563  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4564  DAG.getIntPtrConstant(1, DL));
4565  ScalarRes.push_back(E0);
4566  ScalarRes.push_back(E1);
4567  }
4568  } else {
4569  for (unsigned i = 0; i < NumElts; ++i) {
4570  SDValue Res = NewLD.getValue(i);
4571  if (NeedTrunc)
4572  Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4573  ScalarRes.push_back(Res);
4574  }
4575  }
4576 
4577  SDValue LoadChain = NewLD.getValue(NumElts);
4578 
4579  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
4580 
4581  Results.push_back(BuildVec);
4582  Results.push_back(LoadChain);
4583 }
4584 
4585 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
4586  SmallVectorImpl<SDValue> &Results) {
4587  SDValue Chain = N->getOperand(0);
4588  SDValue Intrin = N->getOperand(1);
4589  SDLoc DL(N);
4590 
4591  // Get the intrinsic ID
4592  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
4593  switch (IntrinNo) {
4594  default:
4595  return;
4596  case Intrinsic::nvvm_ldg_global_i:
4597  case Intrinsic::nvvm_ldg_global_f:
4598  case Intrinsic::nvvm_ldg_global_p:
4599  case Intrinsic::nvvm_ldu_global_i:
4600  case Intrinsic::nvvm_ldu_global_f:
4601  case Intrinsic::nvvm_ldu_global_p: {
4602  EVT ResVT = N->getValueType(0);
4603 
4604  if (ResVT.isVector()) {
4605  // Vector LDG/LDU
4606 
4607  unsigned NumElts = ResVT.getVectorNumElements();
4608  EVT EltVT = ResVT.getVectorElementType();
4609 
4610  // Since LDU/LDG are target nodes, we cannot rely on DAG type
4611  // legalization.
4612  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
4613  // loaded type to i16 and propagate the "real" type as the memory type.
4614  bool NeedTrunc = false;
4615  if (EltVT.getSizeInBits() < 16) {
4616  EltVT = MVT::i16;
4617  NeedTrunc = true;
4618  }
4619 
4620  unsigned Opcode = 0;
4621  SDVTList LdResVTs;
4622 
4623  switch (NumElts) {
4624  default:
4625  return;
4626  case 2:
4627  switch (IntrinNo) {
4628  default:
4629  return;
4630  case Intrinsic::nvvm_ldg_global_i:
4631  case Intrinsic::nvvm_ldg_global_f:
4632  case Intrinsic::nvvm_ldg_global_p:
4633  Opcode = NVPTXISD::LDGV2;
4634  break;
4635  case Intrinsic::nvvm_ldu_global_i:
4636  case Intrinsic::nvvm_ldu_global_f:
4637  case Intrinsic::nvvm_ldu_global_p:
4638  Opcode = NVPTXISD::LDUV2;
4639  break;
4640  }
4641  LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4642  break;
4643  case 4: {
4644  switch (IntrinNo) {
4645  default:
4646  return;
4647  case Intrinsic::nvvm_ldg_global_i:
4648  case Intrinsic::nvvm_ldg_global_f:
4649  case Intrinsic::nvvm_ldg_global_p:
4650  Opcode = NVPTXISD::LDGV4;
4651  break;
4652  case Intrinsic::nvvm_ldu_global_i:
4653  case Intrinsic::nvvm_ldu_global_f:
4654  case Intrinsic::nvvm_ldu_global_p:
4655  Opcode = NVPTXISD::LDUV4;
4656  break;
4657  }
4658  EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4659  LdResVTs = DAG.getVTList(ListVTs);
4660  break;
4661  }
4662  }
4663