1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that NVPTX uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "NVPTXISelLowering.h"
17 #include "NVPTX.h"
18 #include "NVPTXSection.h"
19 #include "NVPTXSubtarget.h"
20 #include "NVPTXTargetMachine.h"
21 #include "NVPTXTargetObjectFile.h"
22 #include "NVPTXUtilities.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/StringRef.h"
26 #include "llvm/CodeGen/Analysis.h"
35 #include "llvm/IR/Argument.h"
36 #include "llvm/IR/Attributes.h"
37 #include "llvm/IR/CallSite.h"
38 #include "llvm/IR/Constants.h"
39 #include "llvm/IR/DataLayout.h"
40 #include "llvm/IR/DerivedTypes.h"
41 #include "llvm/IR/Function.h"
42 #include "llvm/IR/GlobalValue.h"
43 #include "llvm/IR/Instruction.h"
44 #include "llvm/IR/Instructions.h"
45 #include "llvm/IR/Module.h"
46 #include "llvm/IR/Type.h"
47 #include "llvm/IR/Value.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CodeGen.h"
56 #include <algorithm>
57 #include <cassert>
58 #include <cstdint>
59 #include <iterator>
60 #include <sstream>
61 #include <string>
62 #include <utility>
63 #include <vector>
64 
65 #define DEBUG_TYPE "nvptx-lower"
66 
67 using namespace llvm;
68 
69 static unsigned int uniqueCallSite = 0;
70 
71 static cl::opt<bool> sched4reg(
72     "nvptx-sched4reg",
73     cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
74 
75 static cl::opt<unsigned>
76 FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
77                     cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
78                              " 1: do it, 2: do it aggressively)"),
79                     cl::init(2));
80 
81 static cl::opt<int> UsePrecDivF32(
82     "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
83     cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
84              " IEEE Compliant F32 div.rnd if available."),
85     cl::init(2));
86 
87 static cl::opt<bool> UsePrecSqrtF32(
88     "nvptx-prec-sqrtf32", cl::Hidden,
89  cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
90  cl::init(true));
91 
92 static cl::opt<bool> FtzEnabled(
93     "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
94  cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
95  cl::init(false));
96 
97 int NVPTXTargetLowering::getDivF32Level() const {
98   if (UsePrecDivF32.getNumOccurrences() > 0) {
99     // If nvptx-prec-divf32=N is used on the command-line, always honor it
100     return UsePrecDivF32;
101   } else {
102     // Otherwise, use div.approx if fast math is enabled
103     if (getTargetMachine().Options.UnsafeFPMath)
104       return 0;
105     else
106       return 2;
107   }
108 }
109 
110 bool NVPTXTargetLowering::usePrecSqrtF32() const {
111   if (UsePrecSqrtF32.getNumOccurrences() > 0) {
112     // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
113     return UsePrecSqrtF32;
114   } else {
115     // Otherwise, use sqrt.approx if fast math is enabled
116     return !getTargetMachine().Options.UnsafeFPMath;
117   }
118 }
119 
120 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
121   // TODO: Get rid of this flag; there can be only one way to do this.
122  if (FtzEnabled.getNumOccurrences() > 0) {
123  // If nvptx-f32ftz is used on the command-line, always honor it
124  return FtzEnabled;
125  } else {
126  const Function &F = MF.getFunction();
127  // Otherwise, check for an nvptx-f32ftz attribute on the function
128  if (F.hasFnAttribute("nvptx-f32ftz"))
129  return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
130  else
131  return false;
132  }
133 }
134 
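// For illustration (not part of the original file): the "nvptx-f32ftz" string
// checked in useF32FTZ() above is an ordinary IR string attribute on the
// function, e.g.
//
//   define float @foo(float %x) #0 { ... }
//   attributes #0 = { "nvptx-f32ftz"="true" }
//
// Front ends are expected to set it when f32 flush-to-zero is requested.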
135 static bool IsPTXVectorType(MVT VT) {
136  switch (VT.SimpleTy) {
137  default:
138  return false;
139  case MVT::v2i1:
140  case MVT::v4i1:
141  case MVT::v2i8:
142  case MVT::v4i8:
143  case MVT::v2i16:
144  case MVT::v4i16:
145  case MVT::v2i32:
146  case MVT::v4i32:
147  case MVT::v2i64:
148  case MVT::v2f16:
149  case MVT::v4f16:
150  case MVT::v8f16: // <4 x f16x2>
151  case MVT::v2f32:
152  case MVT::v4f32:
153  case MVT::v2f64:
154  return true;
155  }
156 }
157 
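// For illustration (not part of the original file): these are the vector types
// for which the constructor below registers custom load/store lowering; most
// of them can be turned into PTX .v2/.v4 vector accesses, e.g. a v4f32 load
// becomes a single ld.v4.f32 rather than four scalar loads.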
158 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
159 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
160 /// into their primitive components.
161 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
162 /// same number of types as the Ins/Outs arrays in LowerFormalArguments,
163 /// LowerCall, and LowerReturn.
164 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
165  Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
166                                SmallVectorImpl<uint64_t> *Offsets = nullptr,
167                                uint64_t StartingOffset = 0) {
168  SmallVector<EVT, 16> TempVTs;
169  SmallVector<uint64_t, 16> TempOffsets;
170 
171  // Special case for i128 - decompose to (i64, i64)
172  if (Ty->isIntegerTy(128)) {
173  ValueVTs.push_back(EVT(MVT::i64));
174  ValueVTs.push_back(EVT(MVT::i64));
175 
176  if (Offsets) {
177  Offsets->push_back(StartingOffset + 0);
178  Offsets->push_back(StartingOffset + 8);
179  }
180 
181  return;
182  }
183 
184  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
185  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
186  EVT VT = TempVTs[i];
187  uint64_t Off = TempOffsets[i];
188  // Split vectors into individual elements, except for v2f16, which
189  // we will pass as a single scalar.
190  if (VT.isVector()) {
191  unsigned NumElts = VT.getVectorNumElements();
192  EVT EltVT = VT.getVectorElementType();
193  // Vectors with an even number of f16 elements will be passed to
194  // us as an array of v2f16 elements. We must match this so we
195  // stay in sync with Ins/Outs.
196  if (EltVT == MVT::f16 && NumElts % 2 == 0) {
197  EltVT = MVT::v2f16;
198  NumElts /= 2;
199  }
200  for (unsigned j = 0; j != NumElts; ++j) {
201  ValueVTs.push_back(EltVT);
202  if (Offsets)
203  Offsets->push_back(Off + j * EltVT.getStoreSize());
204  }
205  } else {
206  ValueVTs.push_back(VT);
207  if (Offsets)
208  Offsets->push_back(Off);
209  }
210  }
211 }
212 
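// Worked example (illustrative, not part of the original file):
//   Ty = <4 x float>  =>  ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12}
//   Ty = <4 x half>   =>  ValueVTs = {v2f16, v2f16},       Offsets = {0, 4}
//   Ty = i128         =>  ValueVTs = {i64, i64},           Offsets = {0, 8}
// which matches how LowerFormalArguments/LowerCall/LowerReturn flatten the
// corresponding Ins/Outs entries.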
213 // Check whether we can merge loads/stores of some of the pieces of a
214 // flattened function parameter or return value into a single vector
215 // load/store.
216 //
217 // The flattened parameter is represented as a list of EVTs and
218 // offsets, and the whole structure is aligned to ParamAlignment. This
219 // function determines whether we can load/store pieces of the
220 // parameter starting at index Idx using a single vectorized op of
221 // size AccessSize. If so, it returns the number of param pieces
222 // covered by the vector op. Otherwise, it returns 1.
223 static unsigned CanMergeParamLoadStoresStartingAt(
224     unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
225  const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
226  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
227 
228  // Can't vectorize if param alignment is not sufficient.
229  if (AccessSize > ParamAlignment)
230  return 1;
231  // Can't vectorize if offset is not aligned.
232  if (Offsets[Idx] & (AccessSize - 1))
233  return 1;
234 
235  EVT EltVT = ValueVTs[Idx];
236  unsigned EltSize = EltVT.getStoreSize();
237 
238  // Element is too large to vectorize.
239  if (EltSize >= AccessSize)
240  return 1;
241 
242  unsigned NumElts = AccessSize / EltSize;
243  // Can't vectorize if AccessBytes if not a multiple of EltSize.
244  if (AccessSize != EltSize * NumElts)
245  return 1;
246 
247  // We don't have enough elements to vectorize.
248  if (Idx + NumElts > ValueVTs.size())
249  return 1;
250 
251  // PTX ISA can only deal with 2- and 4-element vector ops.
252  if (NumElts != 4 && NumElts != 2)
253  return 1;
254 
255  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
256  // Types do not match.
257  if (ValueVTs[j] != EltVT)
258  return 1;
259 
260  // Elements are not contiguous.
261  if (Offsets[j] - Offsets[j - 1] != EltSize)
262  return 1;
263  }
264   // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
265  return NumElts;
266 }
267 
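// Worked example (illustrative, not part of the original file): with
//   ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12}
// a query at Idx = 0 with AccessSize = 16 and ParamAlignment = 16 returns 4,
// i.e. one 128-bit access covers all four pieces. With ParamAlignment = 8 the
// same query returns 1 for AccessSize = 16, but 2 for AccessSize = 8.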
268 // Flags for tracking per-element vectorization state of loads/stores
269 // of a flattened function parameter or return value.
270 enum ParamVectorizationFlags {
271   PVF_INNER = 0x0, // Middle elements of a vector.
272   PVF_FIRST = 0x1, // First element of the vector.
273   PVF_LAST = 0x2,  // Last element of the vector.
274   // Scalar is effectively a 1-element vector.
275   PVF_SCALAR = PVF_FIRST | PVF_LAST
276 };
277 
278 // Computes whether and how we can vectorize the loads/stores of a
279 // flattened function parameter or return value.
280 //
281 // The flattened parameter is represented as the list of ValueVTs and
282 // Offsets, and is aligned to ParamAlignment bytes. We return a vector
283 // of the same size as ValueVTs indicating how each piece should be
284 // loaded/stored (i.e. as a scalar, or as part of a vector
285 // load/store).
286 static SmallVector<ParamVectorizationFlags, 16>
287 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
288                      const SmallVectorImpl<uint64_t> &Offsets,
289                      unsigned ParamAlignment) {
290   // Set vector size to match ValueVTs and mark all elements as
291   // scalars by default.
292   SmallVector<ParamVectorizationFlags, 16> VectorInfo;
293   VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
294 
295  // Check what we can vectorize using 128/64/32-bit accesses.
296  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
297  // Skip elements we've already processed.
298  assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
299  for (unsigned AccessSize : {16, 8, 4, 2}) {
300  unsigned NumElts = CanMergeParamLoadStoresStartingAt(
301  I, AccessSize, ValueVTs, Offsets, ParamAlignment);
302  // Mark vectorized elements.
303  switch (NumElts) {
304  default:
305  llvm_unreachable("Unexpected return value");
306  case 1:
307  // Can't vectorize using this size, try next smaller size.
308  continue;
309  case 2:
310  assert(I + 1 < E && "Not enough elements.");
311  VectorInfo[I] = PVF_FIRST;
312  VectorInfo[I + 1] = PVF_LAST;
313  I += 1;
314  break;
315  case 4:
316  assert(I + 3 < E && "Not enough elements.");
317  VectorInfo[I] = PVF_FIRST;
318  VectorInfo[I + 1] = PVF_INNER;
319  VectorInfo[I + 2] = PVF_INNER;
320  VectorInfo[I + 3] = PVF_LAST;
321  I += 3;
322  break;
323  }
324  // Break out of the inner loop because we've already succeeded
325  // using largest possible AccessSize.
326  break;
327  }
328  }
329  return VectorInfo;
330 }
331 
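// Worked example (illustrative, not part of the original file): for
//   ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12}, ParamAlignment = 16
// the result is {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}, i.e. a single v4
// load/store; with ParamAlignment = 4 every element stays PVF_SCALAR.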
332 // NVPTXTargetLowering Constructor.
333 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
334                                          const NVPTXSubtarget &STI)
335  : TargetLowering(TM), nvTM(&TM), STI(STI) {
336   // Always lower memset, memcpy, and memmove intrinsics to load/store
337   // instructions, rather than generating calls to memset, memcpy, or
338   // memmove.
339  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
340  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
341  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
342 
343   setBooleanContents(ZeroOrNegativeOneBooleanContent);
344   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
345 
346  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
347  // condition branches.
348  setJumpIsExpensive(true);
349 
350  // Wide divides are _very_ slow. Try to reduce the width of the divide if
351  // possible.
352  addBypassSlowDiv(64, 32);
353 
354  // By default, use the Source scheduling
355   if (sched4reg)
356     setSchedulingPreference(Sched::RegPressure);
357   else
358     setSchedulingPreference(Sched::Source);
359 
360  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
361  LegalizeAction NoF16Action) {
362  setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
363  };
364 
365  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
366  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
367  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
368  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
369  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
370  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
371  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
372  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
373 
374  // Conversion to/from FP16/FP16x2 is always legal.
379 
380  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
381  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
382 
383  // Operations not directly supported by NVPTX.
402  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
403  // For others we will expand to a SHL/SRA pair.
409 
416 
419 
420  if (STI.hasROT64()) {
423  } else {
426  }
427  if (STI.hasROT32()) {
430  } else {
433  }
434 
442 
443  // Indirect branch is not supported.
444  // This also disables Jump Table creation.
447 
450 
451  // We want to legalize constant related memmove and memcopy
452  // intrinsics.
454 
455  // Turn FP extload into load/fpextend
465  // Turn FP truncstore into trunc + store.
466  // FIXME: vector types should also be expanded
470 
471  // PTX does not support load / store predicate registers
474 
475  for (MVT VT : MVT::integer_valuetypes()) {
479  }
480 
481  // This is legal in NVPTX
485 
486  // TRAP can be lowered to PTX trap
488 
491 
492  // Register custom handling for vector loads/stores
493  for (MVT VT : MVT::vector_valuetypes()) {
494  if (IsPTXVectorType(VT)) {
498  }
499  }
500 
501  // Custom handling for i8 intrinsics
503 
504  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
510 
513  }
514 
518 
519  // PTX does not directly support SELP of i1, so promote to i32 first
521 
522  // PTX cannot multiply two i64s in a single instruction.
525 
526  // We have some custom DAG combine patterns for these nodes
534 
535  // setcc for f16x2 needs special handling to prevent legalizer's
536  // attempt to scalarize it due to v2i1 not being legal.
537   if (STI.allowFP16Math())
538     setTargetDAGCombine(ISD::SETCC);
539 
540  // Promote fp16 arithmetic if fp16 hardware isn't available or the
541  // user passed --nvptx-no-fp16-math. The flag is useful because,
542  // although sm_53+ GPUs have some sort of FP16 support in
543  // hardware, only sm_53 and sm_60 have full implementation. Others
544  // only have token amount of hardware and are likely to run faster
545  // by using fp32 units instead.
546  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
547  setFP16OperationAction(Op, MVT::f16, Legal, Promote);
548  setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
549  }
550 
551   // There's no neg.f16 instruction. Expand to (0-x).
552   setOperationAction(ISD::FNEG, MVT::f16, Expand);
553   setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
554 
555  // (would be) Library functions.
556 
557  // These map to conversion instructions for scalar FP types.
558  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
564  }
565 
566  // 'Expand' implements FCOPYSIGN without calling an external library.
571 
572  // These map to corresponding instructions for f32/f64. f16 must be
573  // promoted to f32. v2f16 is expanded to f16, which is then promoted
574  // to f32.
575  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
581  }
586 
587  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
588  // No FPOW or FREM in PTX.
589 
590  // Now deduce the information based on the above mentioned
591  // actions
593 }
594 
595 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
596   switch ((NVPTXISD::NodeType)Opcode) {
597   case NVPTXISD::FIRST_NUMBER:
598     break;
599  case NVPTXISD::CALL:
600  return "NVPTXISD::CALL";
601  case NVPTXISD::RET_FLAG:
602  return "NVPTXISD::RET_FLAG";
603   case NVPTXISD::LOAD_PARAM:
604     return "NVPTXISD::LOAD_PARAM";
605   case NVPTXISD::Wrapper:
606     return "NVPTXISD::Wrapper";
607   case NVPTXISD::DeclareParam:
608     return "NVPTXISD::DeclareParam";
609   case NVPTXISD::DeclareScalarParam:
610     return "NVPTXISD::DeclareScalarParam";
611   case NVPTXISD::DeclareRet:
612     return "NVPTXISD::DeclareRet";
613   case NVPTXISD::DeclareScalarRet:
614     return "NVPTXISD::DeclareScalarRet";
615   case NVPTXISD::DeclareRetParam:
616     return "NVPTXISD::DeclareRetParam";
617   case NVPTXISD::PrintCall:
618     return "NVPTXISD::PrintCall";
619   case NVPTXISD::PrintConvergentCall:
620     return "NVPTXISD::PrintConvergentCall";
621   case NVPTXISD::PrintCallUni:
622     return "NVPTXISD::PrintCallUni";
623   case NVPTXISD::PrintConvergentCallUni:
624     return "NVPTXISD::PrintConvergentCallUni";
625   case NVPTXISD::LoadParam:
626     return "NVPTXISD::LoadParam";
627   case NVPTXISD::LoadParamV2:
628     return "NVPTXISD::LoadParamV2";
629   case NVPTXISD::LoadParamV4:
630     return "NVPTXISD::LoadParamV4";
631   case NVPTXISD::StoreParam:
632     return "NVPTXISD::StoreParam";
633   case NVPTXISD::StoreParamV2:
634     return "NVPTXISD::StoreParamV2";
635   case NVPTXISD::StoreParamV4:
636     return "NVPTXISD::StoreParamV4";
637   case NVPTXISD::StoreParamS32:
638     return "NVPTXISD::StoreParamS32";
639   case NVPTXISD::StoreParamU32:
640     return "NVPTXISD::StoreParamU32";
641   case NVPTXISD::CallArgBegin:
642     return "NVPTXISD::CallArgBegin";
643   case NVPTXISD::CallArg:
644     return "NVPTXISD::CallArg";
645   case NVPTXISD::LastCallArg:
646     return "NVPTXISD::LastCallArg";
647   case NVPTXISD::CallArgEnd:
648     return "NVPTXISD::CallArgEnd";
649   case NVPTXISD::CallVoid:
650     return "NVPTXISD::CallVoid";
651   case NVPTXISD::CallVal:
652     return "NVPTXISD::CallVal";
653   case NVPTXISD::CallSymbol:
654     return "NVPTXISD::CallSymbol";
655   case NVPTXISD::Prototype:
656     return "NVPTXISD::Prototype";
657   case NVPTXISD::MoveParam:
658     return "NVPTXISD::MoveParam";
659   case NVPTXISD::StoreRetval:
660     return "NVPTXISD::StoreRetval";
661   case NVPTXISD::StoreRetvalV2:
662     return "NVPTXISD::StoreRetvalV2";
663   case NVPTXISD::StoreRetvalV4:
664     return "NVPTXISD::StoreRetvalV4";
665   case NVPTXISD::PseudoUseParam:
666     return "NVPTXISD::PseudoUseParam";
667   case NVPTXISD::RETURN:
668     return "NVPTXISD::RETURN";
669   case NVPTXISD::CallSeqBegin:
670     return "NVPTXISD::CallSeqBegin";
671   case NVPTXISD::CallSeqEnd:
672     return "NVPTXISD::CallSeqEnd";
673   case NVPTXISD::CallPrototype:
674     return "NVPTXISD::CallPrototype";
675  case NVPTXISD::LoadV2:
676  return "NVPTXISD::LoadV2";
677  case NVPTXISD::LoadV4:
678  return "NVPTXISD::LoadV4";
679  case NVPTXISD::LDGV2:
680  return "NVPTXISD::LDGV2";
681  case NVPTXISD::LDGV4:
682  return "NVPTXISD::LDGV4";
683  case NVPTXISD::LDUV2:
684  return "NVPTXISD::LDUV2";
685  case NVPTXISD::LDUV4:
686  return "NVPTXISD::LDUV4";
687  case NVPTXISD::StoreV2:
688  return "NVPTXISD::StoreV2";
689  case NVPTXISD::StoreV4:
690  return "NVPTXISD::StoreV4";
691   case NVPTXISD::FUN_SHFL_CLAMP:
692     return "NVPTXISD::FUN_SHFL_CLAMP";
693   case NVPTXISD::FUN_SHFR_CLAMP:
694     return "NVPTXISD::FUN_SHFR_CLAMP";
695   case NVPTXISD::IMAD:
696     return "NVPTXISD::IMAD";
697   case NVPTXISD::SETP_F16X2:
698     return "NVPTXISD::SETP_F16X2";
699   case NVPTXISD::Dummy:
700     return "NVPTXISD::Dummy";
701   case NVPTXISD::MUL_WIDE_SIGNED:
702     return "NVPTXISD::MUL_WIDE_SIGNED";
703   case NVPTXISD::MUL_WIDE_UNSIGNED:
704     return "NVPTXISD::MUL_WIDE_UNSIGNED";
705  case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
706  case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
708  return "NVPTXISD::Tex1DFloatFloatLevel";
710  return "NVPTXISD::Tex1DFloatFloatGrad";
711  case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
712  case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
714  return "NVPTXISD::Tex1DS32FloatLevel";
716  return "NVPTXISD::Tex1DS32FloatGrad";
717  case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
718  case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
720  return "NVPTXISD::Tex1DU32FloatLevel";
722  return "NVPTXISD::Tex1DU32FloatGrad";
723  case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
724  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
726  return "NVPTXISD::Tex1DArrayFloatFloatLevel";
728  return "NVPTXISD::Tex1DArrayFloatFloatGrad";
729  case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
730  case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
732  return "NVPTXISD::Tex1DArrayS32FloatLevel";
734  return "NVPTXISD::Tex1DArrayS32FloatGrad";
735  case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
736  case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
738  return "NVPTXISD::Tex1DArrayU32FloatLevel";
740  return "NVPTXISD::Tex1DArrayU32FloatGrad";
741  case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
742  case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
744  return "NVPTXISD::Tex2DFloatFloatLevel";
746  return "NVPTXISD::Tex2DFloatFloatGrad";
747  case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
748  case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
750  return "NVPTXISD::Tex2DS32FloatLevel";
752  return "NVPTXISD::Tex2DS32FloatGrad";
753  case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
754  case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
756  return "NVPTXISD::Tex2DU32FloatLevel";
758  return "NVPTXISD::Tex2DU32FloatGrad";
759  case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
760  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
762  return "NVPTXISD::Tex2DArrayFloatFloatLevel";
764  return "NVPTXISD::Tex2DArrayFloatFloatGrad";
765  case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
766  case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
768  return "NVPTXISD::Tex2DArrayS32FloatLevel";
770  return "NVPTXISD::Tex2DArrayS32FloatGrad";
771  case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
772  case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
774  return "NVPTXISD::Tex2DArrayU32FloatLevel";
776  return "NVPTXISD::Tex2DArrayU32FloatGrad";
777  case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
778  case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
780  return "NVPTXISD::Tex3DFloatFloatLevel";
782  return "NVPTXISD::Tex3DFloatFloatGrad";
783  case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
784  case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
786  return "NVPTXISD::Tex3DS32FloatLevel";
788  return "NVPTXISD::Tex3DS32FloatGrad";
789  case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
790  case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
792  return "NVPTXISD::Tex3DU32FloatLevel";
794  return "NVPTXISD::Tex3DU32FloatGrad";
795  case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
797  return "NVPTXISD::TexCubeFloatFloatLevel";
798  case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
800  return "NVPTXISD::TexCubeS32FloatLevel";
801  case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
803  return "NVPTXISD::TexCubeU32FloatLevel";
805  return "NVPTXISD::TexCubeArrayFloatFloat";
807  return "NVPTXISD::TexCubeArrayFloatFloatLevel";
809  return "NVPTXISD::TexCubeArrayS32Float";
811  return "NVPTXISD::TexCubeArrayS32FloatLevel";
813  return "NVPTXISD::TexCubeArrayU32Float";
815  return "NVPTXISD::TexCubeArrayU32FloatLevel";
817  return "NVPTXISD::Tld4R2DFloatFloat";
819  return "NVPTXISD::Tld4G2DFloatFloat";
821  return "NVPTXISD::Tld4B2DFloatFloat";
823  return "NVPTXISD::Tld4A2DFloatFloat";
825  return "NVPTXISD::Tld4R2DS64Float";
827  return "NVPTXISD::Tld4G2DS64Float";
829  return "NVPTXISD::Tld4B2DS64Float";
831  return "NVPTXISD::Tld4A2DS64Float";
833  return "NVPTXISD::Tld4R2DU64Float";
835  return "NVPTXISD::Tld4G2DU64Float";
837  return "NVPTXISD::Tld4B2DU64Float";
839  return "NVPTXISD::Tld4A2DU64Float";
840 
842  return "NVPTXISD::TexUnified1DFloatS32";
844  return "NVPTXISD::TexUnified1DFloatFloat";
846  return "NVPTXISD::TexUnified1DFloatFloatLevel";
848  return "NVPTXISD::TexUnified1DFloatFloatGrad";
850  return "NVPTXISD::TexUnified1DS32S32";
852  return "NVPTXISD::TexUnified1DS32Float";
854  return "NVPTXISD::TexUnified1DS32FloatLevel";
856  return "NVPTXISD::TexUnified1DS32FloatGrad";
858  return "NVPTXISD::TexUnified1DU32S32";
860  return "NVPTXISD::TexUnified1DU32Float";
862  return "NVPTXISD::TexUnified1DU32FloatLevel";
864  return "NVPTXISD::TexUnified1DU32FloatGrad";
866  return "NVPTXISD::TexUnified1DArrayFloatS32";
868  return "NVPTXISD::TexUnified1DArrayFloatFloat";
870  return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
872  return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
874  return "NVPTXISD::TexUnified1DArrayS32S32";
876  return "NVPTXISD::TexUnified1DArrayS32Float";
878  return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
880  return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
882  return "NVPTXISD::TexUnified1DArrayU32S32";
884  return "NVPTXISD::TexUnified1DArrayU32Float";
886  return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
888  return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
890  return "NVPTXISD::TexUnified2DFloatS32";
892  return "NVPTXISD::TexUnified2DFloatFloat";
894  return "NVPTXISD::TexUnified2DFloatFloatLevel";
896  return "NVPTXISD::TexUnified2DFloatFloatGrad";
898  return "NVPTXISD::TexUnified2DS32S32";
900  return "NVPTXISD::TexUnified2DS32Float";
902  return "NVPTXISD::TexUnified2DS32FloatLevel";
904  return "NVPTXISD::TexUnified2DS32FloatGrad";
906  return "NVPTXISD::TexUnified2DU32S32";
908  return "NVPTXISD::TexUnified2DU32Float";
910  return "NVPTXISD::TexUnified2DU32FloatLevel";
912  return "NVPTXISD::TexUnified2DU32FloatGrad";
914  return "NVPTXISD::TexUnified2DArrayFloatS32";
916  return "NVPTXISD::TexUnified2DArrayFloatFloat";
918  return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
920  return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
922  return "NVPTXISD::TexUnified2DArrayS32S32";
924  return "NVPTXISD::TexUnified2DArrayS32Float";
926  return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
928  return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
930  return "NVPTXISD::TexUnified2DArrayU32S32";
932  return "NVPTXISD::TexUnified2DArrayU32Float";
934  return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
936  return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
938  return "NVPTXISD::TexUnified3DFloatS32";
940  return "NVPTXISD::TexUnified3DFloatFloat";
942  return "NVPTXISD::TexUnified3DFloatFloatLevel";
944  return "NVPTXISD::TexUnified3DFloatFloatGrad";
946  return "NVPTXISD::TexUnified3DS32S32";
948  return "NVPTXISD::TexUnified3DS32Float";
950  return "NVPTXISD::TexUnified3DS32FloatLevel";
952  return "NVPTXISD::TexUnified3DS32FloatGrad";
954  return "NVPTXISD::TexUnified3DU32S32";
956  return "NVPTXISD::TexUnified3DU32Float";
958  return "NVPTXISD::TexUnified3DU32FloatLevel";
960  return "NVPTXISD::TexUnified3DU32FloatGrad";
962  return "NVPTXISD::TexUnifiedCubeFloatFloat";
964  return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
966  return "NVPTXISD::TexUnifiedCubeS32Float";
968  return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
970  return "NVPTXISD::TexUnifiedCubeU32Float";
972  return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
974  return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
976  return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
978  return "NVPTXISD::TexUnifiedCubeArrayS32Float";
980  return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
982  return "NVPTXISD::TexUnifiedCubeArrayU32Float";
984  return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
986  return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
988  return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
990  return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
992  return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
994  return "NVPTXISD::Tld4UnifiedR2DS64Float";
996  return "NVPTXISD::Tld4UnifiedG2DS64Float";
998  return "NVPTXISD::Tld4UnifiedB2DS64Float";
1000  return "NVPTXISD::Tld4UnifiedA2DS64Float";
1002  return "NVPTXISD::Tld4UnifiedR2DU64Float";
1004  return "NVPTXISD::Tld4UnifiedG2DU64Float";
1006  return "NVPTXISD::Tld4UnifiedB2DU64Float";
1008  return "NVPTXISD::Tld4UnifiedA2DU64Float";
1009 
1010  case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
1011  case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
1012  case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
1013  case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
1014  case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
1015  case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
1016  case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
1017  case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
1018  case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
1019  case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
1020  case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
1021 
1022  case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
1023  case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
1024  case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
1025  case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
1026  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1027  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1028  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1029  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1030  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1031  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1032  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1033 
1034  case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
1035  case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
1036  case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
1037  case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
1038  case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
1039  case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
1040  case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
1041  case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
1042  case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
1043  case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
1044  case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
1045 
1046  case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
1047  case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
1048  case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
1049  case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
1050  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1051  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1052  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1053  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1054  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1055  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1056  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1057 
1058  case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
1059  case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
1060  case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
1061  case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
1062  case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
1063  case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
1064  case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
1065  case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
1066  case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
1067  case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
1068  case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
1069 
1070  case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
1071  case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
1072  case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
1073  case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
1074  case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
1075  case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
1076  case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
1077  case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
1078  case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
1079  case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
1080  case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
1081 
1082  case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
1083  case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
1084  case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
1085  case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
1086  case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
1087  case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
1088  case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
1089  case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
1090  case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
1091  case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
1092  case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
1093 
1094  case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
1095  case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
1096  case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
1097  case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
1098  case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
1099  case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
1100  case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
1101  case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
1102  case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
1103  case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
1104  case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
1105 
1106  case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
1107  case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
1108  case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
1109  case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
1110  case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
1111  case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
1112  case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
1113  case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
1114  case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
1115  case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
1116  case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
1117 
1118  case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
1119  case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
1120  case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
1121  case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
1122  case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
1123  case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
1124  case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
1125  case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
1126  case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
1127  case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
1128  case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
1129 
1130  case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
1131  case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
1132  case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
1133  case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
1134  case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
1135  case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
1136  case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
1137  case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
1138  case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
1139  case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
1140  case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
1141 
1142  case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
1143  case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
1144  case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
1145  case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
1146  case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
1147  case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
1148  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
1149  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
1150  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
1151  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
1152  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
1153 
1154  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
1155  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
1156  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
1157  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
1158  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
1159  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
1160  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
1161  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
1162  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
1163  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
1164  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
1165 
1166  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
1167  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
1168  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
1169  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
1170  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
1171  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
1172  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
1173  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
1174  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
1175  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
1176  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
1177 
1178  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
1179  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
1180  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
1181  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
1182  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
1183  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
1184  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
1185  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
1186  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
1187  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
1188  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
1189  }
1190  return nullptr;
1191 }
1192 
1193 TargetLoweringBase::LegalizeTypeAction
1194 NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
1195   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
1196     return TypeSplitVector;
1197   if (VT == MVT::v2f16)
1198     return TypeLegal;
1199   return TargetLoweringBase::getPreferredVectorAction(VT);
1200 }
1201 
1202 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1203                                              int Enabled, int &ExtraSteps,
1204  bool &UseOneConst,
1205  bool Reciprocal) const {
1206  if (!(Enabled == ReciprocalEstimate::Enabled ||
1207  (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1208  return SDValue();
1209 
1210  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1211  ExtraSteps = 0;
1212 
1213  SDLoc DL(Operand);
1214  EVT VT = Operand.getValueType();
1215  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1216 
1217  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1218  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1219  DAG.getConstant(IID, DL, MVT::i32), Operand);
1220  };
1221 
1222  // The sqrt and rsqrt refinement processes assume we always start out with an
1223  // approximation of the rsqrt. Therefore, if we're going to do any refinement
1224  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1225  // any refinement, we must return a regular sqrt.
1226  if (Reciprocal || ExtraSteps > 0) {
1227  if (VT == MVT::f32)
1228  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1229  : Intrinsic::nvvm_rsqrt_approx_f);
1230  else if (VT == MVT::f64)
1231  return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1232  else
1233  return SDValue();
1234  } else {
1235  if (VT == MVT::f32)
1236  return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1237  : Intrinsic::nvvm_sqrt_approx_f);
1238  else {
1239  // There's no sqrt.approx.f64 instruction, so we emit
1240  // reciprocal(rsqrt(x)). This is faster than
1241  // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1242  // x * rsqrt(x).)
1243  return DAG.getNode(
1244  ISD::INTRINSIC_WO_CHAIN, DL, VT,
1245  DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1246  MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1247  }
1248  }
1249 }
1250 
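// Note (illustrative, not part of the original file): when ExtraSteps > 0 the
// generic DAG combiner refines the rsqrt estimate returned below with
// Newton-Raphson iterations, roughly r' = r * (1.5 - 0.5 * x * r * r), and
// reconstructs sqrt(x) as x * rsqrt(x). That is why an rsqrt (rather than a
// sqrt) approximation must be returned whenever refinement or a reciprocal
// result is requested.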
1251 SDValue
1252 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1253   SDLoc dl(Op);
1254  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
1255  auto PtrVT = getPointerTy(DAG.getDataLayout());
1256  Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
1257  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1258 }
1259 
1260 std::string NVPTXTargetLowering::getPrototype(
1261     const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1262  const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
1263  ImmutableCallSite CS) const {
1264  auto PtrVT = getPointerTy(DL);
1265 
1266  bool isABI = (STI.getSmVersion() >= 20);
1267  assert(isABI && "Non-ABI compilation is not supported");
1268  if (!isABI)
1269  return "";
1270 
1271  std::stringstream O;
1272  O << "prototype_" << uniqueCallSite << " : .callprototype ";
1273 
1274  if (retTy->getTypeID() == Type::VoidTyID) {
1275  O << "()";
1276  } else {
1277  O << "(";
1278  if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1279  unsigned size = 0;
1280  if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1281  size = ITy->getBitWidth();
1282  } else {
1283  assert(retTy->isFloatingPointTy() &&
1284  "Floating point type expected here");
1285  size = retTy->getPrimitiveSizeInBits();
1286  }
1287  // PTX ABI requires all scalar return values to be at least 32
1288  // bits in size. fp16 normally uses .b16 as its storage type in
1289  // PTX, so its size must be adjusted here, too.
1290  if (size < 32)
1291  size = 32;
1292 
1293  O << ".param .b" << size << " _";
1294  } else if (isa<PointerType>(retTy)) {
1295  O << ".param .b" << PtrVT.getSizeInBits() << " _";
1296  } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
1297  auto &DL = CS.getCalledFunction()->getParent()->getDataLayout();
1298  O << ".param .align " << retAlignment << " .b8 _["
1299  << DL.getTypeAllocSize(retTy) << "]";
1300  } else {
1301  llvm_unreachable("Unknown return type");
1302  }
1303  O << ") ";
1304  }
1305  O << "_ (";
1306 
1307  bool first = true;
1308 
1309  unsigned OIdx = 0;
1310  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1311  Type *Ty = Args[i].Ty;
1312  if (!first) {
1313  O << ", ";
1314  }
1315  first = false;
1316 
1317  if (!Outs[OIdx].Flags.isByVal()) {
1318  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1319  unsigned align = 0;
1320  const CallInst *CallI = cast<CallInst>(CS.getInstruction());
1321  // +1 because index 0 is reserved for return type alignment
1322  if (!getAlign(*CallI, i + 1, align))
1323  align = DL.getABITypeAlignment(Ty);
1324  unsigned sz = DL.getTypeAllocSize(Ty);
1325  O << ".param .align " << align << " .b8 ";
1326  O << "_";
1327  O << "[" << sz << "]";
1328  // update the index for Outs
1329  SmallVector<EVT, 16> vtparts;
1330  ComputeValueVTs(*this, DL, Ty, vtparts);
1331  if (unsigned len = vtparts.size())
1332  OIdx += len - 1;
1333  continue;
1334  }
1335  // i8 types in IR will be i16 types in SDAG
1336  assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1337  (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1338  "type mismatch between callee prototype and arguments");
1339  // scalar type
1340  unsigned sz = 0;
1341  if (isa<IntegerType>(Ty)) {
1342  sz = cast<IntegerType>(Ty)->getBitWidth();
1343  if (sz < 32)
1344  sz = 32;
1345  } else if (isa<PointerType>(Ty)) {
1346  sz = PtrVT.getSizeInBits();
1347  } else if (Ty->isHalfTy())
1348  // PTX ABI requires all scalar parameters to be at least 32
1349  // bits in size. fp16 normally uses .b16 as its storage type
1350  // in PTX, so its size must be adjusted here, too.
1351  sz = 32;
1352  else
1353  sz = Ty->getPrimitiveSizeInBits();
1354  O << ".param .b" << sz << " ";
1355  O << "_";
1356  continue;
1357  }
1358  auto *PTy = dyn_cast<PointerType>(Ty);
1359  assert(PTy && "Param with byval attribute should be a pointer type");
1360  Type *ETy = PTy->getElementType();
1361 
1362  unsigned align = Outs[OIdx].Flags.getByValAlign();
1363  unsigned sz = DL.getTypeAllocSize(ETy);
1364  O << ".param .align " << align << " .b8 ";
1365  O << "_";
1366  O << "[" << sz << "]";
1367  }
1368  O << ");";
1369  return O.str();
1370 }
1371 
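// Illustrative example (not part of the original file): for an indirect call
// with signature float(i32, float*) and 64-bit pointers, the string built
// above looks roughly like
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// where the numeric suffix in "prototype_0" comes from uniqueCallSite.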
1372 unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1373  ImmutableCallSite CS,
1374  Type *Ty, unsigned Idx,
1375  const DataLayout &DL) const {
1376  if (!CS) {
1377     // CallSite is zero, fall back to the ABI type alignment
1378  return DL.getABITypeAlignment(Ty);
1379  }
1380 
1381  unsigned Align = 0;
1382  const Value *DirectCallee = CS.getCalledFunction();
1383 
1384  if (!DirectCallee) {
1385  // We don't have a direct function symbol, but that may be because of
1386  // constant cast instructions in the call.
1387  const Instruction *CalleeI = CS.getInstruction();
1388  assert(CalleeI && "Call target is not a function or derived value?");
1389 
1390  // With bitcast'd call targets, the instruction will be the call
1391  if (isa<CallInst>(CalleeI)) {
1392  // Check if we have call alignment metadata
1393  if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
1394  return Align;
1395 
1396  const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
1397  // Ignore any bitcast instructions
1398  while (isa<ConstantExpr>(CalleeV)) {
1399  const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1400  if (!CE->isCast())
1401  break;
1402  // Look through the bitcast
1403  CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1404  }
1405 
1406  // We have now looked past all of the bitcasts. Do we finally have a
1407  // Function?
1408  if (isa<Function>(CalleeV))
1409  DirectCallee = CalleeV;
1410  }
1411  }
1412 
1413  // Check for function alignment information if we found that the
1414  // ultimate target is a Function
1415  if (DirectCallee)
1416  if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
1417  return Align;
1418 
1419  // Call is indirect or alignment information is not available, fall back to
1420  // the ABI type alignment
1421  return DL.getABITypeAlignment(Ty);
1422 }
1423 
1424 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1425                                        SmallVectorImpl<SDValue> &InVals) const {
1426  SelectionDAG &DAG = CLI.DAG;
1427   SDLoc dl = CLI.DL;
1428   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1429   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1430   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1431   SDValue Chain = CLI.Chain;
1432  SDValue Callee = CLI.Callee;
1433  bool &isTailCall = CLI.IsTailCall;
1434  ArgListTy &Args = CLI.getArgs();
1435  Type *RetTy = CLI.RetTy;
1436  ImmutableCallSite CS = CLI.CS;
1437  const DataLayout &DL = DAG.getDataLayout();
1438 
1439  bool isABI = (STI.getSmVersion() >= 20);
1440  assert(isABI && "Non-ABI compilation is not supported");
1441  if (!isABI)
1442  return Chain;
1443 
1444  SDValue tempChain = Chain;
1445  Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
1446  SDValue InFlag = Chain.getValue(1);
1447 
1448  unsigned paramCount = 0;
1449  // Args.size() and Outs.size() need not match.
1450  // Outs.size() will be larger
1451  // * if there is an aggregate argument with multiple fields (each field
1452  // showing up separately in Outs)
1453  // * if there is a vector argument with more than typical vector-length
1454  // elements (generally if more than 4) where each vector element is
1455  // individually present in Outs.
1456  // So a different index should be used for indexing into Outs/OutVals.
1457  // See similar issue in LowerFormalArguments.
1458  unsigned OIdx = 0;
1459   // Declare the .params or .reg needed to pass values
1460   // to the function
1461  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1462  EVT VT = Outs[OIdx].VT;
1463  Type *Ty = Args[i].Ty;
1464 
1465  if (!Outs[OIdx].Flags.isByVal()) {
1466       SmallVector<EVT, 16> VTs;
1467       SmallVector<uint64_t, 16> Offsets;
1468       ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
1469  unsigned ArgAlign =
1470  getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
1471  unsigned AllocSize = DL.getTypeAllocSize(Ty);
1472  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1473  bool NeedAlign; // Does argument declaration specify alignment?
1474  if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1475  // declare .param .align <align> .b8 .param<n>[<size>];
1476  SDValue DeclareParamOps[] = {
1477  Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1478  DAG.getConstant(paramCount, dl, MVT::i32),
1479  DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
1480  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1481  DeclareParamOps);
1482  NeedAlign = true;
1483  } else {
1484  // declare .param .b<size> .param<n>;
1485  if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
1486  // PTX ABI requires integral types to be at least 32 bits in
1487  // size. FP16 is loaded/stored using i16, so it's handled
1488  // here as well.
1489  AllocSize = 4;
1490  }
1491  SDValue DeclareScalarParamOps[] = {
1492  Chain, DAG.getConstant(paramCount, dl, MVT::i32),
1493  DAG.getConstant(AllocSize * 8, dl, MVT::i32),
1494  DAG.getConstant(0, dl, MVT::i32), InFlag};
1495  Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1496  DeclareScalarParamOps);
1497  NeedAlign = false;
1498  }
1499  InFlag = Chain.getValue(1);
1500 
1501  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1502  // than 32-bits are sign extended or zero extended, depending on
1503  // whether they are signed or unsigned types. This case applies
1504  // only to scalar parameters and not to aggregate values.
1505  bool ExtendIntegerParam =
1506  Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1507 
1508  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
1509  SmallVector<SDValue, 6> StoreOperands;
1510  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1511  // New store.
1512  if (VectorInfo[j] & PVF_FIRST) {
1513           assert(StoreOperands.empty() && "Unfinished preceding store.");
1514  StoreOperands.push_back(Chain);
1515  StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1516  StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
1517  }
1518 
1519  EVT EltVT = VTs[j];
1520  SDValue StVal = OutVals[OIdx];
1521  if (ExtendIntegerParam) {
1522  assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1523  // zext/sext to i32
1524  StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1525  : ISD::ZERO_EXTEND,
1526  dl, MVT::i32, StVal);
1527  } else if (EltVT.getSizeInBits() < 16) {
1528  // Use 16-bit registers for small stores as it's the
1529  // smallest general purpose register size supported by NVPTX.
1530  StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1531  }
1532 
1533  // Record the value to store.
1534  StoreOperands.push_back(StVal);
1535 
1536  if (VectorInfo[j] & PVF_LAST) {
1537  unsigned NumElts = StoreOperands.size() - 3;
1538           NVPTXISD::NodeType Op;
1539           switch (NumElts) {
1540           case 1:
1541             Op = NVPTXISD::StoreParam;
1542             break;
1543           case 2:
1544             Op = NVPTXISD::StoreParamV2;
1545             break;
1546           case 4:
1547             Op = NVPTXISD::StoreParamV4;
1548             break;
1549  default:
1550  llvm_unreachable("Invalid vector info.");
1551  }
1552 
1553  StoreOperands.push_back(InFlag);
1554 
1555  // Adjust type of the store op if we've extended the scalar
1556  // return value.
1557  EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
1558  unsigned EltAlign =
1559  NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
1560 
1561  Chain = DAG.getMemIntrinsicNode(
1562  Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1563  TheStoreType, MachinePointerInfo(), EltAlign,
1564             MachineMemOperand::MOStore);
1565         InFlag = Chain.getValue(1);
1566 
1567  // Cleanup.
1568  StoreOperands.clear();
1569  }
1570  ++OIdx;
1571  }
1572  assert(StoreOperands.empty() && "Unfinished parameter store.");
1573  if (VTs.size() > 0)
1574  --OIdx;
1575  ++paramCount;
1576  continue;
1577  }
1578 
1579  // ByVal arguments
1582  auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1583  assert(PTy && "Type of a byval parameter should be pointer");
1584  ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1585 
1586  // declare .param .align <align> .b8 .param<n>[<size>];
1587  unsigned sz = Outs[OIdx].Flags.getByValSize();
1588  SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1589  unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1590     // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1591  // so we don't need to worry about natural alignment or not.
1592  // See TargetLowering::LowerCallTo().
1593 
1594     // Enforce minimum alignment of 4 to work around ptxas miscompile
1595  // for sm_50+. See corresponding alignment adjustment in
1596  // emitFunctionParamList() for details.
1597  if (ArgAlign < 4)
1598  ArgAlign = 4;
1599  SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1600  DAG.getConstant(paramCount, dl, MVT::i32),
1601  DAG.getConstant(sz, dl, MVT::i32), InFlag};
1602  Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1603  DeclareParamOps);
1604  InFlag = Chain.getValue(1);
1605  for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1606  EVT elemtype = VTs[j];
1607  int curOffset = Offsets[j];
1608  unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1609  auto PtrVT = getPointerTy(DL);
1610  SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1611  DAG.getConstant(curOffset, dl, PtrVT));
1612  SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1613  MachinePointerInfo(), PartAlign);
1614  if (elemtype.getSizeInBits() < 16) {
1615  theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1616  }
1617  SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1618  SDValue CopyParamOps[] = { Chain,
1619  DAG.getConstant(paramCount, dl, MVT::i32),
1620  DAG.getConstant(curOffset, dl, MVT::i32),
1621  theVal, InFlag };
1622  Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1623  CopyParamOps, elemtype,
1624  MachinePointerInfo(), /* Align */ 0,
1625                                       MachineMemOperand::MOStore);
1626 
1627  InFlag = Chain.getValue(1);
1628  }
1629  ++paramCount;
1630  }
1631 
1632   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1633   unsigned retAlignment = 0;
1634 
1635  // Handle Result
1636  if (Ins.size() > 0) {
1637  SmallVector<EVT, 16> resvtparts;
1638  ComputeValueVTs(*this, DL, RetTy, resvtparts);
1639 
1640  // Declare
1641  // .param .align 16 .b8 retval0[<size-in-bytes>], or
1642  // .param .b<size-in-bits> retval0
1643  unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1644  // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1645  // these three types to match the logic in
1646  // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1647  // Plus, this behavior is consistent with nvcc's.
1648  if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1649  (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1650       // Scalar needs to be at least 32 bits wide
1651  if (resultsz < 32)
1652  resultsz = 32;
1653  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1654  SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1655  DAG.getConstant(resultsz, dl, MVT::i32),
1656  DAG.getConstant(0, dl, MVT::i32), InFlag };
1657  Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1658  DeclareRetOps);
1659  InFlag = Chain.getValue(1);
1660  } else {
1661  retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1662  SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1663  SDValue DeclareRetOps[] = { Chain,
1664  DAG.getConstant(retAlignment, dl, MVT::i32),
1665  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1666  DAG.getConstant(0, dl, MVT::i32), InFlag };
1667  Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1668  DeclareRetOps);
1669  InFlag = Chain.getValue(1);
1670  }
1671  }
1672 
1673  if (!Func) {
1674  // This is the indirect function call case: PTX requires a prototype of the
1675  // form
1676  // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1677  // to be emitted, and the label has to be used as the last arg of the call
1678  // instruction.
1679  // The prototype is embedded in a string and attached as the operand of a
1680  // CallPrototype SDNode, which prints as the contents of the string.
1681  SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1682  std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
1683  const char *ProtoStr =
1684  nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1685  SDValue ProtoOps[] = {
1686  Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1687  };
1688  Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1689  InFlag = Chain.getValue(1);
1690  }
1691  // Op to just print "call"
1692  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1693  SDValue PrintCallOps[] = {
1694  Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1695  };
1696  // We model convergent calls as separate opcodes.
1697  unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
1698  if (CLI.IsConvergent)
1699  Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1700  : NVPTXISD::PrintConvergentCall;
1701  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1702  InFlag = Chain.getValue(1);
1703 
1704  // Ops to print out the function name
1705  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1706  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1707  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1708  InFlag = Chain.getValue(1);
1709 
1710  // Ops to print out the param list
1711  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1712  SDValue CallArgBeginOps[] = { Chain, InFlag };
1713  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1714  CallArgBeginOps);
1715  InFlag = Chain.getValue(1);
1716 
1717  for (unsigned i = 0, e = paramCount; i != e; ++i) {
1718  unsigned opcode;
1719  if (i == (e - 1))
1720  opcode = NVPTXISD::LastCallArg;
1721  else
1722  opcode = NVPTXISD::CallArg;
1723  SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1724  SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1725  DAG.getConstant(i, dl, MVT::i32), InFlag };
1726  Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1727  InFlag = Chain.getValue(1);
1728  }
1729  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1730  SDValue CallArgEndOps[] = { Chain,
1731  DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
1732  InFlag };
1733  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1734  InFlag = Chain.getValue(1);
1735 
1736  if (!Func) {
1737  SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1738  SDValue PrototypeOps[] = { Chain,
1739  DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1740  InFlag };
1741  Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1742  InFlag = Chain.getValue(1);
1743  }
1744 
1745  // Generate loads from param memory/moves from registers for result
1746  if (Ins.size() > 0) {
1747  SmallVector<EVT, 16> VTs;
1748  SmallVector<uint64_t, 16> Offsets;
1749  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1750  assert(VTs.size() == Ins.size() && "Bad value decomposition");
1751 
1752  unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1753  auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1754 
1755  SmallVector<EVT, 6> LoadVTs;
1756  int VecIdx = -1; // Index of the first element of the vector.
1757 
1758  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1759  // 32-bits are sign extended or zero extended, depending on whether
1760  // they are signed or unsigned types.
1761  bool ExtendIntegerRetVal =
1762  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1763 
1764  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1765  bool needTruncate = false;
1766  EVT TheLoadType = VTs[i];
1767  EVT EltType = Ins[i].VT;
1768  unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1769  if (ExtendIntegerRetVal) {
1770  TheLoadType = MVT::i32;
1771  EltType = MVT::i32;
1772  needTruncate = true;
1773  } else if (TheLoadType.getSizeInBits() < 16) {
1774  if (VTs[i].isInteger())
1775  needTruncate = true;
1776  EltType = MVT::i16;
1777  }
1778 
1779  // Record index of the very first element of the vector.
1780  if (VectorInfo[i] & PVF_FIRST) {
1781  assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1782  VecIdx = i;
1783  }
1784 
1785  LoadVTs.push_back(EltType);
1786 
1787  if (VectorInfo[i] & PVF_LAST) {
1788  unsigned NumElts = LoadVTs.size();
1789  LoadVTs.push_back(MVT::Other);
1790  LoadVTs.push_back(MVT::Glue);
1791  NVPTXISD::NodeType Op;
1792  switch (NumElts) {
1793  case 1:
1794  Op = NVPTXISD::LoadParam;
1795  break;
1796  case 2:
1797  Op = NVPTXISD::LoadParamV2;
1798  break;
1799  case 4:
1800  Op = NVPTXISD::LoadParamV4;
1801  break;
1802  default:
1803  llvm_unreachable("Invalid vector info.");
1804  }
1805 
1806  SDValue LoadOperands[] = {
1807  Chain, DAG.getConstant(1, dl, MVT::i32),
1808  DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1809  SDValue RetVal = DAG.getMemIntrinsicNode(
1810  Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1811  MachinePointerInfo(), EltAlign,
1812  MachineMemOperand::MOLoad);
1813 
1814  for (unsigned j = 0; j < NumElts; ++j) {
1815  SDValue Ret = RetVal.getValue(j);
1816  if (needTruncate)
1817  Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret);
1818  InVals.push_back(Ret);
1819  }
1820  Chain = RetVal.getValue(NumElts);
1821  InFlag = RetVal.getValue(NumElts + 1);
1822 
1823  // Cleanup
1824  VecIdx = -1;
1825  LoadVTs.clear();
1826  }
1827  }
1828  }
1829 
1830  Chain = DAG.getCALLSEQ_END(Chain,
1831  DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1832  DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1833  true),
1834  InFlag, dl);
1835  uniqueCallSite++;
1836 
1837  // set isTailCall to false for now, until we figure out how to express
1838  // tail call optimization in PTX
1839  isTailCall = false;
1840  return Chain;
1841 }
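// A rough, hand-written sketch of the PTX that the DeclareParam/StoreParam/
// PrintCall/LoadParam sequence built above prints as, for a direct call with
// one 32-bit argument and a 32-bit result (names and registers are
// illustrative, not compiler output):
//   .param .b32 param0;
//   st.param.b32 [param0+0], %r1;
//   .param .b32 retval0;
//   call.uni (retval0), callee, (param0);
//   ld.param.b32 %r2, [retval0+0];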
1842 
1843 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1844 // (see LegalizeDAG.cpp). This is slow and uses local memory.
1845 // We use extract/build_vector nodes instead, just as LegalizeOp() did in LLVM 2.5.
1846 SDValue
1847 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1848  SDNode *Node = Op.getNode();
1849  SDLoc dl(Node);
1850  SmallVector<SDValue, 8> Ops;
1851  unsigned NumOperands = Node->getNumOperands();
1852  for (unsigned i = 0; i < NumOperands; ++i) {
1853  SDValue SubOp = Node->getOperand(i);
1854  EVT VVT = SubOp.getNode()->getValueType(0);
1855  EVT EltVT = VVT.getVectorElementType();
1856  unsigned NumSubElem = VVT.getVectorNumElements();
1857  for (unsigned j = 0; j < NumSubElem; ++j) {
1858  Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1859  DAG.getIntPtrConstant(j, dl)));
1860  }
1861  }
1862  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1863 }
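// For example, (v4f16 (concat_vectors (v2f16 A), (v2f16 B))) is rebuilt here
// as a build_vector of the four individually extracted elements
// A[0], A[1], B[0], B[1], avoiding the stack round trip of the default
// expansion.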
1864 
1865 // We can initialize a constant f16x2 with a single .b32 move. Normally it
1866 // would get lowered as two constant loads and a vector-packing move.
1867 // mov.b16 %h1, 0x4000;
1868 // mov.b16 %h2, 0x3C00;
1869 // mov.b32 %hh2, {%h2, %h1};
1870 // Instead we want just a constant move:
1871 // mov.b32 %hh2, 0x40003C00
1872 //
1873 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1874 // generates good SASS in both cases.
1875 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1876  SelectionDAG &DAG) const {
1877  //return Op;
1878  if (!(Op->getValueType(0) == MVT::v2f16 &&
1879  isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1880  isa<ConstantFPSDNode>(Op->getOperand(1))))
1881  return Op;
1882 
1883  APInt E0 =
1884  cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1885  APInt E1 =
1886  cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
1887  SDValue Const =
1888  DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1889  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1890 }
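// Worked example for the constants in the comment above: half 1.0 is 0x3C00
// and half 2.0 is 0x4000, so for (v2f16 (build_vector 1.0, 2.0)) the value
// computed here is E1.zext(32).shl(16) | E0.zext(32)
// = 0x40000000 | 0x00003C00 = 0x40003C00, i.e. element 0 lands in the low
// 16 bits of the single mov.b32 immediate.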
1891 
1892 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1893  SelectionDAG &DAG) const {
1894  SDValue Index = Op->getOperand(1);
1895  // Constant index will be matched by tablegen.
1896  if (isa<ConstantSDNode>(Index.getNode()))
1897  return Op;
1898 
1899  // Extract individual elements and select one of them.
1900  SDValue Vector = Op->getOperand(0);
1901  EVT VectorVT = Vector.getValueType();
1902  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1903  EVT EltVT = VectorVT.getVectorElementType();
1904 
1905  SDLoc dl(Op.getNode());
1906  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1907  DAG.getIntPtrConstant(0, dl));
1908  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1909  DAG.getIntPtrConstant(1, dl));
1910  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
1911  ISD::CondCode::SETEQ);
1912 }
1913 
1914 /// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
1915 /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
1916 /// amount, or
1917 /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
1918 /// amount.
1919 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1920  SelectionDAG &DAG) const {
1921  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1922  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1923 
1924  EVT VT = Op.getValueType();
1925  unsigned VTBits = VT.getSizeInBits();
1926  SDLoc dl(Op);
1927  SDValue ShOpLo = Op.getOperand(0);
1928  SDValue ShOpHi = Op.getOperand(1);
1929  SDValue ShAmt = Op.getOperand(2);
1930  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1931 
1932  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1933  // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf' instruction.
1934  // {dHi, dLo} = {aHi, aLo} >> Amt
1935  // dHi = aHi >> Amt
1936  // dLo = shf.r.clamp aLo, aHi, Amt
1937 
1938  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1939  SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1940  ShAmt);
1941 
1942  SDValue Ops[2] = { Lo, Hi };
1943  return DAG.getMergeValues(Ops, dl);
1944  }
1945  else {
1946  // {dHi, dLo} = {aHi, aLo} >> Amt
1947  // - if (Amt>=size) then
1948  // dLo = aHi >> (Amt-size)
1949  // dHi = aHi >> Amt (this is either all 0 or all 1)
1950  // else
1951  // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1952  // dHi = aHi >> Amt
1953 
1954  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1955  DAG.getConstant(VTBits, dl, MVT::i32),
1956  ShAmt);
1957  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
1958  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1959  DAG.getConstant(VTBits, dl, MVT::i32));
1960  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
1961  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1962  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
1963 
1964  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
1965  DAG.getConstant(VTBits, dl, MVT::i32),
1966  ISD::SETGE);
1967  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1968  SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
1969 
1970  SDValue Ops[2] = { Lo, Hi };
1971  return DAG.getMergeValues(Ops, dl);
1972  }
1973 }
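// Worked example for the generic (non-shf) path above, with a 64-bit SRL
// split into 32-bit halves (VTBits = 32):
//   Amt = 40: Amt >= 32, so dLo = aHi >> (40 - 32) = aHi >> 8 and dHi is all
//             zeros (for SRA it would be all sign bits).
//   Amt = 8:  Amt < 32, so dLo = (aLo >> 8) | (aHi << 24) and dHi = aHi >> 8.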
1974 
1975 /// LowerShiftLeftParts - Lower SHL_PARTS, which
1976 /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
1977 /// amount, or
1978 /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
1979 /// amount.
1980 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
1981  SelectionDAG &DAG) const {
1982  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1983  assert(Op.getOpcode() == ISD::SHL_PARTS);
1984 
1985  EVT VT = Op.getValueType();
1986  unsigned VTBits = VT.getSizeInBits();
1987  SDLoc dl(Op);
1988  SDValue ShOpLo = Op.getOperand(0);
1989  SDValue ShOpHi = Op.getOperand(1);
1990  SDValue ShAmt = Op.getOperand(2);
1991 
1992  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1993  // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf' instruction.
1994  // {dHi, dLo} = {aHi, aLo} << Amt
1995  // dHi = shf.l.clamp aLo, aHi, Amt
1996  // dLo = aLo << Amt
1997 
1998  SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
1999  ShAmt);
2000  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2001 
2002  SDValue Ops[2] = { Lo, Hi };
2003  return DAG.getMergeValues(Ops, dl);
2004  }
2005  else {
2006  // {dHi, dLo} = {aHi, aLo} << Amt
2007  // - if (Amt>=size) then
2008  // dLo = aLo << Amt (all 0)
2009  // dHi = aLo << (Amt-size)
2010  // else
2011  // dLo = aLo << Amt
2012  // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2013 
2014  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2015  DAG.getConstant(VTBits, dl, MVT::i32),
2016  ShAmt);
2017  SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2018  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2019  DAG.getConstant(VTBits, dl, MVT::i32));
2020  SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2021  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2022  SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2023 
2024  SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2025  DAG.getConstant(VTBits, dl, MVT::i32),
2026  ISD::SETGE);
2027  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2028  SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2029 
2030  SDValue Ops[2] = { Lo, Hi };
2031  return DAG.getMergeValues(Ops, dl);
2032  }
2033 }
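// The mirror-image example for the generic SHL path: with VTBits = 32 and
// Amt = 40, dHi = aLo << 8 and dLo is all zeros; with Amt = 8,
// dHi = (aHi << 8) | (aLo >> 24) and dLo = aLo << 8.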
2034 
2035 SDValue
2036 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2037  switch (Op.getOpcode()) {
2038  case ISD::RETURNADDR:
2039  return SDValue();
2040  case ISD::FRAMEADDR:
2041  return SDValue();
2042  case ISD::GlobalAddress:
2043  return LowerGlobalAddress(Op, DAG);
2044  case ISD::INTRINSIC_W_CHAIN:
2045  return Op;
2046  case ISD::BUILD_VECTOR:
2047  return LowerBUILD_VECTOR(Op, DAG);
2048  case ISD::EXTRACT_SUBVECTOR:
2049  return Op;
2050  case ISD::EXTRACT_VECTOR_ELT:
2051  return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2052  case ISD::CONCAT_VECTORS:
2053  return LowerCONCAT_VECTORS(Op, DAG);
2054  case ISD::STORE:
2055  return LowerSTORE(Op, DAG);
2056  case ISD::LOAD:
2057  return LowerLOAD(Op, DAG);
2058  case ISD::SHL_PARTS:
2059  return LowerShiftLeftParts(Op, DAG);
2060  case ISD::SRA_PARTS:
2061  case ISD::SRL_PARTS:
2062  return LowerShiftRightParts(Op, DAG);
2063  case ISD::SELECT:
2064  return LowerSelect(Op, DAG);
2065  default:
2066  llvm_unreachable("Custom lowering not defined for operation");
2067  }
2068 }
2069 
2070 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2071  SDValue Op0 = Op->getOperand(0);
2072  SDValue Op1 = Op->getOperand(1);
2073  SDValue Op2 = Op->getOperand(2);
2074  SDLoc DL(Op.getNode());
2075 
2076  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2077 
2078  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2079  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2080  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2081  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2082 
2083  return Trunc;
2084 }
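// For example, (i1 (select %c, %a, %b)) is lowered here to
//   %a32 = any_extend %a to i32
//   %b32 = any_extend %b to i32
//   %s   = select %c, %a32, %b32
//   trunc %s to i1
// so that the select itself happens in a 32-bit register class.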
2085 
2086 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2087  if (Op.getValueType() == MVT::i1)
2088  return LowerLOADi1(Op, DAG);
2089 
2090  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2091  // loads and have to handle it here.
2092  if (Op.getValueType() == MVT::v2f16) {
2093  LoadSDNode *Load = cast<LoadSDNode>(Op);
2094  EVT MemVT = Load->getMemoryVT();
2095  if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
2096  Load->getAddressSpace(), Load->getAlignment())) {
2097  SDValue Ops[2];
2098  std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2099  return DAG.getMergeValues(Ops, SDLoc(Op));
2100  }
2101  }
2102 
2103  return SDValue();
2104 }
2105 
2106 // v = ld i1* addr
2107 // =>
2108 // v1 = ld i8* addr (-> i16)
2109 // v = trunc i16 to i1
2110 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2111  SDNode *Node = Op.getNode();
2112  LoadSDNode *LD = cast<LoadSDNode>(Node);
2113  SDLoc dl(Node);
2114  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2115  assert(Node->getValueType(0) == MVT::i1 &&
2116  "Custom lowering for i1 load only");
2117  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2118  LD->getPointerInfo(), LD->getAlignment(),
2119  LD->getMemOperand()->getFlags());
2120  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2121  // The legalizer (the caller) is expecting two values from the legalized
2122  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2123  // in LegalizeDAG.cpp which also uses MergeValues.
2124  SDValue Ops[] = { result, LD->getChain() };
2125  return DAG.getMergeValues(Ops, dl);
2126 }
2127 
2128 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2129  StoreSDNode *Store = cast<StoreSDNode>(Op);
2130  EVT VT = Store->getMemoryVT();
2131 
2132  if (VT == MVT::i1)
2133  return LowerSTOREi1(Op, DAG);
2134 
2135  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2136  // stores and have to handle it here.
2137  if (VT == MVT::v2f16 &&
2138  !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
2139  Store->getAddressSpace(), Store->getAlignment()))
2140  return expandUnalignedStore(Store, DAG);
2141 
2142  if (VT.isVector())
2143  return LowerSTOREVector(Op, DAG);
2144 
2145  return SDValue();
2146 }
2147 
2148 SDValue
2149 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2150  SDNode *N = Op.getNode();
2151  SDValue Val = N->getOperand(1);
2152  SDLoc DL(N);
2153  EVT ValVT = Val.getValueType();
2154 
2155  if (ValVT.isVector()) {
2156  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2157  // legal. We can (and should) split that into 2 stores of <2 x double> here
2158  // but I'm leaving that as a TODO for now.
2159  if (!ValVT.isSimple())
2160  return SDValue();
2161  switch (ValVT.getSimpleVT().SimpleTy) {
2162  default:
2163  return SDValue();
2164  case MVT::v2i8:
2165  case MVT::v2i16:
2166  case MVT::v2i32:
2167  case MVT::v2i64:
2168  case MVT::v2f16:
2169  case MVT::v2f32:
2170  case MVT::v2f64:
2171  case MVT::v4i8:
2172  case MVT::v4i16:
2173  case MVT::v4i32:
2174  case MVT::v4f16:
2175  case MVT::v4f32:
2176  case MVT::v8f16: // <4 x f16x2>
2177  // This is a "native" vector type
2178  break;
2179  }
2180 
2181  MemSDNode *MemSD = cast<MemSDNode>(N);
2182  const DataLayout &TD = DAG.getDataLayout();
2183 
2184  unsigned Align = MemSD->getAlignment();
2185  unsigned PrefAlign =
2186  TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
2187  if (Align < PrefAlign) {
2188  // This store is not sufficiently aligned, so bail out and let this vector
2189  // store be scalarized. Note that we may still be able to emit smaller
2190  // vector stores. For example, if we are storing a <4 x float> with an
2191  // alignment of 8, this check will fail but the legalizer will try again
2192  // with 2 x <2 x float>, which will succeed with an alignment of 8.
2193  return SDValue();
2194  }
2195 
2196  unsigned Opcode = 0;
2197  EVT EltVT = ValVT.getVectorElementType();
2198  unsigned NumElts = ValVT.getVectorNumElements();
2199 
2200  // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2201  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
2202  // stored type to i16 and propagate the "real" type as the memory type.
2203  bool NeedExt = false;
2204  if (EltVT.getSizeInBits() < 16)
2205  NeedExt = true;
2206 
2207  bool StoreF16x2 = false;
2208  switch (NumElts) {
2209  default:
2210  return SDValue();
2211  case 2:
2212  Opcode = NVPTXISD::StoreV2;
2213  break;
2214  case 4:
2215  Opcode = NVPTXISD::StoreV4;
2216  break;
2217  case 8:
2218  // v8f16 is a special case. PTX doesn't have st.v8.f16
2219  // instruction. Instead, we split the vector into v2f16 chunks and
2220  // store them with st.v4.b32.
2221  assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2222  Opcode = NVPTXISD::StoreV4;
2223  StoreF16x2 = true;
2224  break;
2225  }
2226 
2227  SmallVector<SDValue, 8> Ops;
2228 
2229  // First is the chain
2230  Ops.push_back(N->getOperand(0));
2231 
2232  if (StoreF16x2) {
2233  // Combine f16,f16 -> v2f16
2234  NumElts /= 2;
2235  for (unsigned i = 0; i < NumElts; ++i) {
2236  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2237  DAG.getIntPtrConstant(i * 2, DL));
2238  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2239  DAG.getIntPtrConstant(i * 2 + 1, DL));
2240  SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2241  Ops.push_back(V2);
2242  }
2243  } else {
2244  // Then the split values
2245  for (unsigned i = 0; i < NumElts; ++i) {
2246  SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2247  DAG.getIntPtrConstant(i, DL));
2248  if (NeedExt)
2249  ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2250  Ops.push_back(ExtVal);
2251  }
2252  }
2253 
2254  // Then any remaining arguments
2255  Ops.append(N->op_begin() + 2, N->op_end());
2256 
2257  SDValue NewSt =
2258  DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2259  MemSD->getMemoryVT(), MemSD->getMemOperand());
2260 
2261  // return DCI.CombineTo(N, NewSt, true);
2262  return NewSt;
2263  }
2264 
2265  return SDValue();
2266 }
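// For example, storing an <8 x half> value takes the StoreF16x2 path above:
// elements {0,1}, {2,3}, {4,5} and {6,7} are repacked into four v2f16 values
// and emitted as a single NVPTXISD::StoreV4, which is printed as one
// st.v4.b32 instruction.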
2267 
2268 // st i1 v, addr
2269 // =>
2270 // v1 = zxt v to i16
2271 // st.u8 i16, addr
2272 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2273  SDNode *Node = Op.getNode();
2274  SDLoc dl(Node);
2275  StoreSDNode *ST = cast<StoreSDNode>(Node);
2276  SDValue Tmp1 = ST->getChain();
2277  SDValue Tmp2 = ST->getBasePtr();
2278  SDValue Tmp3 = ST->getValue();
2279  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2280  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2281  SDValue Result =
2282  DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2283  ST->getAlignment(), ST->getMemOperand()->getFlags());
2284  return Result;
2285 }
2286 
2287 SDValue
2288 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2289  std::string ParamSym;
2290  raw_string_ostream ParamStr(ParamSym);
2291 
2292  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2293  ParamStr.flush();
2294 
2295  std::string *SavedStr =
2296  nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2297  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2298 }
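// For example, for a function named "foo" and idx == 2 this returns a target
// external symbol for "foo_param_2", matching the .param name that the NVPTX
// asm printer gives to the third parameter.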
2299 
2300 // Check to see if the kernel argument is image*_t or sampler_t
2301 
2302 static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2303  static const char *const specialTypes[] = { "struct._image2d_t",
2304  "struct._image3d_t",
2305  "struct._sampler_t" };
2306 
2307  Type *Ty = arg->getType();
2308  auto *PTy = dyn_cast<PointerType>(Ty);
2309 
2310  if (!PTy)
2311  return false;
2312 
2313  if (!context)
2314  return false;
2315 
2316  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2317  if (!STy || STy->isLiteral())
2318  return false;
2319 
2320  return std::find(std::begin(specialTypes), std::end(specialTypes),
2321  STy->getName()) != std::end(specialTypes);
2322 }
2323 
2324 SDValue NVPTXTargetLowering::LowerFormalArguments(
2325  SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2326  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2327  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2328  MachineFunction &MF = DAG.getMachineFunction();
2329  const DataLayout &DL = DAG.getDataLayout();
2330  auto PtrVT = getPointerTy(DAG.getDataLayout());
2331 
2332  const Function *F = &MF.getFunction();
2333  const AttributeList &PAL = F->getAttributes();
2334  const TargetLowering *TLI = STI.getTargetLowering();
2335 
2336  SDValue Root = DAG.getRoot();
2337  std::vector<SDValue> OutChains;
2338 
2339  bool isABI = (STI.getSmVersion() >= 20);
2340  assert(isABI && "Non-ABI compilation is not supported");
2341  if (!isABI)
2342  return Chain;
2343 
2344  std::vector<Type *> argTypes;
2345  std::vector<const Argument *> theArgs;
2346  for (const Argument &I : F->args()) {
2347  theArgs.push_back(&I);
2348  argTypes.push_back(I.getType());
2349  }
2350  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2351  // Ins.size() will be larger
2352  // * if there is an aggregate argument with multiple fields (each field
2353  // showing up separately in Ins)
2354  // * if there is a vector argument with more than typical vector-length
2355  // elements (generally if more than 4) where each vector element is
2356  // individually present in Ins.
2357  // So a different index should be used for indexing into Ins.
2358  // See similar issue in LowerCall.
2359  unsigned InsIdx = 0;
2360 
2361  int idx = 0;
2362  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2363  Type *Ty = argTypes[i];
2364 
2365  // If the kernel argument is image*_t or sampler_t, convert it to
2366  // an i32 constant holding the parameter position. This can later be
2367  // matched in the AsmPrinter to output the correct mangled name.
2368  if (isImageOrSamplerVal(
2369  theArgs[i],
2370  (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2371  : nullptr))) {
2372  assert(isKernelFunction(*F) &&
2373  "Only kernels can have image/sampler params");
2374  InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2375  continue;
2376  }
2377 
2378  if (theArgs[i]->use_empty()) {
2379  // argument is dead
2380  if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2381  SmallVector<EVT, 16> vtparts;
2382 
2383  ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2384  assert(vtparts.size() > 0 && "empty aggregate type not expected");
2385  for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2386  ++parti) {
2387  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2388  ++InsIdx;
2389  }
2390  if (vtparts.size() > 0)
2391  --InsIdx;
2392  continue;
2393  }
2394  if (Ty->isVectorTy()) {
2395  EVT ObjectVT = getValueType(DL, Ty);
2396  unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2397  for (unsigned parti = 0; parti < NumRegs; ++parti) {
2398  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2399  ++InsIdx;
2400  }
2401  if (NumRegs > 0)
2402  --InsIdx;
2403  continue;
2404  }
2405  InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2406  continue;
2407  }
2408 
2409  // In the following cases, assign a node order of "idx+1"
2410  // to newly created nodes. The SDNodes for params have to
2411  // appear in the same order as their order of appearance
2412  // in the original function. "idx+1" holds that order.
2413  if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2414  bool aggregateIsPacked = false;
2415  if (StructType *STy = dyn_cast<StructType>(Ty))
2416  aggregateIsPacked = STy->isPacked();
2417 
2418  SmallVector<EVT, 16> VTs;
2419  SmallVector<uint64_t, 16> Offsets;
2420  ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2421  assert(VTs.size() > 0 && "Unexpected empty type.");
2422  auto VectorInfo =
2423  VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
2424 
2425  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2426  int VecIdx = -1; // Index of the first element of the current vector.
2427  for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2428  if (VectorInfo[parti] & PVF_FIRST) {
2429  assert(VecIdx == -1 && "Orphaned vector.");
2430  VecIdx = parti;
2431  }
2432 
2433  // That's the last element of this load op.
2434  if (VectorInfo[parti] & PVF_LAST) {
2435  unsigned NumElts = parti - VecIdx + 1;
2436  EVT EltVT = VTs[parti];
2437  // i1 is loaded/stored as i8.
2438  EVT LoadVT = EltVT;
2439  if (EltVT == MVT::i1)
2440  LoadVT = MVT::i8;
2441  else if (EltVT == MVT::v2f16)
2442  // getLoad needs a vector type, but it can't handle
2443  // vectors which contain v2f16 elements. So we must load
2444  // using i32 here and then bitcast back.
2445  LoadVT = MVT::i32;
2446 
2447  EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2448  SDValue VecAddr =
2449  DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2450  DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2451  Value *srcValue = Constant::getNullValue(PointerType::get(
2452  EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2453  SDValue P =
2454  DAG.getLoad(VecVT, dl, Root, VecAddr,
2455  MachinePointerInfo(srcValue), aggregateIsPacked,
2456  MachineMemOperand::MODereferenceable |
2457  MachineMemOperand::MOInvariant);
2458  if (P.getNode())
2459  P.getNode()->setIROrder(idx + 1);
2460  for (unsigned j = 0; j < NumElts; ++j) {
2461  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2462  DAG.getIntPtrConstant(j, dl));
2463  // We've loaded i1 as an i8 and now must truncate it back to i1
2464  if (EltVT == MVT::i1)
2465  Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2466  // v2f16 was loaded as an i32. Now we must bitcast it back.
2467  else if (EltVT == MVT::v2f16)
2468  Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2469  // Extend the element if necessary (e.g. an i8 is loaded
2470  // into an i16 register)
2471  if (Ins[InsIdx].VT.isInteger() &&
2472  Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
2473  unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2474  : ISD::ZERO_EXTEND;
2475  Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2476  }
2477  InVals.push_back(Elt);
2478  }
2479 
2480  // Reset vector tracking state.
2481  VecIdx = -1;
2482  }
2483  ++InsIdx;
2484  }
2485  if (VTs.size() > 0)
2486  --InsIdx;
2487  continue;
2488  }
2489 
2490  // Param has ByVal attribute
2491  // Return MoveParam(param symbol).
2492  // Ideally, the param symbol could be returned directly,
2493  // but when the SDNode builder decides to use it in a CopyToReg(),
2494  // the machine instruction fails because the TargetExternalSymbol
2495  // (not lowered) is target dependent, and CopyToReg assumes
2496  // the source is lowered.
2497  EVT ObjectVT = getValueType(DL, Ty);
2498  assert(ObjectVT == Ins[InsIdx].VT &&
2499  "Ins type did not match function type");
2500  SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2501  SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2502  if (p.getNode())
2503  p.getNode()->setIROrder(idx + 1);
2504  InVals.push_back(p);
2505  }
2506 
2507  // Clang will check for explicit varargs and issue an error if any are
2508  // present. However, it lets code with an implicit vararg declaration,
2509  // such as f(), pass. See bug 617733.
2510  // We treat this case as if the arg list is empty.
2511  // if (F.isVarArg()) {
2512  // assert(0 && "VarArg not supported yet!");
2513  //}
2514 
2515  if (!OutChains.empty())
2516  DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2517 
2518  return Chain;
2519 }
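// For reference, a scalar parameter handled by the non-ByVal path above is
// loaded from its param symbol, so the first i32 parameter of a kernel "foo"
// shows up in PTX roughly as (register name illustrative):
//   ld.param.u32 %r1, [foo_param_0];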
2520 
2521 SDValue
2522 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2523  bool isVarArg,
2524  const SmallVectorImpl<ISD::OutputArg> &Outs,
2525  const SmallVectorImpl<SDValue> &OutVals,
2526  const SDLoc &dl, SelectionDAG &DAG) const {
2527  MachineFunction &MF = DAG.getMachineFunction();
2528  Type *RetTy = MF.getFunction().getReturnType();
2529 
2530  bool isABI = (STI.getSmVersion() >= 20);
2531  assert(isABI && "Non-ABI compilation is not supported");
2532  if (!isABI)
2533  return Chain;
2534 
2535  const DataLayout DL = DAG.getDataLayout();
2536  SmallVector<EVT, 16> VTs;
2537  SmallVector<uint64_t, 16> Offsets;
2538  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2539  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2540 
2541  auto VectorInfo = VectorizePTXValueVTs(
2542  VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
2543 
2544  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2545  // 32-bits are sign extended or zero extended, depending on whether
2546  // they are signed or unsigned types.
2547  bool ExtendIntegerRetVal =
2548  RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2549 
2550  SmallVector<SDValue, 6> StoreOperands;
2551  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2552  // New load/store. Record chain and offset operands.
2553  if (VectorInfo[i] & PVF_FIRST) {
2554  assert(StoreOperands.empty() && "Orphaned operand list.");
2555  StoreOperands.push_back(Chain);
2556  StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2557  }
2558 
2559  SDValue RetVal = OutVals[i];
2560  if (ExtendIntegerRetVal) {
2561  RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2562  : ISD::ZERO_EXTEND,
2563  dl, MVT::i32, RetVal);
2564  } else if (RetVal.getValueSizeInBits() < 16) {
2565  // Use 16-bit registers for small load-stores as it's the
2566  // smallest general purpose register size supported by NVPTX.
2567  RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2568  }
2569 
2570  // Record the value to return.
2571  StoreOperands.push_back(RetVal);
2572 
2573  // That's the last element of this store op.
2574  if (VectorInfo[i] & PVF_LAST) {
2575  NVPTXISD::NodeType Op;
2576  unsigned NumElts = StoreOperands.size() - 2;
2577  switch (NumElts) {
2578  case 1:
2579  Op = NVPTXISD::StoreRetval;
2580  break;
2581  case 2:
2582  Op = NVPTXISD::StoreRetvalV2;
2583  break;
2584  case 4:
2585  Op = NVPTXISD::StoreRetvalV4;
2586  break;
2587  default:
2588  llvm_unreachable("Invalid vector info.");
2589  }
2590 
2591  // Adjust type of load/store op if we've extended the scalar
2592  // return value.
2593  EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2594  Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
2595  StoreOperands, TheStoreType,
2596  MachinePointerInfo(), /* Align */ 1,
2597  MachineMemOperand::MOStore);
2598  // Cleanup vector state.
2599  StoreOperands.clear();
2600  }
2601  }
2602 
2603  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2604 }
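// For example, returning a single i64 emits one NVPTXISD::StoreRetval node
// with offset operand 0, which is printed roughly as
//   st.param.b64 [func_retval0+0], %rd1;
// (register name illustrative).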
2605 
2606 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2607  SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2608  SelectionDAG &DAG) const {
2609  if (Constraint.length() > 1)
2610  return;
2611  else
2612  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2613 }
2614 
2615 static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2616  switch (Intrinsic) {
2617  default:
2618  return 0;
2619 
2620  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2621  return NVPTXISD::Tex1DFloatS32;
2622  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2624  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2626  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2628  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2629  return NVPTXISD::Tex1DS32S32;
2630  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2631  return NVPTXISD::Tex1DS32Float;
2632  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2634  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2636  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2637  return NVPTXISD::Tex1DU32S32;
2638  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2639  return NVPTXISD::Tex1DU32Float;
2640  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2642  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2644 
2645  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2647  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2649  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2651  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2653  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2655  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2657  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2659  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2661  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2663  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2665  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2667  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2669 
2670  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2671  return NVPTXISD::Tex2DFloatS32;
2672  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2674  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2676  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2678  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2679  return NVPTXISD::Tex2DS32S32;
2680  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2681  return NVPTXISD::Tex2DS32Float;
2682  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2684  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2686  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2687  return NVPTXISD::Tex2DU32S32;
2688  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2689  return NVPTXISD::Tex2DU32Float;
2690  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2692  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2694 
2695  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2697  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2699  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2701  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2703  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2705  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2707  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2709  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2711  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2713  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2715  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2717  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2719 
2720  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2721  return NVPTXISD::Tex3DFloatS32;
2722  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2724  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2726  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2728  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2729  return NVPTXISD::Tex3DS32S32;
2730  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2731  return NVPTXISD::Tex3DS32Float;
2732  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2734  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2736  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2737  return NVPTXISD::Tex3DU32S32;
2738  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2739  return NVPTXISD::Tex3DU32Float;
2740  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2742  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2744 
2745  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2747  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2749  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2751  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2753  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2755  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2757 
2758  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2760  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2762  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2764  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2766  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2768  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2770 
2771  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2773  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2775  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2777  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2779  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2781  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2783  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2785  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2787  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2789  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2791  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2793  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2795 
2796  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2798  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2800  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2802  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2804  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2806  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2808  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2810  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2812  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2814  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2816  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2818  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2820 
2821  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2823  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2825  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2827  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2829  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2831  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2833  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2835  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2837  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2839  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2841  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2843  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2845 
2846  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2848  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2850  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2852  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2854  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2856  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2858  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2860  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2862  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
2864  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
2866  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
2868  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
2870 
2871  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
2873  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
2875  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
2877  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
2879  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
2881  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
2883  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
2885  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
2887  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
2889  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
2891  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
2893  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
2895 
2896  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
2898  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
2900  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
2902  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
2904  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
2906  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
2908  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
2910  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
2912  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
2914  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
2916  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
2918  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
2920 
2921  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
2923  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
2925  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
2927  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
2929  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
2931  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
2933 
2934  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
2936  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
2938  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
2940  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
2942  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
2944  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
2946 
2947  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
2949  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
2951  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
2953  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
2955  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
2957  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
2959  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
2961  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
2963  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
2965  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
2967  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
2969  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
2971  }
2972 }
2973 
2974 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
2975  switch (Intrinsic) {
2976  default:
2977  return 0;
2978  case Intrinsic::nvvm_suld_1d_i8_clamp:
2979  return NVPTXISD::Suld1DI8Clamp;
2980  case Intrinsic::nvvm_suld_1d_i16_clamp:
2981  return NVPTXISD::Suld1DI16Clamp;
2982  case Intrinsic::nvvm_suld_1d_i32_clamp:
2983  return NVPTXISD::Suld1DI32Clamp;
2984  case Intrinsic::nvvm_suld_1d_i64_clamp:
2985  return NVPTXISD::Suld1DI64Clamp;
2986  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
2988  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
2990  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
2992  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
2994  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
2996  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
2998  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3000  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3002  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3004  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3006  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3008  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3010  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3012  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3014  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3016  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3018  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3020  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3022  case Intrinsic::nvvm_suld_2d_i8_clamp:
3023  return NVPTXISD::Suld2DI8Clamp;
3024  case Intrinsic::nvvm_suld_2d_i16_clamp:
3025  return NVPTXISD::Suld2DI16Clamp;
3026  case Intrinsic::nvvm_suld_2d_i32_clamp:
3027  return NVPTXISD::Suld2DI32Clamp;
3028  case Intrinsic::nvvm_suld_2d_i64_clamp:
3029  return NVPTXISD::Suld2DI64Clamp;
3030  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3032  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3034  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3036  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3038  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3040  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3042  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3044  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3046  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3048  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3050  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3052  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3054  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3056  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3058  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3060  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3062  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3064  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3066  case Intrinsic::nvvm_suld_3d_i8_clamp:
3067  return NVPTXISD::Suld3DI8Clamp;
3068  case Intrinsic::nvvm_suld_3d_i16_clamp:
3069  return NVPTXISD::Suld3DI16Clamp;
3070  case Intrinsic::nvvm_suld_3d_i32_clamp:
3071  return NVPTXISD::Suld3DI32Clamp;
3072  case Intrinsic::nvvm_suld_3d_i64_clamp:
3073  return NVPTXISD::Suld3DI64Clamp;
3074  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3076  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3078  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3080  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3082  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3084  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3086  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3088  case Intrinsic::nvvm_suld_1d_i8_trap:
3089  return NVPTXISD::Suld1DI8Trap;
3090  case Intrinsic::nvvm_suld_1d_i16_trap:
3091  return NVPTXISD::Suld1DI16Trap;
3092  case Intrinsic::nvvm_suld_1d_i32_trap:
3093  return NVPTXISD::Suld1DI32Trap;
3094  case Intrinsic::nvvm_suld_1d_i64_trap:
3095  return NVPTXISD::Suld1DI64Trap;
3096  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3097  return NVPTXISD::Suld1DV2I8Trap;
3098  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3100  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3102  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3104  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3105  return NVPTXISD::Suld1DV4I8Trap;
3106  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3108  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3110  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3112  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3114  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3116  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3118  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3120  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3122  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3124  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3126  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3128  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3130  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3132  case Intrinsic::nvvm_suld_2d_i8_trap:
3133  return NVPTXISD::Suld2DI8Trap;
3134  case Intrinsic::nvvm_suld_2d_i16_trap:
3135  return NVPTXISD::Suld2DI16Trap;
3136  case Intrinsic::nvvm_suld_2d_i32_trap:
3137  return NVPTXISD::Suld2DI32Trap;
3138  case Intrinsic::nvvm_suld_2d_i64_trap:
3139  return NVPTXISD::Suld2DI64Trap;
3140  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3141  return NVPTXISD::Suld2DV2I8Trap;
3142  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3144  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3146  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3148  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3149  return NVPTXISD::Suld2DV4I8Trap;
3150  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3152  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3154  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3156  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3158  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3160  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3162  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3164  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3166  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3168  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3170  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3172  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3174  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3176  case Intrinsic::nvvm_suld_3d_i8_trap:
3177  return NVPTXISD::Suld3DI8Trap;
3178  case Intrinsic::nvvm_suld_3d_i16_trap:
3179  return NVPTXISD::Suld3DI16Trap;
3180  case Intrinsic::nvvm_suld_3d_i32_trap:
3181  return NVPTXISD::Suld3DI32Trap;
3182  case Intrinsic::nvvm_suld_3d_i64_trap:
3183  return NVPTXISD::Suld3DI64Trap;
3184  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3185  return NVPTXISD::Suld3DV2I8Trap;
3186  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3188  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3190  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3192  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3193  return NVPTXISD::Suld3DV4I8Trap;
3194  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3196  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3198  case Intrinsic::nvvm_suld_1d_i8_zero:
3199  return NVPTXISD::Suld1DI8Zero;
3200  case Intrinsic::nvvm_suld_1d_i16_zero:
3201  return NVPTXISD::Suld1DI16Zero;
3202  case Intrinsic::nvvm_suld_1d_i32_zero:
3203  return NVPTXISD::Suld1DI32Zero;
3204  case Intrinsic::nvvm_suld_1d_i64_zero:
3205  return NVPTXISD::Suld1DI64Zero;
3206  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3207  return NVPTXISD::Suld1DV2I8Zero;
3208  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3210  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3212  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3214  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3215  return NVPTXISD::Suld1DV4I8Zero;
3216  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3218  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3220  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3222  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3224  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3226  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3228  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3230  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3232  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3234  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3236  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3238  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3240  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3242  case Intrinsic::nvvm_suld_2d_i8_zero:
3243  return NVPTXISD::Suld2DI8Zero;
3244  case Intrinsic::nvvm_suld_2d_i16_zero:
3245  return NVPTXISD::Suld2DI16Zero;
3246  case Intrinsic::nvvm_suld_2d_i32_zero:
3247  return NVPTXISD::Suld2DI32Zero;
3248  case Intrinsic::nvvm_suld_2d_i64_zero:
3249  return NVPTXISD::Suld2DI64Zero;
3250  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3251  return NVPTXISD::Suld2DV2I8Zero;
3252  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3254  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3256  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3258  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3259  return NVPTXISD::Suld2DV4I8Zero;
3260  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3262  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3264  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3266  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3268  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3270  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3272  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3274  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3276  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3278  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3280  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3282  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3284  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3286  case Intrinsic::nvvm_suld_3d_i8_zero:
3287  return NVPTXISD::Suld3DI8Zero;
3288  case Intrinsic::nvvm_suld_3d_i16_zero:
3289  return NVPTXISD::Suld3DI16Zero;
3290  case Intrinsic::nvvm_suld_3d_i32_zero:
3291  return NVPTXISD::Suld3DI32Zero;
3292  case Intrinsic::nvvm_suld_3d_i64_zero:
3293  return NVPTXISD::Suld3DI64Zero;
3294  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3295  return NVPTXISD::Suld3DV2I8Zero;
3296  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3298  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3300  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3302  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3303  return NVPTXISD::Suld3DV4I8Zero;
3304  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3306  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3308  }
3309 }
3310 
3311 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3312 // TgtMemIntrinsic because we need the information that is only available
3313 // in the "Value" type of the destination pointer. In particular, the
3314 // address space information.
3315 
3316 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3317  IntrinsicInfo &Info, const CallInst &I,
3318  MachineFunction &MF, unsigned Intrinsic) const {
3319  switch (Intrinsic) {
3320  default:
3321  return false;
3322  case Intrinsic::nvvm_match_all_sync_i32p:
3323  case Intrinsic::nvvm_match_all_sync_i64p:
3324  Info.opc = ISD::INTRINSIC_W_CHAIN;
3325  // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3326  // in order to model data exchange with other threads, but perform no real
3327  // memory accesses.
3328  Info.memVT = MVT::i1;
3329 
3330  // Our result depends on both our and other thread's arguments.
3332  return true;
3333  case Intrinsic::nvvm_wmma_load_a_f16_col:
3334  case Intrinsic::nvvm_wmma_load_a_f16_row:
3335  case Intrinsic::nvvm_wmma_load_a_f16_col_stride:
3336  case Intrinsic::nvvm_wmma_load_a_f16_row_stride:
3337  case Intrinsic::nvvm_wmma_load_a_f16_col_shared:
3338  case Intrinsic::nvvm_wmma_load_a_f16_row_shared:
3339  case Intrinsic::nvvm_wmma_load_a_f16_col_shared_stride:
3340  case Intrinsic::nvvm_wmma_load_a_f16_row_shared_stride:
3341  case Intrinsic::nvvm_wmma_load_a_f16_col_global:
3342  case Intrinsic::nvvm_wmma_load_a_f16_row_global:
3343  case Intrinsic::nvvm_wmma_load_a_f16_col_global_stride:
3344  case Intrinsic::nvvm_wmma_load_a_f16_row_global_stride:
3345  case Intrinsic::nvvm_wmma_load_b_f16_col:
3346  case Intrinsic::nvvm_wmma_load_b_f16_row:
3347  case Intrinsic::nvvm_wmma_load_b_f16_col_stride:
3348  case Intrinsic::nvvm_wmma_load_b_f16_row_stride:
3349  case Intrinsic::nvvm_wmma_load_b_f16_col_shared:
3350  case Intrinsic::nvvm_wmma_load_b_f16_row_shared:
3351  case Intrinsic::nvvm_wmma_load_b_f16_col_shared_stride:
3352  case Intrinsic::nvvm_wmma_load_b_f16_row_shared_stride:
3353  case Intrinsic::nvvm_wmma_load_b_f16_col_global:
3354  case Intrinsic::nvvm_wmma_load_b_f16_row_global:
3355  case Intrinsic::nvvm_wmma_load_b_f16_col_global_stride:
3356  case Intrinsic::nvvm_wmma_load_b_f16_row_global_stride: {
3357  Info.opc = ISD::INTRINSIC_W_CHAIN;
3358  Info.memVT = MVT::v8f16;
3359  Info.ptrVal = I.getArgOperand(0);
3360  Info.offset = 0;
3361  Info.flags = MachineMemOperand::MOLoad;
3362  Info.align = 16;
3363  return true;
3364  }
3365 
3366  case Intrinsic::nvvm_wmma_load_c_f16_col:
3367  case Intrinsic::nvvm_wmma_load_c_f16_row:
3368  case Intrinsic::nvvm_wmma_load_c_f16_col_stride:
3369  case Intrinsic::nvvm_wmma_load_c_f16_row_stride:
3370  case Intrinsic::nvvm_wmma_load_c_f16_col_shared:
3371  case Intrinsic::nvvm_wmma_load_c_f16_row_shared:
3372  case Intrinsic::nvvm_wmma_load_c_f16_col_shared_stride:
3373  case Intrinsic::nvvm_wmma_load_c_f16_row_shared_stride:
3374  case Intrinsic::nvvm_wmma_load_c_f16_col_global:
3375  case Intrinsic::nvvm_wmma_load_c_f16_row_global:
3376  case Intrinsic::nvvm_wmma_load_c_f16_col_global_stride:
3377  case Intrinsic::nvvm_wmma_load_c_f16_row_global_stride: {
3378  Info.opc = ISD::INTRINSIC_W_CHAIN;
3379  Info.memVT = MVT::v4f16;
3380  Info.ptrVal = I.getArgOperand(0);
3381  Info.offset = 0;
3382  Info.flags = MachineMemOperand::MOLoad;
3383  Info.align = 16;
3384  return true;
3385  }
3386 
3387  case Intrinsic::nvvm_wmma_load_c_f32_col:
3388  case Intrinsic::nvvm_wmma_load_c_f32_row:
3389  case Intrinsic::nvvm_wmma_load_c_f32_col_stride:
3390  case Intrinsic::nvvm_wmma_load_c_f32_row_stride:
3391  case Intrinsic::nvvm_wmma_load_c_f32_col_shared:
3392  case Intrinsic::nvvm_wmma_load_c_f32_row_shared:
3393  case Intrinsic::nvvm_wmma_load_c_f32_col_shared_stride:
3394  case Intrinsic::nvvm_wmma_load_c_f32_row_shared_stride:
3395  case Intrinsic::nvvm_wmma_load_c_f32_col_global:
3396  case Intrinsic::nvvm_wmma_load_c_f32_row_global:
3397  case Intrinsic::nvvm_wmma_load_c_f32_col_global_stride:
3398  case Intrinsic::nvvm_wmma_load_c_f32_row_global_stride: {
3399  Info.opc = ISD::INTRINSIC_W_CHAIN;
3400  Info.memVT = MVT::v8f32;
3401  Info.ptrVal = I.getArgOperand(0);
3402  Info.offset = 0;
3403  Info.flags = MachineMemOperand::MOLoad;
3404  Info.align = 16;
3405  return true;
3406  }
3407 
3408  case Intrinsic::nvvm_wmma_store_d_f16_col:
3409  case Intrinsic::nvvm_wmma_store_d_f16_row:
3410  case Intrinsic::nvvm_wmma_store_d_f16_col_stride:
3411  case Intrinsic::nvvm_wmma_store_d_f16_row_stride:
3412  case Intrinsic::nvvm_wmma_store_d_f16_col_shared:
3413  case Intrinsic::nvvm_wmma_store_d_f16_row_shared:
3414  case Intrinsic::nvvm_wmma_store_d_f16_col_shared_stride:
3415  case Intrinsic::nvvm_wmma_store_d_f16_row_shared_stride:
3416  case Intrinsic::nvvm_wmma_store_d_f16_col_global:
3417  case Intrinsic::nvvm_wmma_store_d_f16_row_global:
3418  case Intrinsic::nvvm_wmma_store_d_f16_col_global_stride:
3419  case Intrinsic::nvvm_wmma_store_d_f16_row_global_stride: {
3420  Info.opc = ISD::INTRINSIC_W_CHAIN;
3421  Info.memVT = MVT::v4f16;
3422  Info.ptrVal = I.getArgOperand(0);
3423  Info.offset = 0;
3424  Info.flags = MachineMemOperand::MOStore;
3425  Info.align = 16;
3426  return true;
3427  }
3428 
3429  case Intrinsic::nvvm_wmma_store_d_f32_col:
3430  case Intrinsic::nvvm_wmma_store_d_f32_row:
3431  case Intrinsic::nvvm_wmma_store_d_f32_col_stride:
3432  case Intrinsic::nvvm_wmma_store_d_f32_row_stride:
3433  case Intrinsic::nvvm_wmma_store_d_f32_col_shared:
3434  case Intrinsic::nvvm_wmma_store_d_f32_row_shared:
3435  case Intrinsic::nvvm_wmma_store_d_f32_col_shared_stride:
3436  case Intrinsic::nvvm_wmma_store_d_f32_row_shared_stride:
3437  case Intrinsic::nvvm_wmma_store_d_f32_col_global:
3438  case Intrinsic::nvvm_wmma_store_d_f32_row_global:
3439  case Intrinsic::nvvm_wmma_store_d_f32_col_global_stride:
3440  case Intrinsic::nvvm_wmma_store_d_f32_row_global_stride: {
3441  Info.opc = ISD::INTRINSIC_W_CHAIN;
3442  Info.memVT = MVT::v8f32;
3443  Info.ptrVal = I.getArgOperand(0);
3444  Info.offset = 0;
3445  Info.flags = MachineMemOperand::MOStore;
3446  Info.align = 16;
3447  return true;
3448  }
3449 
3450  case Intrinsic::nvvm_atomic_load_add_f32:
3451  case Intrinsic::nvvm_atomic_load_add_f64:
3452  case Intrinsic::nvvm_atomic_load_inc_32:
3453  case Intrinsic::nvvm_atomic_load_dec_32:
3454 
3455  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3456  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3457  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3458  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3459  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3460  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3461  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3462  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3463  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3464  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3465  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3466  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3467  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3468  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3469  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3470  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3471  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3472  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3473  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3474  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3475  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3476  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
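    // These atomics on the generic address space both read and update memory
    // through their first operand; the access width below is simply the
    // call's result type, and align 0 means "use the type's natural alignment".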
3477  auto &DL = I.getModule()->getDataLayout();
3478  Info.opc = ISD::INTRINSIC_W_CHAIN;
3479  Info.memVT = getValueType(DL, I.getType());
3480  Info.ptrVal = I.getArgOperand(0);
3481  Info.offset = 0;
3482  Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3483  Info.align = 0;
3484  return true;
3485  }
3486 
3487  case Intrinsic::nvvm_ldu_global_i:
3488  case Intrinsic::nvvm_ldu_global_f:
3489  case Intrinsic::nvvm_ldu_global_p: {
3490  auto &DL = I.getModule()->getDataLayout();
3491  Info.opc = ISD::INTRINSIC_W_CHAIN;
3492  if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3493  Info.memVT = getValueType(DL, I.getType());
3494  else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
3495  Info.memVT = getPointerTy(DL);
3496  else
3497  Info.memVT = getValueType(DL, I.getType());
3498  Info.ptrVal = I.getArgOperand(0);
3499  Info.offset = 0;
3500  Info.flags = MachineMemOperand::MOLoad;
3501  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3502 
3503  return true;
3504  }
3505  case Intrinsic::nvvm_ldg_global_i:
3506  case Intrinsic::nvvm_ldg_global_f:
3507  case Intrinsic::nvvm_ldg_global_p: {
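    // ldg loads go through the non-coherent (read-only) cache. The intrinsic
    // carries its alignment as a constant second argument, e.g. roughly
    //   %v = call float @llvm.nvvm.ldg.global.f.f32.p1f32(float addrspace(1)* %p, i32 4)
    // (illustrative signature); that constant becomes Info.align below.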
3508  auto &DL = I.getModule()->getDataLayout();
3509 
3510  Info.opc = ISD::INTRINSIC_W_CHAIN;
3511  if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3512  Info.memVT = getValueType(DL, I.getType());
3513  else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
3514  Info.memVT = getPointerTy(DL);
3515  else
3516  Info.memVT = getValueType(DL, I.getType());
3517  Info.ptrVal = I.getArgOperand(0);
3518  Info.offset = 0;
3519  Info.flags = MachineMemOperand::MOLoad;
3520  Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3521 
3522  return true;
3523  }
3524 
3525  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3526  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3527  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3528  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3529  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3530  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3531  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3532  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3533  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3534  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3535  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3536  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3537  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3538  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3539  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3540  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3541  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3542  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3543  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3544  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3545  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3546  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3547  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3548  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3549  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3550  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3551  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3552  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3553  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3554  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3555  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3556  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3557  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3558  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3559  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3560  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3561  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3562  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3563  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3564  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3565  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3566  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3567  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3568  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3569  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3570  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3571  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3572  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3573  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3574  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3575  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3576  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3577  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3578  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3579  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3580  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3581  Info.opc = getOpcForTextureInstr(Intrinsic);
3582  Info.memVT = MVT::v4f32;
3583  Info.ptrVal = nullptr;
3584  Info.offset = 0;
3585  Info.flags = MachineMemOperand::MOLoad;
3586  Info.align = 16;
3587  return true;
3588 
3589  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3590  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3591  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3592  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3593  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3594  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3595  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3596  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3597  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3598  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3599  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3600  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3601  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3602  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3603  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3604  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3605  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3606  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3607  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3608  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3609  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3610  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3611  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3612  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3613  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3614  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3615  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3616  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3617  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3618  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3619  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3620  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3621  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3622  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3623  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3624  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3625  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3626  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3627  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3628  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3629  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3630  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3631  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3632  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3633  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3634  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3635  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3636  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3637  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3638  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3639  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3640  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3641  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3642  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3643  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3644  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3645  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3646  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3647  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3648  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3649  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3650  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3651  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3652  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3653  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3654  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3655  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3656  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3657  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3658  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3659  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3660  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3661  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3662  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3663  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3664  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3665  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3666  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3667  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3668  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3669  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3670  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3671  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3672  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3673  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3674  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3675  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3676  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3677  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3678  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3679  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3680  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3681  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3682  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3683  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3684  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3685  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3686  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3687  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3688  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3689  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3690  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3691  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3692  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3693  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3694  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3695  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3696  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3697  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3698  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3699  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3700  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3701  Info.opc = getOpcForTextureInstr(Intrinsic);
3702  Info.memVT = MVT::v4i32;
3703  Info.ptrVal = nullptr;
3704  Info.offset = 0;
3705  Info.flags = MachineMemOperand::MOLoad;
3706  Info.align = 16;
3707  return true;
3708 
3709  case Intrinsic::nvvm_suld_1d_i8_clamp:
3710  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3711  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3712  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3713  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3714  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3715  case Intrinsic::nvvm_suld_2d_i8_clamp:
3716  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3717  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3718  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3719  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3720  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3721  case Intrinsic::nvvm_suld_3d_i8_clamp:
3722  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3723  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3724  case Intrinsic::nvvm_suld_1d_i8_trap:
3725  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3726  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3727  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3728  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3729  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3730  case Intrinsic::nvvm_suld_2d_i8_trap:
3731  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3732  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3733  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3734  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3735  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3736  case Intrinsic::nvvm_suld_3d_i8_trap:
3737  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3738  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3739  case Intrinsic::nvvm_suld_1d_i8_zero:
3740  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3741  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3742  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3743  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3744  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3745  case Intrinsic::nvvm_suld_2d_i8_zero:
3746  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3747  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3748  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3749  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3750  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3751  case Intrinsic::nvvm_suld_3d_i8_zero:
3752  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3753  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3754  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3755  Info.memVT = MVT::i8;
3756  Info.ptrVal = nullptr;
3757  Info.offset = 0;
3758  Info.flags = MachineMemOperand::MOLoad;
3759  Info.align = 16;
3760  return true;
3761 
3762  case Intrinsic::nvvm_suld_1d_i16_clamp:
3763  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3764  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3765  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3766  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3767  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3768  case Intrinsic::nvvm_suld_2d_i16_clamp:
3769  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3770  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3771  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3772  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3773  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3774  case Intrinsic::nvvm_suld_3d_i16_clamp:
3775  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3776  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3777  case Intrinsic::nvvm_suld_1d_i16_trap:
3778  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3779  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3780  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3781  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3782  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3783  case Intrinsic::nvvm_suld_2d_i16_trap:
3784  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3785  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3786  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3787  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3788  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3789  case Intrinsic::nvvm_suld_3d_i16_trap:
3790  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3791  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3792  case Intrinsic::nvvm_suld_1d_i16_zero:
3793  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3794  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3795  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3796  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3797  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3798  case Intrinsic::nvvm_suld_2d_i16_zero:
3799  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3800  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3801  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3802  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3803  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3804  case Intrinsic::nvvm_suld_3d_i16_zero:
3805  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3806  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3807  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3808  Info.memVT = MVT::i16;
3809  Info.ptrVal = nullptr;
3810  Info.offset = 0;
3811  Info.flags = MachineMemOperand::MOLoad;
3812  Info.align = 16;
3813  return true;
3814 
3815  case Intrinsic::nvvm_suld_1d_i32_clamp:
3816  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3817  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3818  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3819  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3820  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3821  case Intrinsic::nvvm_suld_2d_i32_clamp:
3822  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3823  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3824  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3825  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3826  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3827  case Intrinsic::nvvm_suld_3d_i32_clamp:
3828  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3829  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3830  case Intrinsic::nvvm_suld_1d_i32_trap:
3831  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3832  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3833  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3834  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3835  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3836  case Intrinsic::nvvm_suld_2d_i32_trap:
3837  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3838  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3839  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3840  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3841  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3842  case Intrinsic::nvvm_suld_3d_i32_trap:
3843  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3844  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3845  case Intrinsic::nvvm_suld_1d_i32_zero:
3846  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3847  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3848  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3849  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3850  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3851  case Intrinsic::nvvm_suld_2d_i32_zero:
3852  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3853  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3854  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3855  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3856  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3857  case Intrinsic::nvvm_suld_3d_i32_zero:
3858  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3859  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3860  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3861  Info.memVT = MVT::i32;
3862  Info.ptrVal = nullptr;
3863  Info.offset = 0;
3864  Info.flags = MachineMemOperand::MOLoad;
3865  Info.align = 16;
3866  return true;
3867 
3868  case Intrinsic::nvvm_suld_1d_i64_clamp:
3869  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3870  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3871  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3872  case Intrinsic::nvvm_suld_2d_i64_clamp:
3873  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3874  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3875  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3876  case Intrinsic::nvvm_suld_3d_i64_clamp:
3877  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3878  case Intrinsic::nvvm_suld_1d_i64_trap:
3879  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3880  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3881  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3882  case Intrinsic::nvvm_suld_2d_i64_trap:
3883  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3884  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3885  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3886  case Intrinsic::nvvm_suld_3d_i64_trap:
3887  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3888  case Intrinsic::nvvm_suld_1d_i64_zero:
3889  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3890  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3891  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3892  case Intrinsic::nvvm_suld_2d_i64_zero:
3893  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3894  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3895  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3896  case Intrinsic::nvvm_suld_3d_i64_zero:
3897  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3898  Info.opc = getOpcForSurfaceInstr(Intrinsic);
3899  Info.memVT = MVT::i64;
3900  Info.ptrVal = nullptr;
3901  Info.offset = 0;
3902  Info.flags = MachineMemOperand::MOLoad;
3903  Info.align = 16;
3904  return true;
3905  }
3906  return false;
3907 }
3908 
3909 /// isLegalAddressingMode - Return true if the addressing mode represented
3910 /// by AM is legal for this target, for a load/store of the specified type.
3911 /// Used to guide target specific optimizations, like loop strength reduction
3912 /// (LoopStrengthReduce.cpp) and memory optimization for address mode
3913 /// (CodeGenPrepare.cpp)
3914 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
3915  const AddrMode &AM, Type *Ty,
3916  unsigned AS, Instruction *I) const {
3917  // AddrMode - This represents an addressing mode of:
3918  // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
3919  //
3920  // The legal address modes are
3921  // - [avar]
3922  // - [areg]
3923  // - [areg+immoff]
3924  // - [immAddr]
3925 
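  // For example, a PTX access like "ld.f32 %f1, [%rd1+16];" uses the
  // [areg+immoff] form accepted here, whereas a reg+reg form such as
  // [%rd1+%rd2] is rejected below and must be formed with an explicit add.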
3926  if (AM.BaseGV) {
3927  return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
3928  }
3929 
3930  switch (AM.Scale) {
3931  case 0: // "r", "r+i" or "i" is allowed
3932  break;
3933  case 1:
3934  if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
3935  return false;
3936  // Otherwise we have r+i.
3937  break;
3938  default:
3939  // No scale > 1 is allowed
3940  return false;
3941  }
3942  return true;
3943 }
3944 
3945 //===----------------------------------------------------------------------===//
3946 // NVPTX Inline Assembly Support
3947 //===----------------------------------------------------------------------===//
3948 
3949 /// getConstraintType - Given a constraint letter, return the type of
3950 /// constraint it is for this target.
3951 NVPTXTargetLowering::ConstraintType
3952 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
3953  if (Constraint.size() == 1) {
3954  switch (Constraint[0]) {
3955  default:
3956  break;
3957  case 'b':
3958  case 'r':
3959  case 'h':
3960  case 'c':
3961  case 'l':
3962  case 'f':
3963  case 'd':
3964  case '0':
3965  case 'N':
3966  return C_RegisterClass;
3967  }
3968  }
3969  return TargetLowering::getConstraintType(Constraint);
3970 }
3971 
3972 std::pair<unsigned, const TargetRegisterClass *>
3973 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
3974  StringRef Constraint,
3975  MVT VT) const {
3976  if (Constraint.size() == 1) {
3977  switch (Constraint[0]) {
3978  case 'b':
3979  return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
3980  case 'c':
3981  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
3982  case 'h':
3983  return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
3984  case 'r':
3985  return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
3986  case 'l':
3987  case 'N':
3988  return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
3989  case 'f':
3990  return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
3991  case 'd':
3992  return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
3993  }
3994  }
3995  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3996 }
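// Illustrative use from inline asm: in
//   asm("add.s32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b));
// each "r" constraint selects Int32Regs above, while "l", "f" and "d" select
// 64-bit integer, f32 and f64 registers respectively.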
3997 
3998 //===----------------------------------------------------------------------===//
3999 // NVPTX DAG Combining
4000 //===----------------------------------------------------------------------===//
4001 
4002 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4003  CodeGenOpt::Level OptLevel) const {
4004  // Always honor command-line argument
4005  if (FMAContractLevelOpt.getNumOccurrences() > 0)
4006  return FMAContractLevelOpt > 0;
4007 
4008  // Do not contract if we're not optimizing the code.
4009  if (OptLevel == 0)
4010  return false;
4011 
4012  // Honor TargetOptions flags that explicitly say fusion is okay.
4013  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4014  return true;
4015 
4016  return allowUnsafeFPMath(MF);
4017 }
4018 
4019 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4020  // Honor TargetOptions flags that explicitly say unsafe math is okay.
4021  if (MF.getTarget().Options.UnsafeFPMath)
4022  return true;
4023 
4024  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4025  const Function &F = MF.getFunction();
4026  if (F.hasFnAttribute("unsafe-fp-math")) {
4027  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
4028  StringRef Val = Attr.getValueAsString();
4029  if (Val == "true")
4030  return true;
4031  }
4032 
4033  return false;
4034 }
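// For example, a function emitted by clang under -ffast-math typically carries
// the attribute "unsafe-fp-math"="true", which enables the relaxed lowering
// here even when the global UnsafeFPMath target option is off.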
4035 
4036 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4037 /// operands N0 and N1. This is a helper for PerformADDCombine that is
4038 /// called with the default operands, and if that fails, with commuted
4039 /// operands.
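/// For example, (add (mul a, b), c) on i32 becomes NVPTXISD::IMAD when the
/// mul has a single use, and (fadd (fmul a, b), c) becomes ISD::FMA when
/// allowFMA() permits it and the register-pressure heuristics below hold.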
4040 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4041  TargetLowering::DAGCombinerInfo &DCI,
4042  const NVPTXSubtarget &Subtarget,
4043  CodeGenOpt::Level OptLevel) {
4044  SelectionDAG &DAG = DCI.DAG;
4045  // Skip the vector case; this combine handles scalar values only.
4046  EVT VT = N0.getValueType();
4047  if (VT.isVector())
4048  return SDValue();
4049 
4050  // fold (add (mul a, b), c) -> (mad a, b, c)
4051  //
4052  if (N0.getOpcode() == ISD::MUL) {
4053  assert(VT.isInteger());
4054  // For integer:
4055  // Since integer multiply-add costs the same as integer multiply
4056  // but is more costly than integer add, do the fusion only when
4057  // the mul is only used in the add.
4058  if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
4059  !N0.getNode()->hasOneUse())
4060  return SDValue();
4061 
4062  // Do the folding
4063  return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4064  N0.getOperand(0), N0.getOperand(1), N1);
4065  }
4066  else if (N0.getOpcode() == ISD::FMUL) {
4067  if (VT == MVT::f32 || VT == MVT::f64) {
4068  const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4069  &DAG.getTargetLoweringInfo());
4070  if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4071  return SDValue();
4072 
4073  // For floating point:
4074  // Do the fusion only when the mul has fewer than 5 uses and all
4075  // of them are adds.
4076  // The heuristic is that if a use is not an add, then that use
4077  // cannot be fused into an fma, so the mul is still needed anyway.
4078  // If there are more than 4 uses, even if they are all adds, fusing
4079  // them will increase register pressure.
4080  //
4081  int numUses = 0;
4082  int nonAddCount = 0;
4083  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4084  UE = N0.getNode()->use_end();
4085  UI != UE; ++UI) {
4086  numUses++;
4087  SDNode *User = *UI;
4088  if (User->getOpcode() != ISD::FADD)
4089  ++nonAddCount;
4090  }
4091  if (numUses >= 5)
4092  return SDValue();
4093  if (nonAddCount) {
4094  int orderNo = N->getIROrder();
4095  int orderNo2 = N0.getNode()->getIROrder();
4096  // Simple heuristic for estimating potential register pressure:
4097  // the IR-order difference is used to measure the distance between
4098  // def and use; the longer the distance, the more likely it is to
4099  // cause register pressure.
4100  if (orderNo - orderNo2 < 500)
4101  return SDValue();
4102 
4103  // Now, check if at least one of the FMUL's operands is live beyond the node N,
4104  // which guarantees that the FMA will not increase register pressure at node N.
4105  bool opIsLive = false;
4106  const SDNode *left = N0.getOperand(0).getNode();
4107  const SDNode *right = N0.getOperand(1).getNode();
4108 
4109  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4110  opIsLive = true;
4111 
4112  if (!opIsLive)
4113  for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) {
4114  SDNode *User = *UI;
4115  int orderNo3 = User->getIROrder();
4116  if (orderNo3 > orderNo) {
4117  opIsLive = true;
4118  break;
4119  }
4120  }
4121 
4122  if (!opIsLive)
4123  for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) {
4124  SDNode *User = *UI;
4125  int orderNo3 = User->getIROrder();
4126  if (orderNo3 > orderNo) {
4127  opIsLive = true;
4128  break;
4129  }
4130  }
4131 
4132  if (!opIsLive)
4133  return SDValue();
4134  }
4135 
4136  return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4137  N0.getOperand(0), N0.getOperand(1), N1);
4138  }
4139  }
4140 
4141  return SDValue();
4142 }
4143 
4144 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4145 ///
4146 static SDValue PerformADDCombine(SDNode *N,
4147  TargetLowering::DAGCombinerInfo &DCI,
4148  const NVPTXSubtarget &Subtarget,
4149  CodeGenOpt::Level OptLevel) {
4150  SDValue N0 = N->getOperand(0);
4151  SDValue N1 = N->getOperand(1);
4152 
4153  // First try with the default operand order.
4154  if (SDValue Result =
4155  PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4156  return Result;
4157 
4158  // If that didn't work, try again with the operands commuted.
4159  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4160 }
4161 
4162 static SDValue PerformANDCombine(SDNode *N,
4163  TargetLowering::DAGCombinerInfo &DCI) {
4164  // The type legalizer turns a vector load of i8 values into a zextload to i16
4165  // registers, optionally ANY_EXTENDs it (if target type is integer),
4166  // and ANDs off the high 8 bits. Since we turn this load into a
4167  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4168  // nodes. Do that here.
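  // For example, (and (any_extend (LoadV4 ... i8)), 0xff) is folded to the
  // load's result, with the any_extend re-emitted as a zero_extend so the
  // high bits are still known to be zero.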
4169  SDValue Val = N->getOperand(0);
4170  SDValue Mask = N->getOperand(1);
4171 
4172  if (isa<ConstantSDNode>(Val)) {
4173  std::swap(Val, Mask);
4174  }
4175 
4176  SDValue AExt;
4177  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4178  if (Val.getOpcode() == ISD::ANY_EXTEND) {
4179  AExt = Val;
4180  Val = Val->getOperand(0);
4181  }
4182 
4183  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4184  Val = Val->getOperand(0);
4185  }
4186 
4187  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4188  Val->getOpcode() == NVPTXISD::LoadV4) {
4189  ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4190  if (!MaskCnst) {
4191  // Not an AND with a constant
4192  return SDValue();
4193  }
4194 
4195  uint64_t MaskVal = MaskCnst->getZExtValue();
4196  if (MaskVal != 0xff) {
4197  // Not an AND that chops off top 8 bits
4198  return SDValue();
4199  }
4200 
4201  MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4202  if (!Mem) {
4203  // Not a MemSDNode?!?
4204  return SDValue();
4205  }
4206 
4207  EVT MemVT = Mem->getMemoryVT();
4208  if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4209  // We only handle the i8 case
4210  return SDValue();
4211  }
4212 
4213  unsigned ExtType =
4214  cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4215  getZExtValue();
4216  if (ExtType == ISD::SEXTLOAD) {
4217  // If for some reason the load is a sextload, the and is needed to zero
4218  // out the high 8 bits
4219  return SDValue();
4220  }
4221 
4222  bool AddTo = false;
4223  if (AExt.getNode() != nullptr) {
4224  // Re-insert the ext as a zext.
4225  Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4226  AExt.getValueType(), Val);
4227  AddTo = true;
4228  }
4229 
4230  // If we get here, the AND is unnecessary. Just replace it with the load
4231  DCI.CombineTo(N, Val, AddTo);
4232  }
4233 
4234  return SDValue();
4235 }
4236 
4237 static SDValue PerformREMCombine(SDNode *N,
4238  TargetLowering::DAGCombinerInfo &DCI,
4239  CodeGenOpt::Level OptLevel) {
4240  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4241 
4242  // Don't do anything at less than -O2.
4243  if (OptLevel < CodeGenOpt::Default)
4244  return SDValue();
4245 
4246  SelectionDAG &DAG = DCI.DAG;
4247  SDLoc DL(N);
4248  EVT VT = N->getValueType(0);
4249  bool IsSigned = N->getOpcode() == ISD::SREM;
4250  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4251 
4252  const SDValue &Num = N->getOperand(0);
4253  const SDValue &Den = N->getOperand(1);
4254 
4255  for (const SDNode *U : Num->uses()) {
4256  if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4257  U->getOperand(1) == Den) {
4258  // Num % Den -> Num - (Num / Den) * Den
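  // For example, a kernel that computes both X / Y and X % Y ends up with a
  // single divide; the remainder is recovered with one mul and one sub.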
4259  return DAG.getNode(ISD::SUB, DL, VT, Num,
4260  DAG.getNode(ISD::MUL, DL, VT,
4261  DAG.getNode(DivOpc, DL, VT, Num, Den),
4262  Den));
4263  }
4264  }
4265  return SDValue();
4266 }
4267 
4268 enum OperandSignedness {
4269  Signed = 0,
4270  Unsigned,
4271  Unknown
4272 };
4273 
4274 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4275 /// that can be demoted to \p OptSize bits without loss of information. The
4276 /// signedness of the operand, if determinable, is placed in \p S.
4277 static bool IsMulWideOperandDemotable(SDValue Op,
4278  unsigned OptSize,
4279  OperandSignedness &S) {
4280  S = Unknown;
4281 
4282  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4283  Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4284  EVT OrigVT = Op.getOperand(0).getValueType();
4285  if (OrigVT.getSizeInBits() <= OptSize) {
4286  S = Signed;
4287  return true;
4288  }
4289  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4290  EVT OrigVT = Op.getOperand(0).getValueType();
4291  if (OrigVT.getSizeInBits() <= OptSize) {
4292  S = Unsigned;
4293  return true;
4294  }
4295  }
4296 
4297  return false;
4298 }
4299 
4300 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4301 /// be demoted to \p OptSize bits without loss of information. If the operands
4302 /// contain a constant, it should appear as the RHS operand. The signedness of
4303 /// the operands is placed in \p IsSigned.
4304 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4305  unsigned OptSize,
4306  bool &IsSigned) {
4307  OperandSignedness LHSSign;
4308 
4309  // The LHS operand must be a demotable op
4310  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4311  return false;
4312 
4313  // We should have been able to determine the signedness from the LHS
4314  if (LHSSign == Unknown)
4315  return false;
4316 
4317  IsSigned = (LHSSign == Signed);
4318 
4319  // The RHS can be a demotable op or a constant
4320  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4321  const APInt &Val = CI->getAPIntValue();
4322  if (LHSSign == Unsigned) {
4323  return Val.isIntN(OptSize);
4324  } else {
4325  return Val.isSignedIntN(OptSize);
4326  }
4327  } else {
4328  OperandSignedness RHSSign;
4329  if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4330  return false;
4331 
4332  return LHSSign == RHSSign;
4333  }
4334 }
4335 
4336 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4337 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4338 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4339 /// amount.
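/// For example, an i32 multiply of two values that are sign-extended from
/// i16 can be selected as PTX mul.wide.s16, and a shl of such a value by a
/// constant amount is handled by rewriting it as a multiply by a power of two.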
4340 static SDValue TryMULWIDECombine(SDNode *N,
4341  TargetLowering::DAGCombinerInfo &DCI) {
4342  EVT MulType = N->getValueType(0);
4343  if (MulType != MVT::i32 && MulType != MVT::i64) {
4344  return SDValue();
4345  }
4346 
4347  SDLoc DL(N);
4348  unsigned OptSize = MulType.getSizeInBits() >> 1;
4349  SDValue LHS = N->getOperand(0);
4350  SDValue RHS = N->getOperand(1);
4351 
4352  // Canonicalize the multiply so the constant (if any) is on the right
4353  if (N->getOpcode() == ISD::MUL) {
4354  if (isa<ConstantSDNode>(LHS)) {
4355  std::swap(LHS, RHS);
4356  }
4357  }
4358 
4359  // If we have a SHL, determine the actual multiply amount
4360  if (N->getOpcode() == ISD::SHL) {
4361  ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4362  if (!ShlRHS) {
4363  return SDValue();
4364  }
4365 
4366  APInt ShiftAmt = ShlRHS->getAPIntValue();
4367  unsigned BitWidth = MulType.getSizeInBits();
4368  if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4369  APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4370  RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4371  } else {
4372  return SDValue();
4373  }
4374  }
4375 
4376  bool Signed;
4377  // Verify that our operands are demotable
4378  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4379  return SDValue();
4380  }
4381 
4382  EVT DemotedVT;
4383  if (MulType == MVT::i32) {
4384  DemotedVT = MVT::i16;
4385  } else {
4386  DemotedVT = MVT::i32;
4387  }
4388 
4389  // Truncate the operands to the correct size. Note that these are just for
4390  // type consistency and will (likely) be eliminated in later phases.
4391  SDValue TruncLHS =
4392  DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4393  SDValue TruncRHS =
4394  DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4395 
4396  unsigned Opc;
4397  if (Signed) {
4398  Opc = NVPTXISD::MUL_WIDE_SIGNED;
4399  } else {
4400  Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4401  }
4402 
4403  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4404 }
4405 
4406 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4407 static SDValue PerformMULCombine(SDNode *N,
4408  TargetLowering::DAGCombinerInfo &DCI,
4409  CodeGenOpt::Level OptLevel) {
4410  if (OptLevel > 0) {
4411  // Try mul.wide combining at OptLevel > 0
4412  if (SDValue Ret = TryMULWIDECombine(N, DCI))
4413  return Ret;
4414  }
4415 
4416  return SDValue();
4417 }
4418 
4419 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4420 static SDValue PerformSHLCombine(SDNode *N,
4421  TargetLowering::DAGCombinerInfo &DCI,
4422  CodeGenOpt::Level OptLevel) {
4423  if (OptLevel > 0) {
4424  // Try mul.wide combining at OptLevel > 0
4425  if (SDValue Ret = TryMULWIDECombine(N, DCI))
4426  return Ret;
4427  }
4428 
4429  return SDValue();
4430 }
4431 
4432 static SDValue PerformSETCCCombine(SDNode *N,
4433  TargetLowering::DAGCombinerInfo &DCI) {
4434  EVT CCType = N->getValueType(0);
4435  SDValue A = N->getOperand(0);
4436  SDValue B = N->getOperand(1);
4437 
4438  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
4439  return SDValue();
4440 
4441  SDLoc DL(N);
4442  // setp.f16x2 returns two scalar predicates, which we need to
4443  // convert back to v2i1. The returned result will be scalarized by
4444  // the legalizer, but the comparison will remain a single vector
4445  // instruction.
4446  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
4447  DCI.DAG.getVTList(MVT::i1, MVT::i1),
4448  {A, B, N->getOperand(2)});
4449  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
4450  CCNode.getValue(1));
4451 }
4452 
4453 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4454  DAGCombinerInfo &DCI) const {
4455  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4456  switch (N->getOpcode()) {
4457  default: break;
4458  case ISD::ADD:
4459  case ISD::FADD:
4460  return PerformADDCombine(N, DCI, STI, OptLevel);
4461  case ISD::MUL:
4462  return PerformMULCombine(N, DCI, OptLevel);
4463  case ISD::SHL:
4464  return PerformSHLCombine(N, DCI, OptLevel);
4465  case ISD::AND:
4466  return PerformANDCombine(N, DCI);
4467  case ISD::UREM:
4468  case ISD::SREM:
4469  return PerformREMCombine(N, DCI, OptLevel);
4470  case ISD::SETCC:
4471  return PerformSETCCCombine(N, DCI);
4472  }
4473  return SDValue();
4474 }
4475 
4476 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
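/// For example, a sufficiently aligned load of <4 x float> becomes a single
/// NVPTXISD::LoadV4 node producing four scalar f32 values plus a chain, which
/// are then recombined into the original vector with a BUILD_VECTOR.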
4477 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4478  SmallVectorImpl<SDValue> &Results) {
4479  EVT ResVT = N->getValueType(0);
4480  SDLoc DL(N);
4481 
4482  assert(ResVT.isVector() && "Vector load must have vector type");
4483 
4484  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4485  // legal. We can (and should) split that into 2 loads of <2 x double> here
4486  // but I'm leaving that as a TODO for now.
4487  assert(ResVT.isSimple() && "Can only handle simple types");
4488  switch (ResVT.getSimpleVT().SimpleTy) {
4489  default:
4490  return;
4491  case MVT::v2i8:
4492  case MVT::v2i16:
4493  case MVT::v2i32:
4494  case MVT::v2i64:
4495  case MVT::v2f16:
4496  case MVT::v2f32:
4497  case MVT::v2f64:
4498  case MVT::v4i8:
4499  case MVT::v4i16:
4500  case MVT::v4i32:
4501  case MVT::v4f16:
4502  case MVT::v4f32:
4503  case MVT::v8f16: // <4 x f16x2>
4504  // This is a "native" vector type
4505  break;
4506  }
4507 
4508  LoadSDNode *LD = cast<LoadSDNode>(N);
4509 
4510  unsigned Align = LD->getAlignment();
4511  auto &TD = DAG.getDataLayout();
4512  unsigned PrefAlign =
4513  TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
4514  if (Align < PrefAlign) {
4515  // This load is not sufficiently aligned, so bail out and let this vector
4516  // load be scalarized. Note that we may still be able to emit smaller
4517  // vector loads. For example, if we are loading a <4 x float> with an
4518  // alignment of 8, this check will fail but the legalizer will try again
4519  // with 2 x <2 x float>, which will succeed with an alignment of 8.
4520  return;
4521  }
4522 
4523  EVT EltVT = ResVT.getVectorElementType();
4524  unsigned NumElts = ResVT.getVectorNumElements();
4525 
4526  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
4527  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
4528  // loaded type to i16 and propagate the "real" type as the memory type.
4529  bool NeedTrunc = false;
4530  if (EltVT.getSizeInBits() < 16) {
4531  EltVT = MVT::i16;
4532  NeedTrunc = true;
4533  }
4534 
4535  unsigned Opcode = 0;
4536  SDVTList LdResVTs;
4537  bool LoadF16x2 = false;
4538 
4539  switch (NumElts) {
4540  default:
4541  return;
4542  case 2:
4543  Opcode = NVPTXISD::LoadV2;
4544  LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4545  break;
4546  case 4: {
4547  Opcode = NVPTXISD::LoadV4;
4548  EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4549  LdResVTs = DAG.getVTList(ListVTs);
4550  break;
4551  }
4552  case 8: {
4553  // v8f16 is a special case. PTX doesn't have ld.v8.f16
4554  // instruction. Instead, we split the vector into v2f16 chunks and
4555  // load them with ld.v4.b32.
4556  assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
4557  LoadF16x2 = true;
4558  Opcode = NVPTXISD::LoadV4;
4559  EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
4560  MVT::Other};
4561  LdResVTs = DAG.getVTList(ListVTs);
4562  break;
4563  }
4564  }
4565 
4566  // Copy regular operands
4567  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
4568 
4569  // The select routine does not have access to the LoadSDNode instance, so
4570  // pass along the extension information
4571  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
4572 
4573  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4574  LD->getMemoryVT(),
4575  LD->getMemOperand());
4576 
4577  SmallVector<SDValue, 8> ScalarRes;
4578  if (LoadF16x2) {
4579  // Split v2f16 subvectors back into individual elements.
4580  NumElts /= 2;
4581  for (unsigned i = 0; i < NumElts; ++i) {
4582  SDValue SubVector = NewLD.getValue(i);
4583  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4584  DAG.getIntPtrConstant(0, DL));
4585  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4586  DAG.getIntPtrConstant(1, DL));
4587  ScalarRes.push_back(E0);
4588  ScalarRes.push_back(E1);
4589  }
4590  } else {
4591  for (unsigned i = 0; i < NumElts; ++i) {
4592  SDValue Res = NewLD.getValue(i);
4593  if (NeedTrunc)
4594  Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4595  ScalarRes.push_back(Res);
4596  }
4597  }
4598 
4599  SDValue LoadChain = NewLD.getValue(NumElts);
4600 
4601  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
4602 
4603  Results.push_back(BuildVec);
4604  Results.push_back(LoadChain);
4605 }
4606 
4607 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
4608  SmallVectorImpl<SDValue> &Results) {
4609  SDValue Chain = N->getOperand(0);
4610  SDValue Intrin = N->getOperand(1);
4611  SDLoc DL(N);
4612 
4613  // Get the intrinsic ID
4614  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
4615  switch (IntrinNo) {
4616