1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/StringRef.h"
35#include "llvm/IR/Argument.h"
36#include "llvm/IR/Attributes.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
41#include "llvm/IR/FPEnv.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/GlobalValue.h"
44#include "llvm/IR/Instruction.h"
46#include "llvm/IR/IntrinsicsNVPTX.h"
47#include "llvm/IR/Module.h"
48#include "llvm/IR/Type.h"
49#include "llvm/IR/Value.h"
58#include <algorithm>
59#include <cassert>
60#include <cmath>
61#include <cstdint>
62#include <iterator>
63#include <optional>
64#include <sstream>
65#include <string>
66#include <utility>
67#include <vector>
68
69#define DEBUG_TYPE "nvptx-lower"
70
71using namespace llvm;
72
73static std::atomic<unsigned> GlobalUniqueCallSite;
74
76 "nvptx-sched4reg",
77 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
78
80 "nvptx-fma-level", cl::Hidden,
81 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
82 " 1: do it 2: do it aggressively"),
83 cl::init(2));
84
86 "nvptx-prec-divf32", cl::Hidden,
87 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
88 " IEEE Compliant F32 div.rnd if available."),
89 cl::init(2));
90
92 "nvptx-prec-sqrtf32", cl::Hidden,
93 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
94 cl::init(true));
95
97 "nvptx-force-min-byval-param-align", cl::Hidden,
98 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
99 " params of device functions."),
100 cl::init(false));
101
102 int NVPTXTargetLowering::getDivF32Level() const {
103 if (UsePrecDivF32.getNumOccurrences() > 0) {
104 // If nvptx-prec-divf32=N is used on the command-line, always honor it
105 return UsePrecDivF32;
106 } else {
107 // Otherwise, use div.approx if fast math is enabled
108 if (getTargetMachine().Options.UnsafeFPMath)
109 return 0;
110 else
111 return 2;
112 }
113}
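
// Illustrative usage (an assumed invocation, not from this file): running
//   llc -nvptx-prec-divf32=0 ...
// makes getDivF32Level() return 0, so f32 division is lowered to div.approx;
// with no flag and no fast-math, the default of 2 selects the IEEE-compliant
// div.rnd form when available.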
114
115 bool NVPTXTargetLowering::usePrecSqrtF32() const {
116 if (UsePrecSqrtF32.getNumOccurrences() > 0) {
117 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
118 return UsePrecSqrtF32;
119 } else {
120 // Otherwise, use sqrt.approx if fast math is enabled
121 return !getTargetMachine().Options.UnsafeFPMath;
122 }
123}
124
125 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
126 return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
127 DenormalMode::PreserveSign;
128}
129
130static bool IsPTXVectorType(MVT VT) {
131 switch (VT.SimpleTy) {
132 default:
133 return false;
134 case MVT::v2i1:
135 case MVT::v4i1:
136 case MVT::v2i8:
137 case MVT::v4i8:
138 case MVT::v2i16:
139 case MVT::v4i16:
140 case MVT::v8i16: // <4 x i16x2>
141 case MVT::v2i32:
142 case MVT::v4i32:
143 case MVT::v2i64:
144 case MVT::v2f16:
145 case MVT::v4f16:
146 case MVT::v8f16: // <4 x f16x2>
147 case MVT::v2bf16:
148 case MVT::v4bf16:
149 case MVT::v8bf16: // <4 x bf16x2>
150 case MVT::v2f32:
151 case MVT::v4f32:
152 case MVT::v2f64:
153 return true;
154 }
155}
156
157static bool Is16bitsType(MVT VT) {
158 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
159 VT.SimpleTy == MVT::i16);
160}
161
162/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
163/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
164/// into their primitive components.
165/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
166/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
167/// LowerCall, and LowerReturn.
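/// For example (illustrative): a <4 x float> piece becomes four f32 values at
/// offsets {0, 4, 8, 12}, a <4 x half> piece becomes two v2f16 values at
/// offsets {0, 4}, and an i128 becomes two i64 values at offsets {0, 8}.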
168static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
169 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
170 SmallVectorImpl<uint64_t> *Offsets = nullptr,
171 uint64_t StartingOffset = 0) {
172 SmallVector<EVT, 16> TempVTs;
173 SmallVector<uint64_t, 16> TempOffsets;
174
175 // Special case for i128 - decompose to (i64, i64)
176 if (Ty->isIntegerTy(128)) {
177 ValueVTs.push_back(EVT(MVT::i64));
178 ValueVTs.push_back(EVT(MVT::i64));
179
180 if (Offsets) {
181 Offsets->push_back(StartingOffset + 0);
182 Offsets->push_back(StartingOffset + 8);
183 }
184
185 return;
186 }
187
188 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
189 if (StructType *STy = dyn_cast<StructType>(Ty)) {
190 auto const *SL = DL.getStructLayout(STy);
191 auto ElementNum = 0;
192 for(auto *EI : STy->elements()) {
193 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
194 StartingOffset + SL->getElementOffset(ElementNum));
195 ++ElementNum;
196 }
197 return;
198 }
199
200 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
201 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
202 EVT VT = TempVTs[i];
203 uint64_t Off = TempOffsets[i];
204 // Split vectors into individual elements, except for v2f16, which
205 // we will pass as a single scalar.
206 if (VT.isVector()) {
207 unsigned NumElts = VT.getVectorNumElements();
208 EVT EltVT = VT.getVectorElementType();
209 // Vectors with an even number of f16 elements will be passed to
210 // us as an array of v2f16/v2bf16 elements. We must match this so we
211 // stay in sync with Ins/Outs.
212 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
213 switch (EltVT.getSimpleVT().SimpleTy) {
214 case MVT::f16:
215 EltVT = MVT::v2f16;
216 break;
217 case MVT::bf16:
218 EltVT = MVT::v2bf16;
219 break;
220 case MVT::i16:
221 EltVT = MVT::v2i16;
222 break;
223 default:
224 llvm_unreachable("Unexpected type");
225 }
226 NumElts /= 2;
227 } else if (EltVT.getSimpleVT() == MVT::i8 &&
228 (NumElts % 4 == 0 || NumElts == 3)) {
229 // v*i8 are formally lowered as v4i8
230 EltVT = MVT::v4i8;
231 NumElts = (NumElts + 3) / 4;
232 }
233 for (unsigned j = 0; j != NumElts; ++j) {
234 ValueVTs.push_back(EltVT);
235 if (Offsets)
236 Offsets->push_back(Off + j * EltVT.getStoreSize());
237 }
238 } else {
239 ValueVTs.push_back(VT);
240 if (Offsets)
241 Offsets->push_back(Off);
242 }
243 }
244}
245
246/// PromoteScalarIntegerPTX
247/// Used to make sure the arguments/returns are suitable for passing
248/// and to promote them to a larger size if they're not.
249///
250/// The promoted type is placed in \p PromotedVT if the function returns true.
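/// For example (illustrative): an i12 value is promoted to i16 and an i33
/// value to i64 (returning true), while an i32 value is already suitable and
/// the function returns false.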
251static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
252 if (VT.isScalarInteger()) {
253 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
254 default:
256 "Promotion is not suitable for scalars of size larger than 64-bits");
257 case 1:
258 *PromotedVT = MVT::i1;
259 break;
260 case 2:
261 case 4:
262 case 8:
263 *PromotedVT = MVT::i8;
264 break;
265 case 16:
266 *PromotedVT = MVT::i16;
267 break;
268 case 32:
269 *PromotedVT = MVT::i32;
270 break;
271 case 64:
272 *PromotedVT = MVT::i64;
273 break;
274 }
275 return EVT(*PromotedVT) != VT;
276 }
277 return false;
278}
279
280// Check whether we can merge loads/stores of some of the pieces of a
281// flattened function parameter or return value into a single vector
282// load/store.
283//
284// The flattened parameter is represented as a list of EVTs and
285// offsets, and the whole structure is aligned to ParamAlignment. This
286// function determines whether we can load/store pieces of the
287// parameter starting at index Idx using a single vectorized op of
288// size AccessSize. If so, it returns the number of param pieces
289// covered by the vector op. Otherwise, it returns 1.
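// For example (illustrative): four f32 pieces at offsets {0, 4, 8, 12} inside
// a 16-byte-aligned parameter can be covered by one 16-byte access starting at
// Idx 0, so this returns 4; if the third piece were an i8 instead, the type
// mismatch would make it return 1 for that access size.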
290 static unsigned CanMergeParamLoadStoresStartingAt(
291 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
292 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
293
294 // Can't vectorize if param alignment is not sufficient.
295 if (ParamAlignment < AccessSize)
296 return 1;
297 // Can't vectorize if offset is not aligned.
298 if (Offsets[Idx] & (AccessSize - 1))
299 return 1;
300
301 EVT EltVT = ValueVTs[Idx];
302 unsigned EltSize = EltVT.getStoreSize();
303
304 // Element is too large to vectorize.
305 if (EltSize >= AccessSize)
306 return 1;
307
308 unsigned NumElts = AccessSize / EltSize;
309 // Can't vectorize if AccessSize is not a multiple of EltSize.
310 if (AccessSize != EltSize * NumElts)
311 return 1;
312
313 // We don't have enough elements to vectorize.
314 if (Idx + NumElts > ValueVTs.size())
315 return 1;
316
317 // PTX ISA can only deal with 2- and 4-element vector ops.
318 if (NumElts != 4 && NumElts != 2)
319 return 1;
320
321 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
322 // Types do not match.
323 if (ValueVTs[j] != EltVT)
324 return 1;
325
326 // Elements are not contiguous.
327 if (Offsets[j] - Offsets[j - 1] != EltSize)
328 return 1;
329 }
330 // OK. We can vectorize ValueVTs[i..i+NumElts)
331 return NumElts;
332}
333
334// Flags for tracking per-element vectorization state of loads/stores
335// of a flattened function parameter or return value.
336 enum ParamVectorizationFlags {
337 PVF_INNER = 0x0, // Middle elements of a vector.
338 PVF_FIRST = 0x1, // First element of the vector.
339 PVF_LAST = 0x2, // Last element of the vector.
340 // Scalar is effectively a 1-element vector.
341 PVF_SCALAR = PVF_FIRST | PVF_LAST,
342 };
343
344// Computes whether and how we can vectorize the loads/stores of a
345// flattened function parameter or return value.
346//
347// The flattened parameter is represented as the list of ValueVTs and
348// Offsets, and is aligned to ParamAlignment bytes. We return a vector
349// of the same size as ValueVTs indicating how each piece should be
350// loaded/stored (i.e. as a scalar, or as part of a vector
351// load/store).
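// For example (illustrative): four contiguous f32 pieces of a 16-byte-aligned
// parameter come back as {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST} (a single
// 4-element access), while an isolated i8 piece stays PVF_SCALAR.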
352 static SmallVector<ParamVectorizationFlags, 16>
353 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
354 const SmallVectorImpl<uint64_t> &Offsets,
355 Align ParamAlignment, bool IsVAArg = false) {
356 // Set vector size to match ValueVTs and mark all elements as
357 // scalars by default.
358 SmallVector<ParamVectorizationFlags, 16> VectorInfo;
359 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
360
361 if (IsVAArg)
362 return VectorInfo;
363
364 // Check what we can vectorize using 128/64/32-bit accesses.
365 for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
366 // Skip elements we've already processed.
367 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
368 for (unsigned AccessSize : {16, 8, 4, 2}) {
369 unsigned NumElts = CanMergeParamLoadStoresStartingAt(
370 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
371 // Mark vectorized elements.
372 switch (NumElts) {
373 default:
374 llvm_unreachable("Unexpected return value");
375 case 1:
376 // Can't vectorize using this size, try next smaller size.
377 continue;
378 case 2:
379 assert(I + 1 < E && "Not enough elements.");
380 VectorInfo[I] = PVF_FIRST;
381 VectorInfo[I + 1] = PVF_LAST;
382 I += 1;
383 break;
384 case 4:
385 assert(I + 3 < E && "Not enough elements.");
386 VectorInfo[I] = PVF_FIRST;
387 VectorInfo[I + 1] = PVF_INNER;
388 VectorInfo[I + 2] = PVF_INNER;
389 VectorInfo[I + 3] = PVF_LAST;
390 I += 3;
391 break;
392 }
393 // Break out of the inner loop because we've already succeeded
394 // using largest possible AccessSize.
395 break;
396 }
397 }
398 return VectorInfo;
399}
400
401// NVPTXTargetLowering Constructor.
402 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
403 const NVPTXSubtarget &STI)
404 : TargetLowering(TM), nvTM(&TM), STI(STI) {
405 // always lower memset, memcpy, and memmove intrinsics to load/store
406 // instructions, rather than generating calls to memset, memcpy, or
407 // memmove.
411
414
415 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
416 // condition branches.
417 setJumpIsExpensive(true);
418
419 // Wide divides are _very_ slow. Try to reduce the width of the divide if
420 // possible.
421 addBypassSlowDiv(64, 32);
422
423 // By default, use the Source scheduling
424 if (sched4reg)
426 else
428
429 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
430 LegalizeAction NoF16Action) {
431 setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
432 };
433
434 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
435 LegalizeAction NoBF16Action) {
436 bool IsOpSupported = STI.hasBF16Math();
437 // A few of these instructions are only available on sm_90.
438 switch(Op) {
439 case ISD::FADD:
440 case ISD::FMUL:
441 case ISD::FSUB:
442 case ISD::SELECT:
443 case ISD::SELECT_CC:
444 case ISD::SETCC:
445 case ISD::FEXP2:
446 case ISD::FCEIL:
447 case ISD::FFLOOR:
448 case ISD::FNEARBYINT:
449 case ISD::FRINT:
450 case ISD::FROUNDEVEN:
451 case ISD::FTRUNC:
452 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
453 break;
454 }
455 setOperationAction(
456 Op, VT, IsOpSupported ? Action : NoBF16Action);
457 };
458
459 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
460 LegalizeAction NoI16x2Action) {
461 bool IsOpSupported = false;
462 // instructions are available on sm_90 only
463 switch (Op) {
464 case ISD::ADD:
465 case ISD::SMAX:
466 case ISD::SMIN:
467 case ISD::UMIN:
468 case ISD::UMAX:
469 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
470 break;
471 }
472 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
473 };
474
475 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
476 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
477 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
478 addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
479 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
480 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
481 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
482 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
483 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
484 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
485 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
486 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
487
488 // Conversion to/from FP16/FP16x2 is always legal.
493
495 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
497
498 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
499 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
500
501 // Conversion to/from BF16/BF16x2 is always legal.
506
507 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
508 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
509 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
510 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
511
512 // Conversion to/from i16/i16x2 is always legal.
517
522 // Only logical ops can be done on v4i8 directly, others must be done
523 // elementwise.
540 MVT::v4i8, Expand);
541
542 // Operations not directly supported by NVPTX.
543 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
544 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
545 MVT::i32, MVT::i64}) {
548 }
549
550 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
551 // For others we will expand to a SHL/SRA pair.
558
565
568
569 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
570 // that don't have h/w rotation we lower them to multi-instruction assembly.
571 // See ROT*_sw in NVPTXIntrInfo.td
576
578 setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
580 setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
584
585 // Indirect branch is not supported.
586 // This also disables Jump Table creation.
589
592
593 // We want to legalize constant-related memmove and memcpy
594 // intrinsics.
596
597 // Turn FP extload into load/fpextend
598 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
599 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
600 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
601 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
602 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
603 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
604 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
605 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
606 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
607 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
608 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
609 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
610 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
611 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
612 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
613 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
614 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
615 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
616 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
617 // Turn FP truncstore into trunc + store.
618 // FIXME: vector types should also be expanded
619 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
620 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
621 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
622 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
623 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
624
625 // PTX does not support load / store predicate registers
628
629 for (MVT VT : MVT::integer_valuetypes()) {
632 setTruncStoreAction(VT, MVT::i1, Expand);
633 }
634
635 // expand extload of vector of integers.
637 MVT::v2i8, Expand);
638 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
639
640 // This is legal in NVPTX
645
648
649 // TRAP can be lowered to PTX trap
650 setOperationAction(ISD::TRAP, MVT::Other, Legal);
651
652 // Register custom handling for vector loads/stores
654 if (IsPTXVectorType(VT)) {
658 }
659 }
660
661 // Support varargs.
666
667 // Custom handling for i8 intrinsics
669
670 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
676
679 }
680
681 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
682 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
683 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
684 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
685 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
686 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
687 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
688
689 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
690 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
691 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
692 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
693 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
694 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
695
696 // Other arithmetic and logic ops are unsupported.
700 MVT::v2i16, Expand);
701
706 if (STI.getPTXVersion() >= 43) {
711 }
712
714 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
717
718 // PTX does not directly support SELP of i1, so promote to i32 first
720
721 // PTX cannot multiply two i64s in a single instruction.
724
725 // We have some custom DAG combine patterns for these nodes
728 ISD::VSELECT});
729
730 // setcc for f16x2 and bf16x2 needs special handling to prevent the
731 // legalizer's attempt to scalarize it, since v2i1 is not legal.
732 if (STI.allowFP16Math() || STI.hasBF16Math())
734
735 // Promote fp16 arithmetic if fp16 hardware isn't available or the
736 // user passed --nvptx-no-fp16-math. The flag is useful because,
737 // although sm_53+ GPUs have some sort of FP16 support in
738 // hardware, only sm_53 and sm_60 have full implementation. Others
739 // only have a token amount of hardware and are likely to run faster
740 // by using fp32 units instead.
741 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
742 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
743 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
744 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
745 // bf16 must be promoted to f32.
746 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
747 if (getOperationAction(Op, MVT::bf16) == Promote)
748 AddPromotedToType(Op, MVT::bf16, MVT::f32);
749 }
750
751 // f16/f16x2 neg was introduced in PTX 60, SM_53.
752 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
753 STI.getPTXVersion() >= 60 &&
754 STI.allowFP16Math();
755 for (const auto &VT : {MVT::f16, MVT::v2f16})
757 IsFP16FP16x2NegAvailable ? Legal : Expand);
758
759 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
760 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
761 // (would be) Library functions.
762
763 // These map to conversion instructions for scalar FP types.
764 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
766 setOperationAction(Op, MVT::f16, Legal);
767 setOperationAction(Op, MVT::f32, Legal);
768 setOperationAction(Op, MVT::f64, Legal);
769 setOperationAction(Op, MVT::v2f16, Expand);
770 setOperationAction(Op, MVT::v2bf16, Expand);
771 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
772 if (getOperationAction(Op, MVT::bf16) == Promote)
773 AddPromotedToType(Op, MVT::bf16, MVT::f32);
774 }
775
776 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
778 }
779 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
780 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
783 }
784 }
785
786 // sm_80 only has conversions between f32 and bf16. Custom lower all other
787 // bf16 conversions.
788 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
789 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
792 VT, Custom);
793 }
796 MVT::bf16, Custom);
797 }
798
805 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
806
807 // 'Expand' implements FCOPYSIGN without calling an external library.
814
815 // These map to corresponding instructions for f32/f64. f16 must be
816 // promoted to f32. v2f16 is expanded to f16, which is then promoted
817 // to f32.
818 for (const auto &Op :
820 setOperationAction(Op, MVT::f16, Promote);
821 setOperationAction(Op, MVT::f32, Legal);
822 setOperationAction(Op, MVT::f64, Legal);
823 setOperationAction(Op, MVT::v2f16, Expand);
824 setOperationAction(Op, MVT::v2bf16, Expand);
825 setOperationAction(Op, MVT::bf16, Promote);
826 AddPromotedToType(Op, MVT::bf16, MVT::f32);
827 }
828 for (const auto &Op : {ISD::FABS}) {
829 setOperationAction(Op, MVT::f16, Promote);
830 setOperationAction(Op, MVT::f32, Legal);
831 setOperationAction(Op, MVT::f64, Legal);
832 setOperationAction(Op, MVT::v2f16, Expand);
833 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
834 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
835 if (getOperationAction(Op, MVT::bf16) == Promote)
836 AddPromotedToType(Op, MVT::bf16, MVT::f32);
837 }
838
839 // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
840 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
841 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
842 return IsAtLeastSm80 ? Legal : NotSm80Action;
843 };
844 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
845 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
846 setOperationAction(Op, MVT::f32, Legal);
847 setOperationAction(Op, MVT::f64, Legal);
848 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
849 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
850 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
851 if (getOperationAction(Op, MVT::bf16) == Promote)
852 AddPromotedToType(Op, MVT::bf16, MVT::f32);
853 }
854 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
855 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
856 setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
857 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
858 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
859 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
860 }
861
862 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
863 // No FPOW or FREM in PTX.
864
865 // Now deduce the information based on the above mentioned
866 // actions
868
871}
872
873const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
874
875#define MAKE_CASE(V) \
876 case V: \
877 return #V;
878
879 switch ((NVPTXISD::NodeType)Opcode) {
880 case NVPTXISD::FIRST_NUMBER:
881 break;
882
1026
1117
1129
1141
1153
1165
1177
1189
1201
1213
1225
1237
1249
1261
1273
1285
1297 }
1298 return nullptr;
1299
1300#undef MAKE_CASE
1301}
1302
1303 TargetLoweringBase::LegalizeTypeAction
1304 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1305 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1306 VT.getScalarType() == MVT::i1)
1307 return TypeSplitVector;
1308 if (Isv2x16VT(VT))
1309 return TypeLegal;
1310 return TargetLoweringBase::getPreferredVectorAction(VT);
1311}
1312
1313 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1314 int Enabled, int &ExtraSteps,
1315 bool &UseOneConst,
1316 bool Reciprocal) const {
1319 return SDValue();
1320
1321 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1322 ExtraSteps = 0;
1323
1324 SDLoc DL(Operand);
1325 EVT VT = Operand.getValueType();
1326 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1327
1328 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1329 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1330 DAG.getConstant(IID, DL, MVT::i32), Operand);
1331 };
1332
1333 // The sqrt and rsqrt refinement processes assume we always start out with an
1334 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1335 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1336 // any refinement, we must return a regular sqrt.
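// For example (illustrative): with Reciprocal == false and ExtraSteps == 0, an
// f32 sqrt becomes a single nvvm_sqrt_approx_f (or nvvm_sqrt_approx_ftz_f)
// intrinsic call; once any refinement steps are requested, the rsqrt
// approximation is returned instead so the caller can refine it.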
1337 if (Reciprocal || ExtraSteps > 0) {
1338 if (VT == MVT::f32)
1339 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1340 : Intrinsic::nvvm_rsqrt_approx_f);
1341 else if (VT == MVT::f64)
1342 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1343 else
1344 return SDValue();
1345 } else {
1346 if (VT == MVT::f32)
1347 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1348 : Intrinsic::nvvm_sqrt_approx_f);
1349 else {
1350 // There's no sqrt.approx.f64 instruction, so we emit
1351 // reciprocal(rsqrt(x)). This is faster than
1352 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1353 // x * rsqrt(x).)
1354 return DAG.getNode(
1356 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1357 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1358 }
1359 }
1360}
1361
1362SDValue
1363 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1364 SDLoc dl(Op);
1365 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1366 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1367 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1368 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1369}
1370
1371static bool IsTypePassedAsArray(const Type *Ty) {
1372 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1373 Ty->isHalfTy() || Ty->isBFloatTy();
1374}
1375
1376 std::string NVPTXTargetLowering::getPrototype(
1377 const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1378 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1379 std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1380 const CallBase &CB, unsigned UniqueCallSite) const {
1381 auto PtrVT = getPointerTy(DL);
1382
1383 bool isABI = (STI.getSmVersion() >= 20);
1384 assert(isABI && "Non-ABI compilation is not supported");
1385 if (!isABI)
1386 return "";
1387
1388 std::string Prototype;
1389 raw_string_ostream O(Prototype);
1390 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1391
1392 if (retTy->getTypeID() == Type::VoidTyID) {
1393 O << "()";
1394 } else {
1395 O << "(";
1396 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1397 !IsTypePassedAsArray(retTy)) {
1398 unsigned size = 0;
1399 if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1400 size = ITy->getBitWidth();
1401 } else {
1402 assert(retTy->isFloatingPointTy() &&
1403 "Floating point type expected here");
1404 size = retTy->getPrimitiveSizeInBits();
1405 }
1406 // PTX ABI requires all scalar return values to be at least 32
1407 // bits in size. fp16 normally uses .b16 as its storage type in
1408 // PTX, so its size must be adjusted here, too.
1409 size = promoteScalarArgumentSize(size);
1410
1411 O << ".param .b" << size << " _";
1412 } else if (isa<PointerType>(retTy)) {
1413 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1414 } else if (IsTypePassedAsArray(retTy)) {
1415 O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1416 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1417 } else {
1418 llvm_unreachable("Unknown return type");
1419 }
1420 O << ") ";
1421 }
1422 O << "_ (";
1423
1424 bool first = true;
1425
1426 const Function *F = CB.getFunction();
1427 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1428 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1429 Type *Ty = Args[i].Ty;
1430 if (!first) {
1431 O << ", ";
1432 }
1433 first = false;
1434
1435 if (!Outs[OIdx].Flags.isByVal()) {
1436 if (IsTypePassedAsArray(Ty)) {
1437 const CallInst *CallI = cast<CallInst>(&CB);
1438 Align ParamAlign =
1440 .value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1441 O << ".param .align " << ParamAlign.value() << " .b8 ";
1442 O << "_";
1443 O << "[" << DL.getTypeAllocSize(Ty) << "]";
1444 // update the index for Outs
1445 SmallVector<EVT, 16> vtparts;
1446 ComputeValueVTs(*this, DL, Ty, vtparts);
1447 if (unsigned len = vtparts.size())
1448 OIdx += len - 1;
1449 continue;
1450 }
1451 // i8 types in IR will be i16 types in SDAG
1452 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1453 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1454 "type mismatch between callee prototype and arguments");
1455 // scalar type
1456 unsigned sz = 0;
1457 if (isa<IntegerType>(Ty)) {
1458 sz = cast<IntegerType>(Ty)->getBitWidth();
1459 sz = promoteScalarArgumentSize(sz);
1460 } else if (isa<PointerType>(Ty)) {
1461 sz = PtrVT.getSizeInBits();
1462 } else {
1463 sz = Ty->getPrimitiveSizeInBits();
1464 }
1465 O << ".param .b" << sz << " ";
1466 O << "_";
1467 continue;
1468 }
1469
1470 Type *ETy = Args[i].IndirectType;
1471 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1472 Align ParamByValAlign =
1473 getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
1474
1475 O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1476 O << "_";
1477 O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1478 }
1479
1480 if (VAInfo)
1481 O << (first ? "" : ",") << " .param .align " << VAInfo->second
1482 << " .b8 _[]\n";
1483 O << ")";
1485 O << " .noreturn";
1486 O << ";";
1487
1488 return Prototype;
1489}
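
// Illustrative output (an assumed example, not from this file): for a callee of
// type `float (i32, float*)` on a 64-bit target, the generated string would
// look like
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);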
1490
1491 Align NVPTXTargetLowering::getFunctionArgumentAlignment(
1492 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1493 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1494}
1495
1496Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1497 unsigned Idx,
1498 const DataLayout &DL) const {
1499 if (!CB) {
1500 // CallSite is zero, fall back to the ABI type alignment
1501 return DL.getABITypeAlign(Ty);
1502 }
1503
1504 const Function *DirectCallee = CB->getCalledFunction();
1505
1506 if (!DirectCallee) {
1507 // We don't have a direct function symbol, but that may be because of
1508 // constant cast instructions in the call.
1509
1510 // With bitcast'd call targets, the instruction will be the call
1511 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1512 // Check if we have call alignment metadata
1513 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1514 return StackAlign.value();
1515 }
1516 DirectCallee = getMaybeBitcastedCallee(CB);
1517 }
1518
1519 // Check for function alignment information if we found that the
1520 // ultimate target is a Function
1521 if (DirectCallee)
1522 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1523
1524 // Call is indirect, fall back to the ABI type alignment
1525 return DL.getABITypeAlign(Ty);
1526}
1527
1528static bool adjustElementType(EVT &ElementType) {
1529 switch (ElementType.getSimpleVT().SimpleTy) {
1530 default:
1531 return false;
1532 case MVT::f16:
1533 case MVT::bf16:
1534 ElementType = MVT::i16;
1535 return true;
1536 case MVT::f32:
1537 case MVT::v2f16:
1538 case MVT::v2bf16:
1539 ElementType = MVT::i32;
1540 return true;
1541 case MVT::f64:
1542 ElementType = MVT::i64;
1543 return true;
1544 }
1545}
1546
1547// Use byte-store when the param address of the argument value is unaligned.
1548// This may happen when the return value is a field of a packed structure.
1549//
1550// This is called in LowerCall() when passing the param values.
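// For example (illustrative): an f32 value at param offset 1 of a packed struct
// is bitcast to i32 and emitted as four single-byte StoreParam (st.param.b8)
// operations of the value shifted right by 0, 8, 16 and 24 bits.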
1551 static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
1552 uint64_t Offset, EVT ElementType,
1553 SDValue StVal, SDValue &InGlue,
1554 unsigned ArgID, const SDLoc &dl) {
1555 // Bit logic only works on integer types
1556 if (adjustElementType(ElementType))
1557 StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
1558
1559 // Store each byte
1560 SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1561 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1562 // Shift the byte to the last byte position
1563 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1564 DAG.getConstant(i * 8, dl, MVT::i32));
1565 SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1566 DAG.getConstant(Offset + i, dl, MVT::i32),
1567 ShiftVal, InGlue};
1568 // Trunc store only the last byte by using
1569 // st.param.b8
1570 // The register type can be larger than b8.
1571 Chain = DAG.getMemIntrinsicNode(
1572 NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1574 InGlue = Chain.getValue(1);
1575 }
1576 return Chain;
1577}
1578
1579 // Use byte-load when the param address of the returned value is unaligned.
1580// This may happen when the returned value is a field of a packed structure.
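// For example (illustrative): an f32 field at offset 1 is reassembled from four
// single-byte LoadParam operations that are zero-extended, masked to 8 bits,
// shifted left by 0/8/16/24 and OR'ed together, then bitcast back to f32.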
1581static SDValue
1582 LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
1583 EVT ElementType, SDValue &InGlue,
1584 SmallVectorImpl<SDValue> &TempProxyRegOps,
1585 const SDLoc &dl) {
1586 // Bit logic only works on integer types
1587 EVT MergedType = ElementType;
1588 adjustElementType(MergedType);
1589
1590 // Load each byte and construct the whole value. Initial value to 0
1591 SDValue RetVal = DAG.getConstant(0, dl, MergedType);
1592 // LoadParamMemI8 loads into i16 register only
1593 SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
1594 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1595 SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1596 DAG.getConstant(Offset + i, dl, MVT::i32),
1597 InGlue};
1598 // This will be selected to LoadParamMemI8
1599 SDValue LdVal =
1600 DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
1601 MVT::i8, MachinePointerInfo(), Align(1));
1602 SDValue TmpLdVal = LdVal.getValue(0);
1603 Chain = LdVal.getValue(1);
1604 InGlue = LdVal.getValue(2);
1605
1606 TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
1607 TmpLdVal.getSimpleValueType(), TmpLdVal);
1608 TempProxyRegOps.push_back(TmpLdVal);
1609
1610 SDValue CMask = DAG.getConstant(255, dl, MergedType);
1611 SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
1612 // Need to extend the i16 register to the whole width.
1613 TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
1614 // Mask off the high bits. Leave only the lower 8 bits.
1615 // Do this because we are using loadparam.b8.
1616 TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
1617 // Shift and merge
1618 TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
1619 RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
1620 }
1621 if (ElementType != MergedType)
1622 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
1623
1624 return RetVal;
1625}
1626
1627 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1628 SmallVectorImpl<SDValue> &InVals) const {
1629
1630 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1632 "Support for variadic functions (unsized array parameter) introduced "
1633 "in PTX ISA version 6.0 and requires target sm_30.");
1634
1635 SelectionDAG &DAG = CLI.DAG;
1636 SDLoc dl = CLI.DL;
1637 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1638 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1639 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1640 SDValue Chain = CLI.Chain;
1641 SDValue Callee = CLI.Callee;
1642 bool &isTailCall = CLI.IsTailCall;
1643 ArgListTy &Args = CLI.getArgs();
1644 Type *RetTy = CLI.RetTy;
1645 const CallBase *CB = CLI.CB;
1646 const DataLayout &DL = DAG.getDataLayout();
1647
1648 bool isABI = (STI.getSmVersion() >= 20);
1649 assert(isABI && "Non-ABI compilation is not supported");
1650 if (!isABI)
1651 return Chain;
1652
1653 // Variadic arguments.
1654 //
1655 // Normally, for each argument, we declare a param scalar or a param
1656 // byte array in the .param space, and store the argument value to that
1657 // param scalar or array starting at offset 0.
1658 //
1659 // In the case of the first variadic argument, we declare a vararg byte array
1660 // with size 0. The exact size of this array isn't known at this point, so
1661 // it'll be patched later. All the variadic arguments will be stored to this
1662 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1663 // initially set to 0, so it can be used for non-variadic arguments (which use
1664 // 0 offset) to simplify the code.
1665 //
1666 // After all variadic arguments are processed, 'VAOffset' holds the size of the
1667 // vararg byte array.
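// For example (illustrative): for a callee `foo(int, ...)` invoked with two
// extra i32 arguments, param 0 carries the fixed argument and param 1 is the
// vararg byte array; the two variadic values are stored at offsets 0 and 4 and
// the array size is later patched to the final VAOffset of 8.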
1668
1669 SDValue VADeclareParam; // vararg byte array
1670 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1671 unsigned VAOffset = 0; // current offset in the param array
1672
1673 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1674 SDValue TempChain = Chain;
1675 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1676 SDValue InGlue = Chain.getValue(1);
1677
1678 unsigned ParamCount = 0;
1679 // Args.size() and Outs.size() need not match.
1680 // Outs.size() will be larger
1681 // * if there is an aggregate argument with multiple fields (each field
1682 // showing up separately in Outs)
1683 // * if there is a vector argument with more than typical vector-length
1684 // elements (generally if more than 4) where each vector element is
1685 // individually present in Outs.
1686 // So a different index should be used for indexing into Outs/OutVals.
1687 // See similar issue in LowerFormalArguments.
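// For example (illustrative): a single {i32, i32} aggregate argument occupies
// one slot in Args but two entries in Outs/OutVals, so OIdx advances past i.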
1688 unsigned OIdx = 0;
1689 // Declare the .param or .reg needed to pass values
1690 // to the function.
1691 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1692 EVT VT = Outs[OIdx].VT;
1693 Type *Ty = Args[i].Ty;
1694 bool IsVAArg = (i >= CLI.NumFixedArgs);
1695 bool IsByVal = Outs[OIdx].Flags.isByVal();
1696
1696
1697 SmallVector<EVT, 16> VTs;
1698 SmallVector<uint64_t, 16> Offsets;
1699
1700 assert((!IsByVal || Args[i].IndirectType) &&
1701 "byval arg must have indirect type");
1702 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1703 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1704
1705 Align ArgAlign;
1706 if (IsByVal) {
1707 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1708 // so we don't need to worry whether it's naturally aligned or not.
1709 // See TargetLowering::LowerCallTo().
1710 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1711 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1712 InitialAlign, DL);
1713 if (IsVAArg)
1714 VAOffset = alignTo(VAOffset, ArgAlign);
1715 } else {
1716 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1717 }
1718
1719 unsigned TypeSize =
1720 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1721 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1722
1723 bool NeedAlign; // Does argument declaration specify alignment?
1724 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1725 if (IsVAArg) {
1726 if (ParamCount == FirstVAArg) {
1727 SDValue DeclareParamOps[] = {
1728 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1729 DAG.getConstant(ParamCount, dl, MVT::i32),
1730 DAG.getConstant(1, dl, MVT::i32), InGlue};
1731 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1732 DeclareParamVTs, DeclareParamOps);
1733 }
1734 NeedAlign = PassAsArray;
1735 } else if (PassAsArray) {
1736 // declare .param .align <align> .b8 .param<n>[<size>];
1737 SDValue DeclareParamOps[] = {
1738 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1739 DAG.getConstant(ParamCount, dl, MVT::i32),
1740 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1741 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1742 DeclareParamOps);
1743 NeedAlign = true;
1744 } else {
1745 // declare .param .b<size> .param<n>;
1746 if (VT.isInteger() || VT.isFloatingPoint()) {
1747 // PTX ABI requires integral types to be at least 32 bits in
1748 // size. FP16 is loaded/stored using i16, so it's handled
1749 // here as well.
1751 }
1752 SDValue DeclareScalarParamOps[] = {
1753 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1754 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1755 DAG.getConstant(0, dl, MVT::i32), InGlue};
1756 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1757 DeclareScalarParamOps);
1758 NeedAlign = false;
1759 }
1760 InGlue = Chain.getValue(1);
1761
1762 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1763 // than 32-bits are sign extended or zero extended, depending on
1764 // whether they are signed or unsigned types. This case applies
1765 // only to scalar parameters and not to aggregate values.
1766 bool ExtendIntegerParam =
1767 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1768
1769 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1770 SmallVector<SDValue, 6> StoreOperands;
1771 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1772 EVT EltVT = VTs[j];
1773 int CurOffset = Offsets[j];
1774 MaybeAlign PartAlign;
1775 if (NeedAlign)
1776 PartAlign = commonAlignment(ArgAlign, CurOffset);
1777
1778 SDValue StVal = OutVals[OIdx];
1779
1780 MVT PromotedVT;
1781 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1782 EltVT = EVT(PromotedVT);
1783 }
1784 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1786 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1787 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1788 }
1789
1790 if (IsByVal) {
1791 auto PtrVT = getPointerTy(DL);
1792 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1793 DAG.getConstant(CurOffset, dl, PtrVT));
1794 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1795 PartAlign);
1796 } else if (ExtendIntegerParam) {
1797 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1798 // zext/sext to i32
1799 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1801 dl, MVT::i32, StVal);
1802 }
1803
1804 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1805 // Use 16-bit registers for small stores as it's the
1806 // smallest general purpose register size supported by NVPTX.
1807 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1808 }
1809
1810 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1811 // scalar store. In such cases, fall back to byte stores.
1812 if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1813 PartAlign.value() <
1814 DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1815 assert(StoreOperands.empty() && "Unfinished preceding store.");
1817 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1818 StVal, InGlue, ParamCount, dl);
1819
1820 // LowerUnalignedStoreParam took care of inserting the necessary nodes
1821 // into the SDAG, so just move on to the next element.
1822 if (!IsByVal)
1823 ++OIdx;
1824 continue;
1825 }
1826
1827 // New store.
1828 if (VectorInfo[j] & PVF_FIRST) {
1829 assert(StoreOperands.empty() && "Unfinished preceding store.");
1830 StoreOperands.push_back(Chain);
1831 StoreOperands.push_back(
1832 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1833
1834 StoreOperands.push_back(DAG.getConstant(
1835 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1836 dl, MVT::i32));
1837 }
1838
1839 // Record the value to store.
1840 StoreOperands.push_back(StVal);
1841
1842 if (VectorInfo[j] & PVF_LAST) {
1843 unsigned NumElts = StoreOperands.size() - 3;
1845 switch (NumElts) {
1846 case 1:
1848 break;
1849 case 2:
1851 break;
1852 case 4:
1854 break;
1855 default:
1856 llvm_unreachable("Invalid vector info.");
1857 }
1858
1859 StoreOperands.push_back(InGlue);
1860
1861 // Adjust type of the store op if we've extended the scalar
1862 // return value.
1863 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1864
1865 Chain = DAG.getMemIntrinsicNode(
1866 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1867 TheStoreType, MachinePointerInfo(), PartAlign,
1869 InGlue = Chain.getValue(1);
1870
1871 // Cleanup.
1872 StoreOperands.clear();
1873
1874 // TODO: We may need to support vector types that can be passed
1875 // as scalars in variadic arguments.
1876 if (!IsByVal && IsVAArg) {
1877 assert(NumElts == 1 &&
1878 "Vectorization is expected to be disabled for variadics.");
1879 VAOffset += DL.getTypeAllocSize(
1880 TheStoreType.getTypeForEVT(*DAG.getContext()));
1881 }
1882 }
1883 if (!IsByVal)
1884 ++OIdx;
1885 }
1886 assert(StoreOperands.empty() && "Unfinished parameter store.");
1887 if (!IsByVal && VTs.size() > 0)
1888 --OIdx;
1889 ++ParamCount;
1890 if (IsByVal && IsVAArg)
1891 VAOffset += TypeSize;
1892 }
1893
1894 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1895 MaybeAlign retAlignment = std::nullopt;
1896
1897 // Handle Result
1898 if (Ins.size() > 0) {
1899 SmallVector<EVT, 16> resvtparts;
1900 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1901
1902 // Declare
1903 // .param .align N .b8 retval0[<size-in-bytes>], or
1904 // .param .b<size-in-bits> retval0
1905 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1906 if (!IsTypePassedAsArray(RetTy)) {
1907 resultsz = promoteScalarArgumentSize(resultsz);
1908 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1909 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1910 DAG.getConstant(resultsz, dl, MVT::i32),
1911 DAG.getConstant(0, dl, MVT::i32), InGlue };
1912 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1913 DeclareRetOps);
1914 InGlue = Chain.getValue(1);
1915 } else {
1916 retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1917 assert(retAlignment && "retAlignment is guaranteed to be set");
1918 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1919 SDValue DeclareRetOps[] = {
1920 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1921 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1922 DAG.getConstant(0, dl, MVT::i32), InGlue};
1923 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1924 DeclareRetOps);
1925 InGlue = Chain.getValue(1);
1926 }
1927 }
1928
1929 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1930 // Set the size of the vararg param byte array if the callee is a variadic
1931 // function and the variadic part is not empty.
1932 if (HasVAArgs) {
1933 SDValue DeclareParamOps[] = {
1934 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1935 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1936 VADeclareParam.getOperand(4)};
1937 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1938 VADeclareParam->getVTList(), DeclareParamOps);
1939 }
1940
1941 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1942 // between them we must rely on the call site value which is valid for
1943 // indirect calls but is always null for libcalls.
1944 bool isIndirectCall = !Func && CB;
1945
1946 if (isa<ExternalSymbolSDNode>(Callee)) {
1947 Function* CalleeFunc = nullptr;
1948
1949 // Try to find the callee in the current module.
1950 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1951 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1952
1953 // Set the "libcall callee" attribute to indicate that the function
1954 // must always have a declaration.
1955 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1956 }
1957
1958 if (isIndirectCall) {
1959 // This is indirect function call case : PTX requires a prototype of the
1960 // form
1961 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1962 // to be emitted, and the label has to be used as the last arg of the call
1963 // instruction.
1964 // The prototype is embedded in a string and used as the operand for a
1965 // CallPrototype SDNode, which will print out as the value of the string.
1966 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1967 std::string Proto = getPrototype(
1968 DL, RetTy, Args, Outs, retAlignment,
1969 HasVAArgs
1970 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1971 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1972 : std::nullopt,
1973 *CB, UniqueCallSite);
1974 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1975 SDValue ProtoOps[] = {
1976 Chain,
1977 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1978 InGlue,
1979 };
1980 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1981 InGlue = Chain.getValue(1);
1982 }
1983 // Op to just print "call"
1984 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1985 SDValue PrintCallOps[] = {
1986 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1987 };
1988 // We model convergent calls as separate opcodes.
1990 if (CLI.IsConvergent)
1993 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1994 InGlue = Chain.getValue(1);
1995
1996 // Ops to print out the function name
1997 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1998 SDValue CallVoidOps[] = { Chain, Callee, InGlue };
1999 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2000 InGlue = Chain.getValue(1);
2001
2002 // Ops to print out the param list
2003 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2004 SDValue CallArgBeginOps[] = { Chain, InGlue };
2005 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2006 CallArgBeginOps);
2007 InGlue = Chain.getValue(1);
2008
2009 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2010 ++i) {
2011 unsigned opcode;
2012 if (i == (e - 1))
2013 opcode = NVPTXISD::LastCallArg;
2014 else
2015 opcode = NVPTXISD::CallArg;
2016 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2017 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2018 DAG.getConstant(i, dl, MVT::i32), InGlue };
2019 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2020 InGlue = Chain.getValue(1);
2021 }
2022 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2023 SDValue CallArgEndOps[] = { Chain,
2024 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2025 InGlue };
2026 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2027 InGlue = Chain.getValue(1);
2028
2029 if (isIndirectCall) {
2030 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2031 SDValue PrototypeOps[] = {
2032 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2033 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2034 InGlue = Chain.getValue(1);
2035 }
2036
2037 SmallVector<SDValue, 16> ProxyRegOps;
2038 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2039 // An item of the vector is filled if the element does not need a ProxyReg
2040 // operation on it and should be added to InVals as is. ProxyRegOps and
2041 // ProxyRegTruncates contain empty/none items at the same index.
2042 SmallVector<SDValue, 16> RetElts;
2043 // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
2044 // to use the values of `LoadParam`s; they are replaced later, when
2045 // `CALLSEQ_END` is added.
2046 SmallVector<SDValue, 16> TempProxyRegOps;
2047
2048 // Generate loads from param memory/moves from registers for result
2049 if (Ins.size() > 0) {
2050 SmallVector<EVT, 16> VTs;
2051 SmallVector<uint64_t, 16> Offsets;
2052 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2053 assert(VTs.size() == Ins.size() && "Bad value decomposition");
2054
2055 Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
2056 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2057
2058 SmallVector<EVT, 6> LoadVTs;
2059 int VecIdx = -1; // Index of the first element of the vector.
2060
2061 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2062 // 32-bits are sign extended or zero extended, depending on whether
2063 // they are signed or unsigned types.
2064 bool ExtendIntegerRetVal =
2065 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2066
2067 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2068 bool needTruncate = false;
2069 EVT TheLoadType = VTs[i];
2070 EVT EltType = Ins[i].VT;
2071 Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2072 MVT PromotedVT;
2073
2074 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2075 TheLoadType = EVT(PromotedVT);
2076 EltType = EVT(PromotedVT);
2077 needTruncate = true;
2078 }
2079
2080 if (ExtendIntegerRetVal) {
2081 TheLoadType = MVT::i32;
2082 EltType = MVT::i32;
2083 needTruncate = true;
2084 } else if (TheLoadType.getSizeInBits() < 16) {
2085 if (VTs[i].isInteger())
2086 needTruncate = true;
2087 EltType = MVT::i16;
2088 }
2089
2090 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
2091 // scalar load. In such cases, fall back to byte loads.
2092 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
2093 EltAlign < DL.getABITypeAlign(
2094 TheLoadType.getTypeForEVT(*DAG.getContext()))) {
2095 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2097 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
2098 ProxyRegOps.push_back(SDValue());
2099 ProxyRegTruncates.push_back(std::optional<MVT>());
2100 RetElts.resize(i);
2101 RetElts.push_back(Ret);
2102
2103 continue;
2104 }
2105
2106 // Record index of the very first element of the vector.
2107 if (VectorInfo[i] & PVF_FIRST) {
2108 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2109 VecIdx = i;
2110 }
2111
2112 LoadVTs.push_back(EltType);
2113
2114 if (VectorInfo[i] & PVF_LAST) {
2115 unsigned NumElts = LoadVTs.size();
2116 LoadVTs.push_back(MVT::Other);
2117 LoadVTs.push_back(MVT::Glue);
2119 switch (NumElts) {
2120 case 1:
2122 break;
2123 case 2:
2125 break;
2126 case 4:
2128 break;
2129 default:
2130 llvm_unreachable("Invalid vector info.");
2131 }
2132
2133 SDValue LoadOperands[] = {
2134 Chain, DAG.getConstant(1, dl, MVT::i32),
2135 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2136 SDValue RetVal = DAG.getMemIntrinsicNode(
2137 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2138 MachinePointerInfo(), EltAlign,
2140
2141 for (unsigned j = 0; j < NumElts; ++j) {
2142 ProxyRegOps.push_back(RetVal.getValue(j));
2143
2144 if (needTruncate)
2145 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2146 else
2147 ProxyRegTruncates.push_back(std::optional<MVT>());
2148 }
2149
2150 Chain = RetVal.getValue(NumElts);
2151 InGlue = RetVal.getValue(NumElts + 1);
2152
2153 // Cleanup
2154 VecIdx = -1;
2155 LoadVTs.clear();
2156 }
2157 }
2158 }
2159
2160 Chain =
2161 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2162 InGlue = Chain.getValue(1);
2163
2164 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2165 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
2166 // dangling.
2167 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2168 if (i < RetElts.size() && RetElts[i]) {
2169 InVals.push_back(RetElts[i]);
2170 continue;
2171 }
2172
2173 SDValue Ret = DAG.getNode(
2175 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2176 { Chain, ProxyRegOps[i], InGlue }
2177 );
2178
2179 Chain = Ret.getValue(1);
2180 InGlue = Ret.getValue(2);
2181
2182 if (ProxyRegTruncates[i]) {
2183 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2184 }
2185
2186 InVals.push_back(Ret);
2187 }
2188
2189 for (SDValue &T : TempProxyRegOps) {
2190 SDValue Repl = DAG.getNode(
2192 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
2193 {Chain, T.getOperand(0), InGlue});
2194 DAG.ReplaceAllUsesWith(T, Repl);
2195 DAG.RemoveDeadNode(T.getNode());
2196
2197 Chain = Repl.getValue(1);
2198 InGlue = Repl.getValue(2);
2199 }
2200
2201 // set isTailCall to false for now, until we figure out how to express
2202 // tail call optimization in PTX
2203 isTailCall = false;
2204 return Chain;
2205}
2206
2207 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2208 SelectionDAG &DAG) const {
2209
2210 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2211 const Function &Fn = DAG.getMachineFunction().getFunction();
2212
2213 DiagnosticInfoUnsupported NoDynamicAlloca(
2214 Fn,
2215 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2216 "requires target sm_52.",
2217 SDLoc(Op).getDebugLoc());
2218 DAG.getContext()->diagnose(NoDynamicAlloca);
2219 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2220 Op.getOperand(0)};
2221 return DAG.getMergeValues(Ops, SDLoc());
2222 }
2223
2224 SDValue Chain = Op.getOperand(0);
2225 SDValue Size = Op.getOperand(1);
2226 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2227 SDLoc DL(Op.getNode());
2228
2229 // The size operand of the PTX alloca instruction is 64-bit for m64 and 32-bit for m32.
2230 if (nvTM->is64Bit())
2231 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
2232 else
2233 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
2234
2235 SDValue AllocOps[] = {Chain, Size,
2236 DAG.getTargetConstant(Align, DL, MVT::i32)};
2238 nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
2239
2240 SDValue MergeOps[] = {Alloca, Chain};
2241 return DAG.getMergeValues(MergeOps, DL);
2242}
2243
2244// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2245// (see LegalizeDAG.cpp). This is slow and uses local memory.
2246// We use extract/insert/build vector instead, just as LegalizeOp() did in LLVM 2.5.
2247SDValue
2248NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2249 SDNode *Node = Op.getNode();
2250 SDLoc dl(Node);
2252 unsigned NumOperands = Node->getNumOperands();
2253 for (unsigned i = 0; i < NumOperands; ++i) {
2254 SDValue SubOp = Node->getOperand(i);
2255 EVT VVT = SubOp.getNode()->getValueType(0);
2256 EVT EltVT = VVT.getVectorElementType();
2257 unsigned NumSubElem = VVT.getVectorNumElements();
2258 for (unsigned j = 0; j < NumSubElem; ++j) {
2259 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2260 DAG.getIntPtrConstant(j, dl)));
2261 }
2262 }
2263 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2264}
2265
2266// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2267// would get lowered as two constant loads and a vector-packing move.
2268// Instead we want just a constant move:
2269// mov.b32 %r2, 0x40003C00
2270SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2271 SelectionDAG &DAG) const {
2272 EVT VT = Op->getValueType(0);
2273 if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2274 return Op;
2275
2276 SDLoc DL(Op);
2277
2278 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2279 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2280 isa<ConstantFPSDNode>(Operand);
2281 })) {
2282 // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2283 // allows us to optimize the calculation of its constant parts.
2284 if (VT == MVT::v4i8) {
2285 SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2286 SDValue E01 = DAG.getNode(
2287 NVPTXISD::BFI, DL, MVT::i32,
2288 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2289 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2290 SDValue E012 =
2291 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2292 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2293 E01, DAG.getConstant(16, DL, MVT::i32), C8);
2294 SDValue E0123 =
2295 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2296 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2297 E012, DAG.getConstant(24, DL, MVT::i32), C8);
2298 return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2299 }
2300 return Op;
2301 }
2302
2303 // Get the value of the Nth operand as an APInt(32). Undef values are treated as 0.
2304 auto GetOperand = [](SDValue Op, int N) -> APInt {
2305 const SDValue &Operand = Op->getOperand(N);
2306 EVT VT = Op->getValueType(0);
2307 if (Operand->isUndef())
2308 return APInt(32, 0);
2309 APInt Value;
2310 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2311 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2312 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2313 Value = Operand->getAsAPIntVal();
2314 else
2315 llvm_unreachable("Unsupported type");
2316 // i8 values are carried around as i16, so we need to zero out the upper bits
2317 // so that they do not get in the way when combining individual byte values.
2318 if (VT == MVT::v4i8)
2319 Value = Value.trunc(8);
2320 return Value.zext(32);
2321 };
2322 APInt Value;
2323 if (Isv2x16VT(VT)) {
2324 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2325 } else if (VT == MVT::v4i8) {
2326 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2327 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2328 } else {
2329 llvm_unreachable("Unsupported type");
2330 }
2331 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2332 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2333}
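// Illustrative sketch (editor's addition, not part of the lowering): the bit
// packing performed above for a fully-constant v2f16/v2bf16/v2i16 BUILD_VECTOR.
// The helper name is hypothetical; assumes <cstdint> (already included above).
static inline uint32_t examplePackV2x16(uint16_t Elt0, uint16_t Elt1) {
  // Element 0 lands in bits [15:0], element 1 in bits [31:16]; v2f16 {1.0, 2.0}
  // (bit patterns 0x3C00, 0x4000) packs to 0x40003C00, matching the
  // "mov.b32 %r2, 0x40003C00" example in the comment before the function.
  return uint32_t(Elt0) | (uint32_t(Elt1) << 16);
}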
2334
2335SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2336 SelectionDAG &DAG) const {
2337 SDValue Index = Op->getOperand(1);
2338 SDValue Vector = Op->getOperand(0);
2339 SDLoc DL(Op);
2340 EVT VectorVT = Vector.getValueType();
2341
2342 if (VectorVT == MVT::v4i8) {
2343 SDValue BFE =
2344 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2345 {Vector,
2346 DAG.getNode(ISD::MUL, DL, MVT::i32,
2347 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2348 DAG.getConstant(8, DL, MVT::i32)),
2349 DAG.getConstant(8, DL, MVT::i32)});
2350 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2351 }
2352
2353 // Constant index will be matched by tablegen.
2354 if (isa<ConstantSDNode>(Index.getNode()))
2355 return Op;
2356
2357 // Extract individual elements and select one of them.
2358 assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2359 EVT EltVT = VectorVT.getVectorElementType();
2360
2361 SDLoc dl(Op.getNode());
2362 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2363 DAG.getIntPtrConstant(0, dl));
2364 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2365 DAG.getIntPtrConstant(1, dl));
2366 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2368}
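// Illustrative sketch (editor's addition): the v4i8 path above extracts element
// N by emitting BFE with bit offset 8*N and width 8. A plain-C++ equivalent of
// that bit-field extract (unsigned variant, hypothetical name, requires Len < 32):
static inline uint32_t exampleBfeU32(uint32_t Vec, unsigned Pos, unsigned Len) {
  // e.g. exampleBfeU32(0xDDCCBBAA, 8 * 2, 8) == 0xCC, i.e. element 2 of a v4i8.
  return (Vec >> Pos) & ((1u << Len) - 1u);
}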
2369
2370SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2371 SelectionDAG &DAG) const {
2372 SDValue Vector = Op->getOperand(0);
2373 EVT VectorVT = Vector.getValueType();
2374
2375 if (VectorVT != MVT::v4i8)
2376 return Op;
2377 SDLoc DL(Op);
2378 SDValue Value = Op->getOperand(1);
2379 if (Value->isUndef())
2380 return Vector;
2381
2382 SDValue Index = Op->getOperand(2);
2383
2384 SDValue BFI =
2385 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2386 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2387 DAG.getNode(ISD::MUL, DL, MVT::i32,
2388 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2389 DAG.getConstant(8, DL, MVT::i32)),
2390 DAG.getConstant(8, DL, MVT::i32)});
2391 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2392}
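// Illustrative sketch (editor's addition): the BFI emitted above overwrites one
// byte of the packed v4i8 word. Plain-C++ equivalent (hypothetical name):
static inline uint32_t exampleBfiByte(uint32_t Vec, uint8_t Val, unsigned Idx) {
  const uint32_t Mask = 0xFFu << (Idx * 8);
  // Clear byte Idx, then insert the new byte, e.g.
  // exampleBfiByte(0xDDCCBBAA, 0x11, 2) == 0xDD11BBAA.
  return (Vec & ~Mask) | (uint32_t(Val) << (Idx * 8));
}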
2393
2394SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2395 SelectionDAG &DAG) const {
2396 SDValue V1 = Op.getOperand(0);
2397 EVT VectorVT = V1.getValueType();
2398 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2399 return Op;
2400
2401 // Lower shuffle to PRMT instruction.
2402 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2403 SDValue V2 = Op.getOperand(1);
2404 uint32_t Selector = 0;
2405 for (auto I : llvm::enumerate(SVN->getMask())) {
2406 if (I.value() != -1) // -1 is a placeholder for undef.
2407 Selector |= (I.value() << (I.index() * 4));
2408 }
2409
2410 SDLoc DL(Op);
2411 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2412 DAG.getConstant(Selector, DL, MVT::i32),
2413 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2414}
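// Illustrative sketch (editor's addition): how the PRMT selector built above
// encodes a v4i8 shuffle mask. Each destination byte I gets a 4-bit field that
// picks one of eight source bytes (0-3 from V1, 4-7 from V2). Hypothetical
// helper; undef (-1) lanes are simply left as selector 0, as in the loop above.
static inline uint32_t examplePrmtSelector(const int (&Mask)[4]) {
  uint32_t Selector = 0;
  for (unsigned I = 0; I < 4; ++I)
    if (Mask[I] != -1)
      Selector |= uint32_t(Mask[I]) << (I * 4);
  // e.g. mask {3, 2, 1, 0} (reverse the bytes of V1) encodes as 0x0123.
  return Selector;
}
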
2415/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which either
2416/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2417/// amount, or
2418/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2419/// amount.
2420SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2421 SelectionDAG &DAG) const {
2422 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2423 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2424
2425 EVT VT = Op.getValueType();
2426 unsigned VTBits = VT.getSizeInBits();
2427 SDLoc dl(Op);
2428 SDValue ShOpLo = Op.getOperand(0);
2429 SDValue ShOpHi = Op.getOperand(1);
2430 SDValue ShAmt = Op.getOperand(2);
2431 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2432
2433 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2434 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2435 // {dHi, dLo} = {aHi, aLo} >> Amt
2436 // dHi = aHi >> Amt
2437 // dLo = shf.r.clamp aLo, aHi, Amt
2438
2439 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2440 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2441 ShAmt);
2442
2443 SDValue Ops[2] = { Lo, Hi };
2444 return DAG.getMergeValues(Ops, dl);
2445 }
2446 else {
2447 // {dHi, dLo} = {aHi, aLo} >> Amt
2448 // - if (Amt>=size) then
2449 // dLo = aHi >> (Amt-size)
2450 // dHi = aHi >> Amt (this is either all 0 or all 1)
2451 // else
2452 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2453 // dHi = aHi >> Amt
2454
2455 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2456 DAG.getConstant(VTBits, dl, MVT::i32),
2457 ShAmt);
2458 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2459 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2460 DAG.getConstant(VTBits, dl, MVT::i32));
2461 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2462 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2463 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2464
2465 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2466 DAG.getConstant(VTBits, dl, MVT::i32),
2467 ISD::SETGE);
2468 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2469 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2470
2471 SDValue Ops[2] = { Lo, Hi };
2472 return DAG.getMergeValues(Ops, dl);
2473 }
2474}
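// Illustrative sketch (editor's addition): the generic (non-funnel-shift)
// expansion above, written out for a 64-bit value held in two 32-bit halves.
// Logical-shift variant only; hypothetical helper, assumes 0 <= Amt < 64.
static inline void exampleSrl64Via32(uint32_t Lo, uint32_t Hi, unsigned Amt,
                                     uint32_t &OutLo, uint32_t &OutHi) {
  if (Amt >= 32) {
    OutLo = Hi >> (Amt - 32); // dLo = aHi >> (Amt-size)
    OutHi = 0;                // dHi = aHi >> Amt (all zero for a logical shift)
  } else {
    // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)); dHi = aHi >> Amt.
    // The Amt == 0 guard avoids the undefined 32-bit shift by 32 in C++.
    OutLo = Amt == 0 ? Lo : ((Lo >> Amt) | (Hi << (32 - Amt)));
    OutHi = Hi >> Amt;
  }
}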
2475
2476/// LowerShiftLeftParts - Lower SHL_PARTS, which either
2477/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2478/// amount, or
2479/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2480/// amount.
2481SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2482 SelectionDAG &DAG) const {
2483 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2484 assert(Op.getOpcode() == ISD::SHL_PARTS);
2485
2486 EVT VT = Op.getValueType();
2487 unsigned VTBits = VT.getSizeInBits();
2488 SDLoc dl(Op);
2489 SDValue ShOpLo = Op.getOperand(0);
2490 SDValue ShOpHi = Op.getOperand(1);
2491 SDValue ShAmt = Op.getOperand(2);
2492
2493 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2494 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2495 // {dHi, dLo} = {aHi, aLo} << Amt
2496 // dHi = shf.l.clamp aLo, aHi, Amt
2497 // dLo = aLo << Amt
2498
2499 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2500 ShAmt);
2501 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2502
2503 SDValue Ops[2] = { Lo, Hi };
2504 return DAG.getMergeValues(Ops, dl);
2505 }
2506 else {
2507 // {dHi, dLo} = {aHi, aLo} << Amt
2508 // - if (Amt>=size) then
2509 // dLo = aLo << Amt (all 0)
2510 // dHi = aLo << (Amt-size)
2511 // else
2512 // dLo = aLo << Amt
2513 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2514
2515 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2516 DAG.getConstant(VTBits, dl, MVT::i32),
2517 ShAmt);
2518 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2519 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2520 DAG.getConstant(VTBits, dl, MVT::i32));
2521 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2522 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2523 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2524
2525 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2526 DAG.getConstant(VTBits, dl, MVT::i32),
2527 ISD::SETGE);
2528 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2529 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2530
2531 SDValue Ops[2] = { Lo, Hi };
2532 return DAG.getMergeValues(Ops, dl);
2533 }
2534}
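// Illustrative sketch (editor's addition): the mirrored generic expansion for
// SHL_PARTS above, again for two 32-bit halves (hypothetical helper,
// assumes 0 <= Amt < 64).
static inline void exampleShl64Via32(uint32_t Lo, uint32_t Hi, unsigned Amt,
                                     uint32_t &OutLo, uint32_t &OutHi) {
  if (Amt >= 32) {
    OutLo = 0;                // dLo = aLo << Amt (all 0)
    OutHi = Lo << (Amt - 32); // dHi = aLo << (Amt-size)
  } else {
    OutLo = Lo << Amt;        // dLo = aLo << Amt
    // dHi = (aHi << Amt) | (aLo >> (size-Amt)); Amt == 0 guarded as before.
    OutHi = Amt == 0 ? Hi : ((Hi << Amt) | (Lo >> (32 - Amt)));
  }
}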
2535
2536SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2537 EVT VT = Op.getValueType();
2538
2539 if (VT == MVT::f32)
2540 return LowerFROUND32(Op, DAG);
2541
2542 if (VT == MVT::f64)
2543 return LowerFROUND64(Op, DAG);
2544
2545 llvm_unreachable("unhandled type");
2546}
2547
2548// This is the rounding method used in CUDA libdevice, in C-like code:
2549// float roundf(float A)
2550// {
2551// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2552// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2553// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2554// }
2555SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2556 SelectionDAG &DAG) const {
2557 SDLoc SL(Op);
2558 SDValue A = Op.getOperand(0);
2559 EVT VT = Op.getValueType();
2560
2561 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2562
2563 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2564 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2565 const int SignBitMask = 0x80000000;
2566 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2567 DAG.getConstant(SignBitMask, SL, MVT::i32));
2568 const int PointFiveInBits = 0x3F000000;
2569 SDValue PointFiveWithSignRaw =
2570 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2571 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2572 SDValue PointFiveWithSign =
2573 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2574 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2575 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2576
2577 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2578 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2579 SDValue IsLarge =
2580 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2581 ISD::SETOGT);
2582 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2583
2584 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2585 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2586 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2587 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2588 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2589}
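// Illustrative sketch (editor's addition): the libdevice-style roundf from the
// comment above, written as plain C++ with std::trunc in place of the int cast
// (hypothetical name; relies on <cmath>, already included).
static inline float exampleRoundf(float A) {
  // (float)(int)(A > 0 ? A + 0.5f : A - 0.5f), expressed with copysign/trunc.
  float RoundedA = std::trunc(A + std::copysign(0.5f, A));
  // Values with |A| > 2^23 are already integral; adding 0.5 could perturb them.
  RoundedA = std::fabs(A) > 0x1.0p23f ? A : RoundedA;
  // |A| < 0.5 must round toward zero (to +/-0), not away to +/-1.
  return std::fabs(A) < 0.5f ? std::trunc(A) : RoundedA;
}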
2590
2591// The implementation of round(double) is similar to that of round(float) in
2592// that they both separate the value range into three regions and use a method
2593// specific to the region to round the values. However, round(double) first
2594// calculates the round of the absolute value and then adds the sign back while
2595// round(float) directly rounds the value with sign.
2596SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2597 SelectionDAG &DAG) const {
2598 SDLoc SL(Op);
2599 SDValue A = Op.getOperand(0);
2600 EVT VT = Op.getValueType();
2601
2602 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2603
2604 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2605 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2606 DAG.getConstantFP(0.5, SL, VT));
2607 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2608
2609 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2610 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2611 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2612 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2613 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2614 DAG.getConstantFP(0, SL, VT),
2615 RoundedA);
2616
2617 // Add sign to rounded_A
2618 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2619 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2620
2621 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2622 SDValue IsLarge =
2623 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2624 ISD::SETOGT);
2625 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2626}
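// Illustrative sketch (editor's addition): the double-precision variant above,
// which rounds the absolute value first and then copies the sign back
// (hypothetical name; relies on <cmath>).
static inline double exampleRound(double A) {
  double RoundedA = std::trunc(std::fabs(A) + 0.5);
  if (std::fabs(A) < 0.5)
    RoundedA = 0.0;                      // |A| < 0.5 rounds to +/-0
  RoundedA = std::copysign(RoundedA, A); // add the sign back
  // |A| > 2^52 is already integral; pass such values through unchanged.
  return std::fabs(A) > 0x1.0p52 ? A : RoundedA;
}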
2627
2628SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2629 SelectionDAG &DAG) const {
2630 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2631
2632 if (Op.getValueType() == MVT::bf16) {
2633 SDLoc Loc(Op);
2634 return DAG.getNode(
2635 ISD::FP_ROUND, Loc, MVT::bf16,
2636 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2637 DAG.getIntPtrConstant(0, Loc));
2638 }
2639
2640 // Everything else is considered legal.
2641 return Op;
2642}
2643
2644SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2645 SelectionDAG &DAG) const {
2646 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2647
2648 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2649 SDLoc Loc(Op);
2650 return DAG.getNode(
2651 Op.getOpcode(), Loc, Op.getValueType(),
2652 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2653 }
2654
2655 // Everything else is considered legal.
2656 return Op;
2657}
2658
2659SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2660 SelectionDAG &DAG) const {
2661 EVT NarrowVT = Op.getValueType();
2662 SDValue Wide = Op.getOperand(0);
2663 EVT WideVT = Wide.getValueType();
2664 if (NarrowVT.getScalarType() == MVT::bf16) {
2665 const TargetLowering *TLI = STI.getTargetLowering();
2666 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2667 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2668 }
2669 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2670 // This combination was the first to support f32 -> bf16.
2671 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2672 if (WideVT.getScalarType() == MVT::f32) {
2673 return Op;
2674 }
2675 if (WideVT.getScalarType() == MVT::f64) {
2676 SDLoc Loc(Op);
2677 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2678 // the hardware f32 -> bf16 instruction.
2680 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2681 : MVT::f32,
2682 Wide, Loc, DAG);
2683 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2684 }
2685 }
2686 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2687 }
2688 }
2689
2690 // Everything else is considered legal.
2691 return Op;
2692}
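// Illustrative sketch (editor's addition): the round-to-nearest-even f32->bf16
// rounding that the bf16 paths above target, whether produced by the hardware
// cvt instruction or by expandFP_ROUND. NaN handling is omitted for brevity;
// hypothetical helper, assumes <cstdint> and <cstring> (for memcpy).
static inline uint16_t exampleF32ToBf16RN(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  // Add 0x7FFF plus the LSB of the kept half so that ties round to even.
  uint32_t Lsb = (Bits >> 16) & 1u;
  return uint16_t((Bits + 0x7FFFu + Lsb) >> 16);
}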
2693
2694SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2695 SelectionDAG &DAG) const {
2696 SDValue Narrow = Op.getOperand(0);
2697 EVT NarrowVT = Narrow.getValueType();
2698 EVT WideVT = Op.getValueType();
2699 if (NarrowVT.getScalarType() == MVT::bf16) {
2700 if (WideVT.getScalarType() == MVT::f32 &&
2701 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2702 SDLoc Loc(Op);
2703 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2704 }
2705 if (WideVT.getScalarType() == MVT::f64 &&
2706 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2707 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2708 : MVT::f32;
2709 SDLoc Loc(Op);
2710 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2711 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2712 } else {
2713 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2714 }
2715 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2716 }
2717 }
2718
2719 // Everything else is considered legal.
2720 return Op;
2721}
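// Illustrative sketch (editor's addition): extending bf16 to f32 is exact; it
// only repositions the 16 stored bits into the top half of an f32, which is
// what the BF16_TO_FP / FP_EXTEND handling above amounts to bit-wise.
// Hypothetical helper; assumes <cstdint> and <cstring>.
static inline float exampleBf16ToF32(uint16_t B) {
  uint32_t Bits = uint32_t(B) << 16; // bf16 shares f32's exponent layout
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}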
2722
2724 SDLoc DL(Op);
2725 if (Op.getValueType() != MVT::v2i16)
2726 return Op;
2727 EVT EltVT = Op.getValueType().getVectorElementType();
2728 SmallVector<SDValue> VecElements;
2729 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2730 SmallVector<SDValue> ScalarArgs;
2731 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2732 [&](const SDUse &O) {
2733 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2734 O.get(), DAG.getIntPtrConstant(I, DL));
2735 });
2736 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2737 }
2738 SDValue V =
2739 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2740 return V;
2741}
2742
2743SDValue
2745 switch (Op.getOpcode()) {
2746 case ISD::RETURNADDR:
2747 return SDValue();
2748 case ISD::FRAMEADDR:
2749 return SDValue();
2750 case ISD::GlobalAddress:
2751 return LowerGlobalAddress(Op, DAG);
2753 return Op;
2754 case ISD::BUILD_VECTOR:
2755 return LowerBUILD_VECTOR(Op, DAG);
2757 return Op;
2759 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2761 return LowerINSERT_VECTOR_ELT(Op, DAG);
2763 return LowerVECTOR_SHUFFLE(Op, DAG);
2765 return LowerCONCAT_VECTORS(Op, DAG);
2766 case ISD::STORE:
2767 return LowerSTORE(Op, DAG);
2768 case ISD::LOAD:
2769 return LowerLOAD(Op, DAG);
2770 case ISD::SHL_PARTS:
2771 return LowerShiftLeftParts(Op, DAG);
2772 case ISD::SRA_PARTS:
2773 case ISD::SRL_PARTS:
2774 return LowerShiftRightParts(Op, DAG);
2775 case ISD::SELECT:
2776 return LowerSelect(Op, DAG);
2777 case ISD::FROUND:
2778 return LowerFROUND(Op, DAG);
2779 case ISD::SINT_TO_FP:
2780 case ISD::UINT_TO_FP:
2781 return LowerINT_TO_FP(Op, DAG);
2782 case ISD::FP_TO_SINT:
2783 case ISD::FP_TO_UINT:
2784 return LowerFP_TO_INT(Op, DAG);
2785 case ISD::FP_ROUND:
2786 return LowerFP_ROUND(Op, DAG);
2787 case ISD::FP_EXTEND:
2788 return LowerFP_EXTEND(Op, DAG);
2789 case ISD::VAARG:
2790 return LowerVAARG(Op, DAG);
2791 case ISD::VASTART:
2792 return LowerVASTART(Op, DAG);
2793 case ISD::ABS:
2794 case ISD::SMIN:
2795 case ISD::SMAX:
2796 case ISD::UMIN:
2797 case ISD::UMAX:
2798 case ISD::ADD:
2799 case ISD::SUB:
2800 case ISD::MUL:
2801 case ISD::SHL:
2802 case ISD::SREM:
2803 case ISD::UREM:
2804 return LowerVectorArith(Op, DAG);
2806 return LowerDYNAMIC_STACKALLOC(Op, DAG);
2807 default:
2808 llvm_unreachable("Custom lowering not defined for operation");
2809 }
2810}
2811
2812// This function is almost a copy of SelectionDAG::expandVAArg().
2813// The only difference is that this one produces loads from the local address space.
2814SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2815 const TargetLowering *TLI = STI.getTargetLowering();
2816 SDLoc DL(Op);
2817
2818 SDNode *Node = Op.getNode();
2819 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2820 EVT VT = Node->getValueType(0);
2821 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2822 SDValue Tmp1 = Node->getOperand(0);
2823 SDValue Tmp2 = Node->getOperand(1);
2824 const MaybeAlign MA(Node->getConstantOperandVal(3));
2825
2826 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2827 Tmp1, Tmp2, MachinePointerInfo(V));
2828 SDValue VAList = VAListLoad;
2829
2830 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2831 VAList = DAG.getNode(
2832 ISD::ADD, DL, VAList.getValueType(), VAList,
2833 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2834
2835 VAList = DAG.getNode(
2836 ISD::AND, DL, VAList.getValueType(), VAList,
2837 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2838 }
2839
2840 // Increment the pointer, VAList, to the next vaarg
2841 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2843 DL, VAList.getValueType()));
2844
2845 // Store the incremented VAList to the legalized pointer
2846 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2848
2849 const Value *SrcV =
2851
2852 // Load the actual argument out of the pointer VAList
2853 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2854}
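// Illustrative sketch (editor's addition): the over-alignment adjustment in
// LowerVAARG above ("add Align-1, then mask") in plain integer arithmetic.
// Hypothetical helper; Align must be a power of two.
static inline uint64_t exampleAlignUp(uint64_t Addr, uint64_t Align) {
  // e.g. exampleAlignUp(0x1003, 8) == 0x1008; already-aligned values are kept.
  return (Addr + Align - 1) & ~(Align - 1);
}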
2855
2856SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2857 const TargetLowering *TLI = STI.getTargetLowering();
2858 SDLoc DL(Op);
2859 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2860
2861 // Store the address of unsized array <function>_vararg[] in the ap object.
2862 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2863 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2864
2865 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2866 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2867 MachinePointerInfo(SV));
2868}
2869
2870SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2871 SDValue Op0 = Op->getOperand(0);
2872 SDValue Op1 = Op->getOperand(1);
2873 SDValue Op2 = Op->getOperand(2);
2874 SDLoc DL(Op.getNode());
2875
2876 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2877
2878 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2879 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2880 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2881 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2882
2883 return Trunc;
2884}
2885
2886SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2887 if (Op.getValueType() == MVT::i1)
2888 return LowerLOADi1(Op, DAG);
2889
2890 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2891 // unaligned loads and have to handle it here.
2892 EVT VT = Op.getValueType();
2893 if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2894 LoadSDNode *Load = cast<LoadSDNode>(Op);
2895 EVT MemVT = Load->getMemoryVT();
2897 MemVT, *Load->getMemOperand())) {
2898 SDValue Ops[2];
2899 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2900 return DAG.getMergeValues(Ops, SDLoc(Op));
2901 }
2902 }
2903
2904 return SDValue();
2905}
2906
2907// v = ld i1* addr
2908// =>
2909// v1 = ld i8* addr (-> i16)
2910// v = trunc i16 to i1
2911SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2912 SDNode *Node = Op.getNode();
2913 LoadSDNode *LD = cast<LoadSDNode>(Node);
2914 SDLoc dl(Node);
2915 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2916 assert(Node->getValueType(0) == MVT::i1 &&
2917 "Custom lowering for i1 load only");
2918 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2919 LD->getPointerInfo(), LD->getAlign(),
2920 LD->getMemOperand()->getFlags());
2921 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2922 // The legalizer (the caller) is expecting two values from the legalized
2923 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2924 // in LegalizeDAG.cpp which also uses MergeValues.
2925 SDValue Ops[] = { result, LD->getChain() };
2926 return DAG.getMergeValues(Ops, dl);
2927}
2928
2929SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2930 StoreSDNode *Store = cast<StoreSDNode>(Op);
2931 EVT VT = Store->getMemoryVT();
2932
2933 if (VT == MVT::i1)
2934 return LowerSTOREi1(Op, DAG);
2935
2936 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2937 // handle unaligned stores and have to handle them here.
2938 if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2940 VT, *Store->getMemOperand()))
2941 return expandUnalignedStore(Store, DAG);
2942
2943 // v2f16, v2bf16, v2i16 and v4i8 don't need any further special handling.
2944 if (Isv2x16VT(VT) || VT == MVT::v4i8)
2945 return SDValue();
2946
2947 if (VT.isVector())
2948 return LowerSTOREVector(Op, DAG);
2949
2950 return SDValue();
2951}
2952
2953SDValue
2954NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2955 SDNode *N = Op.getNode();
2956 SDValue Val = N->getOperand(1);
2957 SDLoc DL(N);
2958 EVT ValVT = Val.getValueType();
2959
2960 if (ValVT.isVector()) {
2961 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2962 // legal. We can (and should) split that into 2 stores of <2 x double> here
2963 // but I'm leaving that as a TODO for now.
2964 if (!ValVT.isSimple())
2965 return SDValue();
2966 switch (ValVT.getSimpleVT().SimpleTy) {
2967 default:
2968 return SDValue();
2969 case MVT::v2i8:
2970 case MVT::v2i16:
2971 case MVT::v2i32:
2972 case MVT::v2i64:
2973 case MVT::v2f16:
2974 case MVT::v2bf16:
2975 case MVT::v2f32:
2976 case MVT::v2f64:
2977 case MVT::v4i8:
2978 case MVT::v4i16:
2979 case MVT::v4i32:
2980 case MVT::v4f16:
2981 case MVT::v4bf16:
2982 case MVT::v4f32:
2983 case MVT::v8f16: // <4 x f16x2>
2984 case MVT::v8bf16: // <4 x bf16x2>
2985 case MVT::v8i16: // <4 x i16x2>
2986 // This is a "native" vector type
2987 break;
2988 }
2989
2990 MemSDNode *MemSD = cast<MemSDNode>(N);
2991 const DataLayout &TD = DAG.getDataLayout();
2992
2993 Align Alignment = MemSD->getAlign();
2994 Align PrefAlign =
2995 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2996 if (Alignment < PrefAlign) {
2997 // This store is not sufficiently aligned, so bail out and let this vector
2998 // store be scalarized. Note that we may still be able to emit smaller
2999 // vector stores. For example, if we are storing a <4 x float> with an
3000 // alignment of 8, this check will fail but the legalizer will try again
3001 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3002 return SDValue();
3003 }
3004
3005 unsigned Opcode = 0;
3006 EVT EltVT = ValVT.getVectorElementType();
3007 unsigned NumElts = ValVT.getVectorNumElements();
3008
3009 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
3010 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3011 // stored type to i16 and propagate the "real" type as the memory type.
3012 bool NeedExt = false;
3013 if (EltVT.getSizeInBits() < 16)
3014 NeedExt = true;
3015
3016 bool StoreF16x2 = false;
3017 switch (NumElts) {
3018 default:
3019 return SDValue();
3020 case 2:
3021 Opcode = NVPTXISD::StoreV2;
3022 break;
3023 case 4:
3024 Opcode = NVPTXISD::StoreV4;
3025 break;
3026 case 8:
3027 // v8f16 is a special case. PTX doesn't have st.v8.f16
3028 // instruction. Instead, we split the vector into v2f16 chunks and
3029 // store them with st.v4.b32.
3030 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
3031 Opcode = NVPTXISD::StoreV4;
3032 StoreF16x2 = true;
3033 break;
3034 }
3035
3037
3038 // First is the chain
3039 Ops.push_back(N->getOperand(0));
3040
3041 if (StoreF16x2) {
3042 // Combine f16,f16 -> v2f16
3043 NumElts /= 2;
3044 for (unsigned i = 0; i < NumElts; ++i) {
3045 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3046 DAG.getIntPtrConstant(i * 2, DL));
3047 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3048 DAG.getIntPtrConstant(i * 2 + 1, DL));
3049 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
3050 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
3051 Ops.push_back(V2);
3052 }
3053 } else {
3054 // Then the split values
3055 for (unsigned i = 0; i < NumElts; ++i) {
3056 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3057 DAG.getIntPtrConstant(i, DL));
3058 if (NeedExt)
3059 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3060 Ops.push_back(ExtVal);
3061 }
3062 }
3063
3064 // Then any remaining arguments
3065 Ops.append(N->op_begin() + 2, N->op_end());
3066
3067 SDValue NewSt =
3068 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3069 MemSD->getMemoryVT(), MemSD->getMemOperand());
3070
3071 // return DCI.CombineTo(N, NewSt, true);
3072 return NewSt;
3073 }
3074
3075 return SDValue();
3076}
3077
3078// st i1 v, addr
3079// =>
3080// v1 = zxt v to i16
3081// st.u8 i16, addr
3082SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3083 SDNode *Node = Op.getNode();
3084 SDLoc dl(Node);
3085 StoreSDNode *ST = cast<StoreSDNode>(Node);
3086 SDValue Tmp1 = ST->getChain();
3087 SDValue Tmp2 = ST->getBasePtr();
3088 SDValue Tmp3 = ST->getValue();
3089 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3090 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3091 SDValue Result =
3092 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3093 ST->getAlign(), ST->getMemOperand()->getFlags());
3094 return Result;
3095}
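// Illustrative sketch (editor's addition): the i1 store above, at the C++
// level, zero-extends the bit and stores a single byte, matching the
// "v1 = zxt v to i16; st.u8 [addr], v1" pattern in the comment. Hypothetical
// helper; assumes <cstdint>.
static inline void exampleStoreI1(uint8_t *Addr, bool V) {
  *Addr = V ? 1 : 0;
}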
3096
3097// This creates a target external symbol for a function parameter.
3098// The symbol's name is composed from the parameter's index and the function name.
3099// A negative index corresponds to the special parameter (unsized array) used
3100// for passing variable arguments.
3101SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3102 EVT v) const {
3103 StringRef SavedStr = nvTM->getStrPool().save(
3105 return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3106}
3107
3109 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3110 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3111 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3113 const DataLayout &DL = DAG.getDataLayout();
3114 auto PtrVT = getPointerTy(DAG.getDataLayout());
3115
3116 const Function *F = &MF.getFunction();
3117 const AttributeList &PAL = F->getAttributes();
3118 const TargetLowering *TLI = STI.getTargetLowering();
3119
3120 SDValue Root = DAG.getRoot();
3121 std::vector<SDValue> OutChains;
3122
3123 bool isABI = (STI.getSmVersion() >= 20);
3124 assert(isABI && "Non-ABI compilation is not supported");
3125 if (!isABI)
3126 return Chain;
3127
3128 std::vector<Type *> argTypes;
3129 std::vector<const Argument *> theArgs;
3130 for (const Argument &I : F->args()) {
3131 theArgs.push_back(&I);
3132 argTypes.push_back(I.getType());
3133 }
3134 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3135 // Ins.size() will be larger
3136 // * if there is an aggregate argument with multiple fields (each field
3137 // showing up separately in Ins)
3138 // * if there is a vector argument with more than typical vector-length
3139 // elements (generally if more than 4) where each vector element is
3140 // individually present in Ins.
3141 // So a different index should be used for indexing into Ins.
3142 // See similar issue in LowerCall.
3143 unsigned InsIdx = 0;
3144
3145 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3146 Type *Ty = argTypes[i];
3147
3148 if (theArgs[i]->use_empty()) {
3149 // argument is dead
3150 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3151 SmallVector<EVT, 16> vtparts;
3152
3153 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3154 if (vtparts.empty())
3155 report_fatal_error("Empty parameter types are not supported");
3156
3157 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3158 ++parti) {
3159 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3160 ++InsIdx;
3161 }
3162 if (vtparts.size() > 0)
3163 --InsIdx;
3164 continue;
3165 }
3166 if (Ty->isVectorTy()) {
3167 EVT ObjectVT = getValueType(DL, Ty);
3168 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3169 for (unsigned parti = 0; parti < NumRegs; ++parti) {
3170 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3171 ++InsIdx;
3172 }
3173 if (NumRegs > 0)
3174 --InsIdx;
3175 continue;
3176 }
3177 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3178 continue;
3179 }
3180
3181 // In the following cases, assign a node order of "i+1"
3182 // to newly created nodes. The SDNodes for params have to
3183 // appear in the same order as their order of appearance
3184 // in the original function. "i+1" holds that order.
3185 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3186 bool aggregateIsPacked = false;
3187 if (StructType *STy = dyn_cast<StructType>(Ty))
3188 aggregateIsPacked = STy->isPacked();
3189
3192 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3193 if (VTs.empty())
3194 report_fatal_error("Empty parameter types are not supported");
3195
3198 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3199
3200 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3201 int VecIdx = -1; // Index of the first element of the current vector.
3202 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3203 if (VectorInfo[parti] & PVF_FIRST) {
3204 assert(VecIdx == -1 && "Orphaned vector.");
3205 VecIdx = parti;
3206 }
3207
3208 // That's the last element of this store op.
3209 if (VectorInfo[parti] & PVF_LAST) {
3210 unsigned NumElts = parti - VecIdx + 1;
3211 EVT EltVT = VTs[parti];
3212 // i1 is loaded/stored as i8.
3213 EVT LoadVT = EltVT;
3214 if (EltVT == MVT::i1)
3215 LoadVT = MVT::i8;
3216 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3217 // getLoad needs a vector type, but it can't handle
3218 // vectors which contain v2f16 or v2bf16 elements. So we must load
3219 // using i32 here and then bitcast back.
3220 LoadVT = MVT::i32;
3221
3222 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3223 SDValue VecAddr =
3224 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3225 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3227 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3228
3229 const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3230 if (aggregateIsPacked)
3231 return Align(1);
3232 if (NumElts != 1)
3233 return std::nullopt;
3234 Align PartAlign =
3235 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
3236 return commonAlignment(PartAlign, Offsets[parti]);
3237 }();
3238 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3239 MachinePointerInfo(srcValue), PartAlign,
3242 if (P.getNode())
3243 P.getNode()->setIROrder(i + 1);
3244 for (unsigned j = 0; j < NumElts; ++j) {
3245 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3246 DAG.getIntPtrConstant(j, dl));
3247 // We've loaded i1 as an i8 and now must truncate it back to i1
3248 if (EltVT == MVT::i1)
3249 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3250 // v2f16 was loaded as an i32. Now we must bitcast it back.
3251 else if (EltVT != LoadVT)
3252 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3253
3254 // If a promoted integer type is used, truncate down to the original
3255 MVT PromotedVT;
3256 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3257 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3258 }
3259
3260 // Extend the element if necessary (e.g. an i8 is loaded
3261 // into an i16 register)
3262 if (Ins[InsIdx].VT.isInteger() &&
3263 Ins[InsIdx].VT.getFixedSizeInBits() >
3264 LoadVT.getFixedSizeInBits()) {
3265 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3267 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3268 }
3269 InVals.push_back(Elt);
3270 }
3271
3272 // Reset vector tracking state.
3273 VecIdx = -1;
3274 }
3275 ++InsIdx;
3276 }
3277 if (VTs.size() > 0)
3278 --InsIdx;
3279 continue;
3280 }
3281
3282 // Param has ByVal attribute
3283 // Return MoveParam(param symbol).
3284 // Ideally, the param symbol could be returned directly,
3285 // but when the SDNode builder decides to use it in a CopyToReg(),
3286 // building the machine instruction fails because a TargetExternalSymbol
3287 // (not lowered) is target dependent, and CopyToReg assumes
3288 // the source is lowered.
3289 EVT ObjectVT = getValueType(DL, Ty);
3290 assert(ObjectVT == Ins[InsIdx].VT &&
3291 "Ins type did not match function type");
3292 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3293 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3294 if (p.getNode())
3295 p.getNode()->setIROrder(i + 1);
3296 InVals.push_back(p);
3297 }
3298
3299 if (!OutChains.empty())
3300 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3301
3302 return Chain;
3303}
3304
3305// Use byte-stores when the param address of the return value is unaligned.
3306// This may happen when the return value is a field of a packed structure.
3308 uint64_t Offset, EVT ElementType,
3309 SDValue RetVal, const SDLoc &dl) {
3310 // Bit logic only works on integer types
3311 if (adjustElementType(ElementType))
3312 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
3313
3314 // Store each byte
3315 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3316 // Shift the byte to the last byte position
3317 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
3318 DAG.getConstant(i * 8, dl, MVT::i32));
3319 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
3320 ShiftVal};
3321 // Store only the last byte by using a truncating
3322 // st.param.b8;
3323 // the register type can be larger than b8.
3325 DAG.getVTList(MVT::Other), StoreOperands,
3326 MVT::i8, MachinePointerInfo(), std::nullopt,
3328 }
3329 return Chain;
3330}
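// Illustrative sketch (editor's addition): the byte-splitting loop above, shown
// for a 32-bit return value stored at an arbitrarily aligned param offset
// (hypothetical helper; assumes <cstdint>).
static inline void exampleStoreBytewise32(uint8_t *Addr, uint32_t V) {
  for (unsigned I = 0; I < 4; ++I)
    Addr[I] = uint8_t(V >> (I * 8)); // shift byte I down; store with b8 width
}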
3331
3332SDValue
3334 bool isVarArg,
3336 const SmallVectorImpl<SDValue> &OutVals,
3337 const SDLoc &dl, SelectionDAG &DAG) const {
3338 const MachineFunction &MF = DAG.getMachineFunction();
3339 const Function &F = MF.getFunction();
3341
3342 bool isABI = (STI.getSmVersion() >= 20);
3343 assert(isABI && "Non-ABI compilation is not supported");
3344 if (!isABI)
3345 return Chain;
3346
3347 const DataLayout &DL = DAG.getDataLayout();
3348 SmallVector<SDValue, 16> PromotedOutVals;
3351 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3352 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3353
3354 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3355 SDValue PromotedOutVal = OutVals[i];
3356 MVT PromotedVT;
3357 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3358 VTs[i] = EVT(PromotedVT);
3359 }
3360 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3362 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3363 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3364 }
3365 PromotedOutVals.push_back(PromotedOutVal);
3366 }
3367
3368 auto VectorInfo = VectorizePTXValueVTs(
3369 VTs, Offsets,
3371 : Align(1));
3372
3373 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3374 // 32-bits are sign extended or zero extended, depending on whether
3375 // they are signed or unsigned types.
3376 bool ExtendIntegerRetVal =
3377 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3378
3379 SmallVector<SDValue, 6> StoreOperands;
3380 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3381 SDValue OutVal = OutVals[i];
3382 SDValue RetVal = PromotedOutVals[i];
3383
3384 if (ExtendIntegerRetVal) {
3385 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3387 dl, MVT::i32, RetVal);
3388 } else if (OutVal.getValueSizeInBits() < 16) {
3389 // Use 16-bit registers for small load-stores as it's the
3390 // smallest general purpose register size supported by NVPTX.
3391 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3392 }
3393
3394 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3395 // for a scalar store. In such cases, fall back to byte stores.
3396 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
3397 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3398 Align ElementTypeAlign =
3399 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
3400 Align ElementAlign =
3401 commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
3402 if (ElementAlign < ElementTypeAlign) {
3403 assert(StoreOperands.empty() && "Orphaned operand list.");
3404 Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
3405 RetVal, dl);
3406
3407 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3408 // into the graph, so just move on to the next element.
3409 continue;
3410 }
3411 }
3412
3413 // New load/store. Record chain and offset operands.
3414 if (VectorInfo[i] & PVF_FIRST) {
3415 assert(StoreOperands.empty() && "Orphaned operand list.");
3416 StoreOperands.push_back(Chain);
3417 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3418 }
3419
3420 // Record the value to return.
3421 StoreOperands.push_back(RetVal);
3422
3423 // That's the last element of this store op.
3424 if (VectorInfo[i] & PVF_LAST) {
3426 unsigned NumElts = StoreOperands.size() - 2;
3427 switch (NumElts) {
3428 case 1:
3430 break;
3431 case 2:
3433 break;
3434 case 4:
3436 break;
3437 default:
3438 llvm_unreachable("Invalid vector info.");
3439 }
3440
3441 // Adjust type of load/store op if we've extended the scalar
3442 // return value.
3443 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3444 Chain = DAG.getMemIntrinsicNode(
3445 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3447 // Cleanup vector state.
3448 StoreOperands.clear();
3449 }
3450 }
3451
3452 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3453}
3454
3456 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3457 SelectionDAG &DAG) const {
3458 if (Constraint.size() > 1)
3459 return;
3460 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3461}
3462
3463static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3464 switch (Intrinsic) {
3465 default:
3466 return 0;
3467
3468 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3470 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3472 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3474 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3476 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3477 return NVPTXISD::Tex1DS32S32;
3478 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3480 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3482 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3484 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3485 return NVPTXISD::Tex1DU32S32;
3486 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3488 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3490 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3492
3493 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3495 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3497 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3499 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3501 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3503 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3505 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3507 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3509 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3511 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3513 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3515 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3517
3518 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3520 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3522 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3524 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3526 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3527 return NVPTXISD::Tex2DS32S32;
3528 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3530 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3532 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3534 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3535 return NVPTXISD::Tex2DU32S32;
3536 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3538 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3540 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3542
3543 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3545 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3547 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3549 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3551 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3553 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3555 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3557 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3559 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3561 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3563 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3565 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3567
3568 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3570 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3572 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3574 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3576 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3577 return NVPTXISD::Tex3DS32S32;
3578 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3580 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3582 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3584 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3585 return NVPTXISD::Tex3DU32S32;
3586 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3588 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3590 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3592
3593 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3595 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3597 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3599 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3601 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3603 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3605
3606 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3608 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3610 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3612 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3614 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3616 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3618
3619 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3621 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3623 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3625 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3627 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3629 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3631 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3633 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3635 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3637 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3639 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3641 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3643
3644 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3646 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3648 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3650 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3652 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3654 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3656 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3658 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3660 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3662 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3664 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3666 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3668
3669 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3671 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3673 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3675 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3677 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3679 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3681 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3683 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3685 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3687 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3689 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3691 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3693
3694 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3696 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3698 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3700 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3702 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3704 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3706 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3708 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3710 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3712 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3714 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3716 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3718
3719 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3721 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3723 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3725 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3727 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3729 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3731 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3733 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3735 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3737 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3739 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3741 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3743
3744 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3746 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3748 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3750 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3752 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3754 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3756 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3758 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3760 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3762 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3764 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3766 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3768
3769 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3771 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3773 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3775 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3777 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3779 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3781
3782 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3784 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3786 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3788 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3790 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3792 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3794
3795 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3797 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3799 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3801 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3803 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3805 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3807
3808 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3810 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3812 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3814 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3816 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3818 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3820 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3822 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3824 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3826 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3828 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3830 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3832 }
3833}
3834
3835static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3836 switch (Intrinsic) {
3837 default:
3838 return 0;
3839 case Intrinsic::nvvm_suld_1d_i8_clamp:
3841 case Intrinsic::nvvm_suld_1d_i16_clamp:
3843 case Intrinsic::nvvm_suld_1d_i32_clamp:
3845 case Intrinsic::nvvm_suld_1d_i64_clamp:
3847 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3849 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3851 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3853 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3855 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3857 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3859 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3861 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3863 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3865 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3867 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3869 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3871 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3873 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3875 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3877 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3879 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3881 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3883 case Intrinsic::nvvm_suld_2d_i8_clamp:
3885 case Intrinsic::nvvm_suld_2d_i16_clamp:
3887 case Intrinsic::nvvm_suld_2d_i32_clamp:
3889 case Intrinsic::nvvm_suld_2d_i64_clamp:
3891 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3893 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3895 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3897 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3899 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3901 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3903 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3905 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3907 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3909 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3911 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3913 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3915 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3917 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3919 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3921 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3923 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3925 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3927 case Intrinsic::nvvm_suld_3d_i8_clamp:
3929 case Intrinsic::nvvm_suld_3d_i16_clamp:
3931 case Intrinsic::nvvm_suld_3d_i32_clamp:
3933 case Intrinsic::nvvm_suld_3d_i64_clamp:
3935 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3937 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3939 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3941 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3943 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3945 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3947 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3949 case Intrinsic::nvvm_suld_1d_i8_trap:
3951 case Intrinsic::nvvm_suld_1d_i16_trap:
3953 case Intrinsic::nvvm_suld_1d_i32_trap:
3955 case Intrinsic::nvvm_suld_1d_i64_trap:
3957 case Intrinsic::nvvm_suld_1d_v2i8_trap:
3959 case Intrinsic::nvvm_suld_1d_v2i16_trap:
3961 case Intrinsic::nvvm_suld_1d_v2i32_trap:
3963 case Intrinsic::nvvm_suld_1d_v2i64_trap:
3965 case Intrinsic::nvvm_suld_1d_v4i8_trap:
3967 case Intrinsic::nvvm_suld_1d_v4i16_trap:
3969 case Intrinsic::nvvm_suld_1d_v4i32_trap:
3971 case Intrinsic::nvvm_suld_1d_array_i8_trap:
3973 case Intrinsic::nvvm_suld_1d_array_i16_trap:
3975 case Intrinsic::nvvm_suld_1d_array_i32_trap:
3977 case Intrinsic::nvvm_suld_1d_array_i64_trap:
3979 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3981 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3983 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3985 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3987 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3989 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3991 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3993 case Intrinsic::nvvm_suld_2d_i8_trap:
3995 case Intrinsic::nvvm_suld_2d_i16_trap:
3997 case Intrinsic::nvvm_suld_2d_i32_trap:
3999 case Intrinsic::nvvm_suld_2d_i64_trap:
4001 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4003 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4005 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4007 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4009 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4011 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4013 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4015 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4017 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4019 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4021 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4023 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4025 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4027 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4029 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4031 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4033 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4035 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4037 case Intrinsic::nvvm_suld_3d_i8_trap:
4039 case Intrinsic::nvvm_suld_3d_i16_trap:
4041 case Intrinsic::nvvm_suld_3d_i32_trap:
4043 case Intrinsic::nvvm_suld_3d_i64_trap:
4045 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4047 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4049 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4051 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4053 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4055 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4057 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4059 case Intrinsic::nvvm_suld_1d_i8_zero:
4061 case Intrinsic::nvvm_suld_1d_i16_zero:
4063 case Intrinsic::nvvm_suld_1d_i32_zero:
4065 case Intrinsic::nvvm_suld_1d_i64_zero:
4067 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4069 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4071 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4073 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4075 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4077 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4079 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4081 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4083 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4085 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4087 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4089 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4091 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4093 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4095 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4097 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4099 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4101 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4103 case Intrinsic::nvvm_suld_2d_i8_zero:
4105 case Intrinsic::nvvm_suld_2d_i16_zero:
4107 case Intrinsic::nvvm_suld_2d_i32_zero:
4109 case Intrinsic::nvvm_suld_2d_i64_zero:
4111 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4113 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4115 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4117 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4119 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4121 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4123 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4125 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4127 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4129 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4131 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4133 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4135 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4137 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4139 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4141 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4143 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4145 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4147 case Intrinsic::nvvm_suld_3d_i8_zero:
4149 case Intrinsic::nvvm_suld_3d_i16_zero:
4151 case Intrinsic::nvvm_suld_3d_i32_zero:
4153 case Intrinsic::nvvm_suld_3d_i64_zero:
4155 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4157 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4159 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4161 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4163 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4165 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4167 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4169 }
4170}
4171
4172 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4173 // TgtMemIntrinsic because we need the information that is only available
4174 // in the "Value" type of the destination pointer. In particular, the
4175 // address space information.
4176
4177 bool NVPTXTargetLowering::getTgtMemIntrinsic(
4178 IntrinsicInfo &Info, const CallInst &I,
4179 MachineFunction &MF, unsigned Intrinsic) const {
4180 switch (Intrinsic) {
4181 default:
4182 return false;
4183 case Intrinsic::nvvm_match_all_sync_i32p:
4184 case Intrinsic::nvvm_match_all_sync_i64p:
4186 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4187 // in order to model data exchange with other threads, but perform no real
4188 // memory accesses.
4189 Info.memVT = MVT::i1;
4190
4191 // Our result depends on both our own and the other threads' arguments.
4193 return true;
4194 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4195 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4196 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4197 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4198 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4199 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4200 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4201 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4202 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4203 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4204 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4205 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4206 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4207 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4208 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4209 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4210 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4211 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4212 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4213 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4214 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4215 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4216 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4217 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4219 Info.memVT = MVT::v8f16;
4220 Info.ptrVal = I.getArgOperand(0);
4221 Info.offset = 0;
4223 Info.align = Align(16);
4224 return true;
4225 }
4226 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4227 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4228 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4229 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4230 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4231 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4232 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4233 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4234 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4235 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4236 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4237 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4238 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4239 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4240 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4241 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4242 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4243 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4244 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4245 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4246 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4247 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4248 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4249 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4251 Info.memVT = MVT::v2i32;
4252 Info.ptrVal = I.getArgOperand(0);
4253 Info.offset = 0;
4255 Info.align = Align(8);
4256 return true;
4257 }
4258
4259 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4260 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4261 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4262 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4263 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4264 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4265 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4266 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4267 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4268 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4269 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4270 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4271 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4272 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4273 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4274 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4275
4276 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4277 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4278 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4279 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4280 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4281 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4282 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4283 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4284 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4285 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4286 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4287 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4288 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4289 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4290 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4291 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4292 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4293 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4295 Info.memVT = MVT::v4i32;
4296 Info.ptrVal = I.getArgOperand(0);
4297 Info.offset = 0;
4299 Info.align = Align(16);
4300 return true;
4301 }
4302
4303 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4304 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4305 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4306 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4307 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4308 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4309 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4310 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4311
4312 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4313 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4314 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4315 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4316 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4317 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4318 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4319 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4320 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4321 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4322 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4323 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4324 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4325 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4326 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4327 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4328 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4329 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4330 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4331 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4332 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4333 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4335 Info.memVT = MVT::i32;
4336 Info.ptrVal = I.getArgOperand(0);
4337 Info.offset = 0;
4339 Info.align = Align(4);
4340 return true;
4341 }
4342
4343 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4344 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4345 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4346 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4347 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4348 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4349 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4350 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4351 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4352 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4353 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4354 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4356 Info.memVT = MVT::v4f16;
4357 Info.ptrVal = I.getArgOperand(0);
4358 Info.offset = 0;
4360 Info.align = Align(16);
4361 return true;
4362 }
4363
4364 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4365 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4366 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4367 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4368 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4369 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4370 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4371 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4372 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4373 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4374 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4375 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4376 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4377 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4378 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4379 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4381 Info.memVT = MVT::v8f32;
4382 Info.ptrVal = I.getArgOperand(0);
4383 Info.offset = 0;
4385 Info.align = Align(16);
4386 return true;
4387 }
4388
4389 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4390 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4391 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4392 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4393
4394 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4395 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4396 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4397 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4398
4399 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4400 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4401 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4402 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4403 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4404 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4405 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4406 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4407 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4408 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4409 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4410 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4412 Info.memVT = MVT::v8i32;
4413 Info.ptrVal = I.getArgOperand(0);
4414 Info.offset = 0;
4416 Info.align = Align(16);
4417 return true;
4418 }
4419
4420 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4421 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4422 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4423 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4424 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4425 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4426 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4427 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4428 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4429 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4431 Info.memVT = MVT::v2i32;
4432 Info.ptrVal = I.getArgOperand(0);
4433 Info.offset = 0;
4435 Info.align = Align(8);
4436 return true;
4437 }
4438
4439 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4440 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4441 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4442 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4443
4444 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4445 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4446 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4447 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4449 Info.memVT = MVT::f64;
4450 Info.ptrVal = I.getArgOperand(0);
4451 Info.offset = 0;
4453 Info.align = Align(8);
4454 return true;
4455 }
4456
4457 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4458 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4459 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4460 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4462 Info.memVT = MVT::v2f64;
4463 Info.ptrVal = I.getArgOperand(0);
4464 Info.offset = 0;
4466 Info.align = Align(16);
4467 return true;
4468 }
4469
4470 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4471 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4472 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4473 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4474 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4475 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4476 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4477 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4478 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4479 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4480 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4481 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4483 Info.memVT = MVT::v4f16;
4484 Info.ptrVal = I.getArgOperand(0);
4485 Info.offset = 0;
4487 Info.align = Align(16);
4488 return true;
4489 }
4490
4491 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4492 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4493 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4494 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4495 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4496 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4497 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4498 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4499 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4500 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4501 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4502 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4503 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4504 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4505 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4506 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4508 Info.memVT = MVT::v8f32;
4509 Info.ptrVal = I.getArgOperand(0);
4510 Info.offset = 0;
4512 Info.align = Align(16);
4513 return true;
4514 }
4515
4516 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4517 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4518 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4519 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4520 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4521 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4522 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4523 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4524 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4525 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4526 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4527 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4529 Info.memVT = MVT::v8i32;
4530 Info.ptrVal = I.getArgOperand(0);
4531 Info.offset = 0;
4533 Info.align = Align(16);
4534 return true;
4535 }
4536
4537 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4538 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4539 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4540 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4541 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4542 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4543 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4544 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4546 Info.memVT = MVT::v2i32;
4547 Info.ptrVal = I.getArgOperand(0);
4548 Info.offset = 0;
4550 Info.align = Align(8);
4551 return true;
4552 }
4553
4554 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4555 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4556 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4557 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4559 Info.memVT = MVT::v2f64;
4560 Info.ptrVal = I.getArgOperand(0);
4561 Info.offset = 0;
4563 Info.align = Align(16);
4564 return true;
4565 }
4566
4567 case Intrinsic::nvvm_atomic_load_inc_32:
4568 case Intrinsic::nvvm_atomic_load_dec_32:
4569
4570 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4571 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4572 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4573 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4574 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4575 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4576 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4577 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4578 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4579 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4580 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4581 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4582 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4583 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4584 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4585 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4586 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4587 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4588 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4589 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4590 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4591 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4592 auto &DL = I.getDataLayout();
4594 Info.memVT = getValueType(DL, I.getType());
4595 Info.ptrVal = I.getArgOperand(0);
4596 Info.offset = 0;
4598 Info.align.reset();
4599 return true;
4600 }
4601
4602 case Intrinsic::nvvm_ldu_global_i:
4603 case Intrinsic::nvvm_ldu_global_f:
4604 case Intrinsic::nvvm_ldu_global_p: {
4605 auto &DL = I.getDataLayout();
4607 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4608 Info.memVT = getValueType(DL, I.getType());
4609 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
4610 Info.memVT = getPointerTy(DL);
4611 else
4612 Info.memVT = getValueType(DL, I.getType());
4613 Info.ptrVal = I.getArgOperand(0);
4614 Info.offset = 0;
4616 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4617
4618 return true;
4619 }
4620 case Intrinsic::nvvm_ldg_global_i:
4621 case Intrinsic::nvvm_ldg_global_f:
4622 case Intrinsic::nvvm_ldg_global_p: {
4623 auto &DL = I.getDataLayout();
4624
4626 if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4627 Info.memVT = getValueType(DL, I.getType());
4628 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p)
4629 Info.memVT = getPointerTy(DL);
4630 else
4631 Info.memVT = getValueType(DL, I.getType());
4632 Info.ptrVal = I.getArgOperand(0);
4633 Info.offset = 0;
4635 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4636
4637 return true;
4638 }
4639
4640 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4641 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4642 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4643 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4644 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4645 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4646 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4647 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4648 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4649 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4650 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4651 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4652 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4653 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4654 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4655 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4656 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4657 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4658 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4659 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4660 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4661 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4662 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4663 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4664 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4665 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4666 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4667 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4668 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4669 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4670 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4671 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4672 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4673 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4674 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4675 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4676 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4677 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4678 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4679 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4680 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4681 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4682 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4683 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4684 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4685 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4686 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4687 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4688 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4689 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4690 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4691 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4692 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4693 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4694 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4695 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4696 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4697 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4698 Info.opc = getOpcForTextureInstr(Intrinsic);
4699 Info.memVT = MVT::v4f32;
4700 Info.ptrVal = nullptr;
4701 Info.offset = 0;
4703 Info.align = Align(16);
4704 return true;
4705
4706 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4707 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4708 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4709 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4710 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4711 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4712 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4713 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4714 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4715 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4716 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4717 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4718 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4719 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4720 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4721 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4722 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4723 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4724 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4725 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4726 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4727 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4728 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4729 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4730 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4731 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4732 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4733 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4734 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4735 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4736 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4737 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4738 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4739 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4740 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4741 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4742 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4743 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4744 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4745 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4746 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4747 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4748 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4749 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4750 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4751 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4752 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4753 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4754 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4755 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4756 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4757 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4758 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4759 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4760 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4761 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4762 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4763 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4764 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4765 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4766 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4767 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4768 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4769 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4770 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4771 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4772 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4773 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4774 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4775 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4776 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4777 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4778 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4779 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4780 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4781 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4782 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4783 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4784 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4785 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4786 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4787 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4788 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4789 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4790 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4791 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4792 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4793 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4794 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4795 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4796 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4797 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4798 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4799 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4800 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4801 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4802 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4803 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4804 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4805 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4806 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4807 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4808 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4809 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4810 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4811 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4812 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4813 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4814 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4815 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4816 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4817 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4818 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4819 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4820 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4821 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4822 Info.opc = getOpcForTextureInstr(Intrinsic);
4823 Info.memVT = MVT::v4i32;
4824 Info.ptrVal = nullptr;
4825 Info.offset = 0;
4827 Info.align = Align(16);
4828 return true;
4829
4830 case Intrinsic::nvvm_suld_1d_i8_clamp:
4831 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4832 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4833 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4834 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4835 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4836 case Intrinsic::nvvm_suld_2d_i8_clamp:
4837 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4838 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4839 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4840 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4841 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4842 case Intrinsic::nvvm_suld_3d_i8_clamp:
4843 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4844 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4845 case Intrinsic::nvvm_suld_1d_i8_trap:
4846 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4847 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4848 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4849 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4850 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4851 case Intrinsic::nvvm_suld_2d_i8_trap:
4852 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4853 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4854 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4855 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4856 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4857 case Intrinsic::nvvm_suld_3d_i8_trap:
4858 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4859 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4860 case Intrinsic::nvvm_suld_1d_i8_zero:
4861 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4862 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4863 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4864 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4865 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4866 case Intrinsic::nvvm_suld_2d_i8_zero:
4867 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4868 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4869 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4870 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4871 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4872 case Intrinsic::nvvm_suld_3d_i8_zero:
4873 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4874 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4875 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4876 Info.memVT = MVT::i8;
4877 Info.ptrVal = nullptr;
4878 Info.offset = 0;
4880 Info.align = Align(16);
4881 return true;
4882
4883 case Intrinsic::nvvm_suld_1d_i16_clamp:
4884 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4885 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4886 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4887 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4888 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4889 case Intrinsic::nvvm_suld_2d_i16_clamp:
4890 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4891 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4892 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4893 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4894 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4895 case Intrinsic::nvvm_suld_3d_i16_clamp:
4896 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4897 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4898 case Intrinsic::nvvm_suld_1d_i16_trap:
4899 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4900 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4901 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4902 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4903 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4904 case Intrinsic::nvvm_suld_2d_i16_trap:
4905 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4906 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4907 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4908 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4909 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4910 case Intrinsic::nvvm_suld_3d_i16_trap:
4911 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4912 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4913 case Intrinsic::nvvm_suld_1d_i16_zero:
4914 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4915 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4916 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4917 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4918 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4919 case Intrinsic::nvvm_suld_2d_i16_zero:
4920 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4921 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4922 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4923 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4924 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4925 case Intrinsic::nvvm_suld_3d_i16_zero:
4926 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4927 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4928 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4929 Info.memVT = MVT::i16;
4930 Info.ptrVal = nullptr;
4931 Info.offset = 0;
4933 Info.align = Align(16);
4934 return true;
4935
4936 case Intrinsic::nvvm_suld_1d_i32_clamp:
4937 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4938 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4939 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4940 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4941 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4942 case Intrinsic::nvvm_suld_2d_i32_clamp:
4943 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4944 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4945 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4946 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4947 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4948 case Intrinsic::nvvm_suld_3d_i32_clamp:
4949 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4950 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4951 case Intrinsic::nvvm_suld_1d_i32_trap:
4952 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4953 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4954 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4955 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4956 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4957 case Intrinsic::nvvm_suld_2d_i32_trap:
4958 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4959 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4960 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4961 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4962 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4963 case Intrinsic::nvvm_suld_3d_i32_trap:
4964 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4965 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4966 case Intrinsic::nvvm_suld_1d_i32_zero:
4967 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4968 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4969 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4970 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4971 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4972 case Intrinsic::nvvm_suld_2d_i32_zero:
4973 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4974 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4975 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4976 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4977 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4978 case Intrinsic::nvvm_suld_3d_i32_zero:
4979 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4980 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4981 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4982 Info.memVT = MVT::i32;
4983 Info.ptrVal = nullptr;
4984 Info.offset = 0;
4986 Info.align = Align(16);
4987 return true;
4988
4989 case Intrinsic::nvvm_suld_1d_i64_clamp:
4990 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4991 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4992 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4993 case Intrinsic::nvvm_suld_2d_i64_clamp:
4994 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4995 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4996 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4997 case Intrinsic::nvvm_suld_3d_i64_clamp:
4998 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4999 case Intrinsic::nvvm_suld_1d_i64_trap:
5000 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5001 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5002 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5003 case Intrinsic::nvvm_suld_2d_i64_trap:
5004 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5005 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5006 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5007 case Intrinsic::nvvm_suld_3d_i64_trap:
5008 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5009 case Intrinsic::nvvm_suld_1d_i64_zero:
5010 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5011 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5012 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5013 case Intrinsic::nvvm_suld_2d_i64_zero:
5014 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5015 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5016 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5017 case Intrinsic::nvvm_suld_3d_i64_zero:
5018 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5019 Info.opc = getOpcForSurfaceInstr(Intrinsic);
5020 Info.memVT = MVT::i64;
5021 Info.ptrVal = nullptr;
5022 Info.offset = 0;
5024 Info.align = Align(16);
5025 return true;
5026 }
5027 return false;
5028}
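// A rough sketch of how the table above is consumed: when SelectionDAGBuilder
// visits a call to one of these intrinsics and this hook returns true, it
// builds a memory SDNode carrying the reported memVT, pointer operand, and
// alignment. For example, an m16n16k16 load_a f16 fragment load is modeled as
// a chained intrinsic node that reads v8f16 with 16-byte alignment.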
5029
5030/// getFunctionParamOptimizedAlign - since function arguments are passed via
5031/// .param space, we may want to increase their alignment in a way that
5032/// ensures that we can effectively vectorize their loads & stores. We can
5033 /// increase alignment only if the function has internal or private
5034 /// linkage, as for other linkage types callers may already rely on default
5035/// alignment. To allow using 128-bit vectorized loads/stores, this function
5036/// ensures that alignment is 16 or greater.
5037 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5038 const Function *F, Type *ArgTy, const DataLayout &DL) const {
5039 // Capping the alignment to 128 bytes as that is the maximum alignment
5040 // supported by PTX.
5041 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5042
5043 // If a function has linkage different from internal or private, we
5044 // must use default ABI alignment as external users rely on it. Same
5045 // for a function that may be called from a function pointer.
5046 if (!F || !F->hasLocalLinkage() ||
5047 F->hasAddressTaken(/*Users=*/nullptr,
5048 /*IgnoreCallbackUses=*/false,
5049 /*IgnoreAssumeLikeCalls=*/true,
5050 /*IgnoreLLVMUsed=*/true))
5051 return ABITypeAlign;
5052
5053 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5054 return std::max(Align(16), ABITypeAlign);
5055}
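// Illustrative effect of the rule above: for an internal function whose
// address is never taken, a parameter whose ABI alignment would otherwise be
// 4 or 8 has its .param alignment raised to 16, so later passes can turn its
// loads/stores into 128-bit vectorized ld.param/st.param accesses. Externally
// visible functions keep the plain ABI alignment, since callers may already
// rely on it.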
5056
5057/// Helper for computing alignment of a device function byval parameter.
5058 Align NVPTXTargetLowering::getFunctionByValParamAlign(
5059 const Function *F, Type *ArgTy, Align InitialAlign,
5060 const DataLayout &DL) const {
5061 Align ArgAlign = InitialAlign;
5062 // Try to increase alignment to enhance vectorization options.
5063 if (F)
5064 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5065
5066 // Old ptxas versions have a bug. When PTX code takes the address of a
5067 // byval parameter with alignment < 4, ptxas generates code to
5068 // spill the argument into memory. Alas, on sm_50+ ptxas generates
5069 // SASS code that fails with a misaligned access. To work around
5070 // the problem, make sure that we align byval parameters by at
5071 // least 4. This bug seems to be fixed at least starting from
5072 // ptxas > 9.0.
5073 // TODO: remove this after verifying the bug is not reproduced
5074 // on non-deprecated ptxas versions.
5075 if (ForceMinByValParamAlign)
5076 ArgAlign = std::max(ArgAlign, Align(4));
5077
5078 return ArgAlign;
5079}
5080
5081 // Helper for getting a function parameter name. The name is composed from
5082 // its index and the function name. A negative index corresponds to the
5083 // special parameter (unsized array) used for passing variable arguments.
5084 std::string NVPTXTargetLowering::getParamName(const Function *F,
5085 int Idx) const {
5086 std::string ParamName;
5087 raw_string_ostream ParamStr(ParamName);
5088
5089 ParamStr << getTargetMachine().getSymbol(F)->getName();
5090 if (Idx < 0)
5091 ParamStr << "_vararg";
5092 else
5093 ParamStr << "_param_" << Idx;
5094
5095 return ParamName;
5096}
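// Example of the generated names (with a hypothetical function symbol "foo"):
// parameter 0 becomes "foo_param_0", parameter 1 becomes "foo_param_1", and
// the unsized array used for variadic arguments (Idx < 0) becomes
// "foo_vararg".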
5097
5098/// isLegalAddressingMode - Return true if the addressing mode represented
5099/// by AM is legal for this target, for a load/store of the specified type.
5100/// Used to guide target specific optimizations, like loop strength reduction
5101/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5102/// (CodeGenPrepare.cpp)
5103 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5104 const AddrMode &AM, Type *Ty,
5105 unsigned AS, Instruction *I) const {
5106 // AddrMode - This represents an addressing mode of:
5107 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5108 //
5109 // The legal address modes are
5110 // - [avar]
5111 // - [areg]
5112 // - [areg+immoff]
5113 // - [immAddr]
5114
5115 if (AM.BaseGV) {
5116 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5117 }
5118
5119 switch (AM.Scale) {
5120 case 0: // "r", "r+i" or "i" is allowed
5121 break;
5122 case 1:
5123 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5124 return false;
5125 // Otherwise we have r+i.
5126 break;
5127 default:
5128 // No scale > 1 is allowed
5129 return false;
5130 }
5131 return true;
5132}
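// Concretely (illustrative): addresses such as [sym], [reg], and [reg+imm]
// are accepted, while anything that needs a second or scaled index register,
// e.g. [reg+reg] or [reg+4*idx], is rejected, which steers passes like
// LoopStrengthReduce away from forming such addressing expressions.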
5133
5134//===----------------------------------------------------------------------===//
5135// NVPTX Inline Assembly Support
5136//===----------------------------------------------------------------------===//
5137
5138/// getConstraintType - Given a constraint letter, return the type of
5139/// constraint it is for this target.
5140 NVPTXTargetLowering::ConstraintType
5141 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5142 if (Constraint.size() == 1) {
5143 switch (Constraint[0]) {
5144 default:
5145 break;
5146 case 'b':
5147 case 'r':
5148 case 'h':
5149 case 'c':
5150 case 'l':
5151 case 'f':
5152 case 'd':
5153 case '0':
5154 case 'N':
5155 return C_RegisterClass;
5156 }
5157 }
5158 return TargetLowering::getConstraintType(Constraint);
5159}
5160
5161std::pair<unsigned, const TargetRegisterClass *>
5162 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5163 StringRef Constraint,
5164 MVT VT) const {
5165 if (Constraint.size() == 1) {
5166 switch (Constraint[0]) {
5167 case 'b':
5168 return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5169 case 'c':
5170 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5171 case 'h':
5172 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5173 case 'r':
5174 return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5175 case 'l':
5176 case 'N':
5177 return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5178 case 'f':
5179 return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5180 case 'd':
5181 return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5182 }
5183 }
5184 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5185}
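// Summary of the letters handled above: 'b' maps to the 1-bit predicate
// registers, 'c' and 'h' to 16-bit, 'r' to 32-bit, and 'l'/'N' to 64-bit
// integer registers, while 'f' and 'd' select the f32 and f64 register
// classes. As an illustrative use, inline asm such as
//   asm("add.s32 %0, %1, %2;" : "=r"(dst) : "r"(a), "r"(b));
// constrains all three operands to 32-bit integer registers.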
5186
5187//===----------------------------------------------------------------------===//
5188// NVPTX DAG Combining
5189//===----------------------------------------------------------------------===//
5190
5191 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5192 CodeGenOptLevel OptLevel) const {
5193 // Always honor command-line argument
5194 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5195 return FMAContractLevelOpt > 0;
5196
5197 // Do not contract if we're not optimizing the code.
5198 if (OptLevel == CodeGenOptLevel::None)
5199 return false;
5200
5201 // Honor TargetOptions flags that explicitly say fusion is okay.
5202 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5203 return true;
5204
5205 return allowUnsafeFPMath(MF);
5206}
5207
5208 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5209 // Honor TargetOptions flags that explicitly say unsafe math is okay.
5210 if (MF.getTarget().Options.UnsafeFPMath)
5211 return true;
5212
5213 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5214 const Function &F = MF.getFunction();
5215 return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5216}
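// Net precedence for FMA formation, roughly: an explicit -nvptx-fma-level on
// the command line always wins; otherwise nothing is contracted at -O0;
// otherwise fusion is allowed when fp-contract=fast is in effect or when
// unsafe FP math is enabled, either globally or via the function's
// "unsafe-fp-math" attribute.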
5217
5218static bool isConstZero(const SDValue &Operand) {
5219 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5220 return Const && Const->getZExtValue() == 0;
5221}
5222
5223/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5224/// operands N0 and N1. This is a helper for PerformADDCombine that is
5225/// called with the default operands, and if that fails, with commuted
5226/// operands.
5227static SDValue
5228 PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5229 TargetLowering::DAGCombinerInfo &DCI) {
5230 EVT VT = N0.getValueType();
5231
5232 // Since integer multiply-add costs the same as integer multiply
5233 // but is more costly than integer add, do the fusion only when
5234 // the mul is only used in the add.
5235 // TODO: this may not be true for later architectures, consider relaxing this
5236 if (!N0.getNode()->hasOneUse())
5237 return SDValue();
5238
5239 // fold (add (mul a, b), c) -> (mad a, b, c)
5240 //
5241 if (N0.getOpcode() == ISD::MUL)
5242 return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
5243 N0.getOperand(1), N1);
5244
5245 // fold (add (select cond, 0, (mul a, b)), c)
5246 // -> (select cond, c, (mad a, b, c))
5247 //
5248 if (N0.getOpcode() == ISD::SELECT) {
5249 unsigned ZeroOpNum;
5250 if (isConstZero(N0->getOperand(1)))
5251 ZeroOpNum = 1;
5252 else if (isConstZero(N0->getOperand(2)))
5253 ZeroOpNum = 2;
5254 else
5255 return SDValue();
5256
5257 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5258 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5259 return SDValue();
5260
5261 SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5262 M->getOperand(0), M->getOperand(1), N1);
5263 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5264 ((ZeroOpNum == 1) ? N1 : MAD),
5265 ((ZeroOpNum == 1) ? MAD : N1));
5266 }
5267
5268 return SDValue();
5269}
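// Worked example of the select fold above (illustrative):
//   t = select cond, 0, (mul a, b)
//   r = add t, c
// becomes
//   r = select cond, c, (mad a, b, c)
// since the add collapses to c on the arm where the mul contributes zero, and
// the mul+add fuses into a single IMAD on the other arm.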
5270
5271static SDValue
5272 PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5273 TargetLowering::DAGCombinerInfo &DCI,
5274 CodeGenOptLevel OptLevel) {
5275 EVT VT = N0.getValueType();
5276 if (N0.getOpcode() == ISD::FMUL) {
5277 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5278 &DCI.DAG.getTargetLoweringInfo());
5279 if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
5280 return SDValue();
5281
5282 // For floating point:
5283 // Do the fusion only when the mul has fewer than 5 uses and all
5284 // of them are adds.
5285 // The heuristic is that if a use is not an add, then that use
5286 // cannot be fused into an fma, therefore the mul is still needed anyway.
5287 // If there are more than 4 uses, even if they are all adds, fusing
5288 // them will increase register pressure.
5289 //
5290 int numUses = 0;
5291 int nonAddCount = 0;
5292 for (const SDNode *User : N0.getNode()->uses()) {
5293 numUses++;
5294 if (User->getOpcode() != ISD::FADD)
5295 ++nonAddCount;
5296 if (numUses >= 5)
5297 return SDValue();
5298 }
5299 if (nonAddCount) {
5300 int orderNo = N->getIROrder();
5301 int orderNo2 = N0.getNode()->getIROrder();
5302 // Simple heuristic for estimating potential register pressure: the
5303 // difference between the IR orders measures the distance between def
5304 // and use, and the longer that distance, the more likely it is to
5305 // cause register pressure.
5306 if (orderNo - orderNo2 < 500)
5307 return SDValue();
5308
5309 // Now, check if at least one of the FMUL's operands is live beyond the
5310 // node N, which guarantees that the FMA will not increase register
5311 // pressure at node N.
5312 bool opIsLive = false;
5313 const SDNode *left = N0.getOperand(0).getNode();
5314 const SDNode *right = N0.getOperand(1).getNode();
5315
5316 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5317 opIsLive = true;
5318
5319 if (!opIsLive)
5320 for (const SDNode *User : left->uses()) {
5321 int orderNo3 = User->getIROrder();
5322 if (orderNo3 > orderNo) {
5323 opIsLive = true;
5324 break;
5325 }
5326 }
5327
5328 if (!opIsLive)
5329 for (const SDNode *User : right->uses()) {
5330 int orderNo3 = User->getIROrder();
5331 if (orderNo3 > orderNo) {
5332 opIsLive = true;
5333 break;
5334 }
5335 }
5336
5337 if (!opIsLive)
5338 return SDValue();
5339 }
5340
5341 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5342 N0.getOperand(1), N1);
5343 }
5344
5345 return SDValue();
5346}
5347
5348static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
5349 std::size_t Back) {
5350 if (all_of(N->ops().drop_front(Front).drop_back(Back),
5351 [](const SDUse &U) { return U.get()->isUndef(); }))
5352 // Operand 0 is the previous value in the chain. Cannot return EntryToken
5353 // as the previous value will become unused and eliminated later.
5354 return N->getOperand(0);
5355
5356 return SDValue();
5357}
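// Illustrative use of the helper above: a StoreRetval or StoreParam node
// whose stored values are all undef contributes nothing, so it is replaced by
// its incoming chain operand; the node then becomes dead and is removed while
// the chain ordering is preserved.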
5358
5359 static SDValue PerformStoreRetvalCombine(SDNode *N) {
5360 // Operands from the 3rd to the 2nd last one are the values to be stored.
5361 // {Chain, ArgID, Offset, Val, Glue}
5362 return PerformStoreCombineHelper(N, 3, 1);
5363}
5364
5365 static SDValue PerformStoreParamCombine(SDNode *N) {
5366 // Operands from the 2nd to the last one are the values to be stored
5367 return PerformStoreCombineHelper(N, 2, 0);
5368}
5369
5370/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5371///
5372 static SDValue PerformADDCombine(SDNode *N,
5373 TargetLowering::DAGCombinerInfo &DCI,
5374 CodeGenOptLevel OptLevel) {
5375 if (OptLevel == CodeGenOptLevel::None)
5376 return SDValue();
5377
5378 SDValue N0 = N->getOperand(0);
5379 SDValue N1 = N->getOperand(1);
5380
5381 // Skip vector types and any scalar type other than i32.
5382 EVT VT = N0.getValueType();
5383 if (VT.isVector() || VT != MVT::i32)
5384 return SDValue();
5385
5386 // First try with the default operand order.
5387 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5388 return Result;
5389
5390 // If that didn't work, try again with the operands commuted.
5391 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5392}
5393
5394/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5395///
5396 static SDValue PerformFADDCombine(SDNode *N,
5397 TargetLowering::DAGCombinerInfo &DCI,
5398 CodeGenOptLevel OptLevel) {
5399 SDValue N0 = N->getOperand(0);
5400 SDValue N1 = N->getOperand(1);
5401
5402 EVT VT = N0.getValueType();
5403 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5404 return SDValue();
5405
5406 // First try with the default operand order.
5407 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5408 return Result;
5409
5410 // If that didn't work, try again with the operands commuted.
5411 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5412}
5413
5414 static SDValue PerformANDCombine(SDNode *N,
5415 TargetLowering::DAGCombinerInfo &DCI) {
5416 // The type legalizer turns a vector load of i8 values into a zextload to i16
5417 // registers, optionally ANY_EXTENDs it (if target type is integer),
5418 // and ANDs off the high 8 bits. Since we turn this load into a
5419 // target-specific DAG node, the DAG combiner fails to eliminate these AND
5420 // nodes. Do that here.
5421 SDValue Val = N->getOperand(0);
5422 SDValue Mask = N->getOperand(1);
5423
5424 if (isa<ConstantSDNode>(Val)) {
5425 std::swap(Val, Mask);
5426 }
5427
5428 SDValue AExt;
5429
5430 // Convert BFE -> truncate i16 -> and 255
5431 // to just BFE -> truncate i16, as the value already has all the bits in the
5432 // right places.
5433 if (Val.getOpcode() == ISD::TRUNCATE) {
5434 SDValue BFE = Val.getOperand(0);
5435 if (BFE.getOpcode() != NVPTXISD::BFE)
5436 return SDValue();
5437
5438 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5439 if (!BFEBits)
5440 return SDValue();
5441 uint64_t BFEBitsVal = BFEBits->getZExtValue();
5442
5443 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5444 if (!MaskCnst) {
5445 // Not an AND with a constant
5446 return SDValue();
5447 }
5448 uint64_t MaskVal = MaskCnst->getZExtValue();
5449
5450 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5451 return SDValue();
5452 // If we get here, the AND is unnecessary. Just replace it with the trunc
5453 DCI.CombineTo(N, Val, false);
5454 }
5455 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5456 if (Val.getOpcode() == ISD::ANY_EXTEND) {
5457 AExt = Val;
5458 Val = Val->getOperand(0);
5459 }
5460
5461 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5462 Val = Val->getOperand(0);
5463 }
5464
5465 if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5466 Val->getOpcode() == NVPTXISD::LoadV4) {
5467 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5468 if (!MaskCnst) {
5469 // Not an AND with a constant
5470 return SDValue();
5471 }
5472
5473 uint64_t MaskVal = MaskCnst->getZExtValue();
5474 if (MaskVal != 0xff) {
5475 // Not an AND that chops off top 8 bits
5476 return SDValue();
5477 }
5478
5479 MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5480 if (!Mem) {
5481 // Not a MemSDNode?!?
5482 return SDValue();
5483 }
5484
5485 EVT MemVT = Mem->getMemoryVT();
5486 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5487 // We only handle the i8 case
5488 return SDValue();
5489 }
5490
5491 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
5492 if (ExtType == ISD::SEXTLOAD) {
5493 // If for some reason the load is a sextload, the and is needed to zero
5494 // out the high 8 bits
5495 return SDValue();
5496 }
5497
5498 bool AddTo = false;
5499 if (AExt.getNode() != nullptr) {
5500 // Re-insert the ext as a zext.
5501 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5502 AExt.getValueType(), Val);
5503 AddTo = true;
5504 }
5505
5506 // If we get here, the AND is unnecessary. Just replace it with the load
5507 DCI.CombineTo(N, Val, AddTo);
5508 }
5509
5510 return SDValue();
5511}
5512
5513static SDValue PerformREMCombine(SDNode *N,
5514 TargetLowering::DAGCombinerInfo &DCI,
5515 CodeGenOptLevel OptLevel) {
5516 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5517
5518 // Don't do anything at less than -O2.
5519 if (OptLevel < CodeGenOptLevel::Default)
5520 return SDValue();
5521
5522 SelectionDAG &DAG = DCI.DAG;
5523 SDLoc DL(N);
5524 EVT VT = N->getValueType(0);
5525 bool IsSigned = N->getOpcode() == ISD::SREM;
5526 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5527
5528 const SDValue &Num = N->getOperand(0);
5529 const SDValue &Den = N->getOperand(1);
5530
5531 for (const SDNode *U : Num->uses()) {
5532 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5533 U->getOperand(1) == Den) {
5534 // Num % Den -> Num - (Num / Den) * Den
5535 return DAG.getNode(ISD::SUB, DL, VT, Num,
5536 DAG.getNode(ISD::MUL, DL, VT,
5537 DAG.getNode(DivOpc, DL, VT, Num, Den),
5538 Den));
5539 }
5540 }
5541 return SDValue();
5542}
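// For reference: the rewrite only fires when a matching sdiv/udiv of the same
// operands already exists, so the quotient is computed once and the remainder
// costs one extra mul and sub.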
5543
5544enum OperandSignedness {
5545 Signed,
5546 Unsigned,
5547 Unknown
5548};
5549
5550/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5551/// that can be demoted to \p OptSize bits without loss of information. The
5552/// signedness of the operand, if determinable, is placed in \p S.
5553static bool IsMulWideOperandDemotable(SDValue Op,
5554 unsigned OptSize,
5555 OperandSignedness &S) {
5556 S = Unknown;
5557
5558 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5559 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5560 EVT OrigVT = Op.getOperand(0).getValueType();
5561 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5562 S = Signed;
5563 return true;
5564 }
5565 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5566 EVT OrigVT = Op.getOperand(0).getValueType();
5567 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5568 S = Unsigned;
5569 return true;
5570 }
5571 }
5572
5573 return false;
5574}
5575
5576/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5577/// be demoted to \p OptSize bits without loss of information. If the operands
5578/// contain a constant, it should appear as the RHS operand. The signedness of
5579/// the operands is placed in \p IsSigned.
5580static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5581 unsigned OptSize,
5582 bool &IsSigned) {
5583 OperandSignedness LHSSign;
5584
5585 // The LHS operand must be a demotable op
5586 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5587 return false;
5588
5589 // We should have been able to determine the signedness from the LHS
5590 if (LHSSign == Unknown)
5591 return false;
5592
5593 IsSigned = (LHSSign == Signed);
5594
5595 // The RHS can be a demotable op or a constant
5596 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5597 const APInt &Val = CI->getAPIntValue();
5598 if (LHSSign == Unsigned) {
5599 return Val.isIntN(OptSize);
5600 } else {
5601 return Val.isSignedIntN(OptSize);
5602 }
5603 } else {
5604 OperandSignedness RHSSign;
5605 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5606 return false;
5607
5608 return LHSSign == RHSSign;
5609 }
5610}
5611
5612/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5613/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5614/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5615/// amount.
5616static SDValue TryMULWIDECombine(SDNode *N,
5617 TargetLowering::DAGCombinerInfo &DCI) {
5618 EVT MulType = N->getValueType(0);
5619 if (MulType != MVT::i32 && MulType != MVT::i64) {
5620 return SDValue();
5621 }
5622
5623 SDLoc DL(N);
5624 unsigned OptSize = MulType.getSizeInBits() >> 1;
5625 SDValue LHS = N->getOperand(0);
5626 SDValue RHS = N->getOperand(1);
5627
5628 // Canonicalize the multiply so the constant (if any) is on the right
5629 if (N->getOpcode() == ISD::MUL) {
5630 if (isa<ConstantSDNode>(LHS)) {
5631 std::swap(LHS, RHS);
5632 }
5633 }
5634
5635 // If we have a SHL, determine the actual multiply amount
5636 if (N->getOpcode() == ISD::SHL) {
5637 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5638 if (!ShlRHS) {
5639 return SDValue();
5640 }
5641
5642 APInt ShiftAmt = ShlRHS->getAPIntValue();
5643 unsigned BitWidth = MulType.getSizeInBits();
5644 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5645 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5646 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5647 } else {
5648 return SDValue();
5649 }
5650 }
5651
5652 bool Signed;
5653 // Verify that our operands are demotable
5654 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5655 return SDValue();
5656 }
5657
5658 EVT DemotedVT;
5659 if (MulType == MVT::i32) {
5660 DemotedVT = MVT::i16;
5661 } else {
5662 DemotedVT = MVT::i32;
5663 }
5664
5665 // Truncate the operands to the correct size. Note that these are just for
5666 // type consistency and will (likely) be eliminated in later phases.
5667 SDValue TruncLHS =
5668 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5669 SDValue TruncRHS =
5670 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5671
5672 unsigned Opc;
5673 if (Signed) {
5674 Opc = NVPTXISD::MUL_WIDE_SIGNED;
5675 } else {
5676 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5677 }
5678
5679 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5680}
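// For example, with an i32 MulType the operands are demoted to i16, so roughly
//   (mul i32 (sext i16 a), (sext i16 b)) --> (NVPTXISD::MUL_WIDE_SIGNED a, b)
// which selects to PTX mul.wide.s16; a left shift by a constant is first
// rewritten as a multiply by a power of two so it takes the same path.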
5681
5682static bool isConstOne(const SDValue &Operand) {
5683 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5684 return Const && Const->getZExtValue() == 1;
5685}
5686
5687static SDValue matchMADConstOnePattern(SDValue Add) {
5688 if (Add->getOpcode() != ISD::ADD)
5689 return SDValue();
5690
5691 if (isConstOne(Add->getOperand(0)))
5692 return Add->getOperand(1);
5693
5694 if (isConstOne(Add->getOperand(1)))
5695 return Add->getOperand(0);
5696
5697 return SDValue();
5698}
5699
5700static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5701 TargetLowering::DAGCombinerInfo &DCI) {
5702
5703 if (SDValue Y = matchMADConstOnePattern(Add))
5704 return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);
5705
5706 return SDValue();
5707}
5708
5709static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5710 SDLoc DL,
5711 TargetLowering::DAGCombinerInfo &DCI) {
5712 if (Select->getOpcode() != ISD::SELECT)
5713 return SDValue();
5714
5715 SDValue Cond = Select->getOperand(0);
5716
5717 unsigned ConstOpNo;
5718 if (isConstOne(Select->getOperand(1)))
5719 ConstOpNo = 1;
5720 else if (isConstOne(Select->getOperand(2)))
5721 ConstOpNo = 2;
5722 else
5723 return SDValue();
5724
5725 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5726
5727 // Do not combine if the resulting sequence is not obviously profitable.
5728 if (!matchMADConstOnePattern(Y))
5729 return SDValue();
5730
5731 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5732
5733 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5734 (ConstOpNo == 1) ? X : NewMul,
5735 (ConstOpNo == 1) ? NewMul : X);
5736}
5737
5738static SDValue
5739PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5740 TargetLowering::DAGCombinerInfo &DCI) {
5741
5742 EVT VT = N0.getValueType();
5743 if (VT.isVector())
5744 return SDValue();
5745
5746 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5747 return SDValue();
5748
5749 SDLoc DL(N);
5750
5751 // (mul x, (add y, 1)) -> (mad x, y, x)
5752 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5753 return Res;
5754 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5755 return Res;
5756
5757 // (mul x, (select y, 1)) -> (select (mul x, y), x)
5758 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5759 return Res;
5760 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5761 return Res;
5762
5763 return SDValue();
5764}
5765
5766/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5767static SDValue PerformMULCombine(SDNode *N,
5768 TargetLowering::DAGCombinerInfo &DCI,
5769 CodeGenOptLevel OptLevel) {
5770 if (OptLevel == CodeGenOptLevel::None)
5771 return SDValue();
5772
5773 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5774 return Ret;
5775
5776 SDValue N0 = N->getOperand(0);
5777 SDValue N1 = N->getOperand(1);
5778 return PerformMULCombineWithOperands(N, N0, N1, DCI);
5779}
5780
5781/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5782static SDValue PerformSHLCombine(SDNode *N,
5783 TargetLowering::DAGCombinerInfo &DCI,
5784 CodeGenOptLevel OptLevel) {
5785 if (OptLevel > CodeGenOptLevel::None) {
5786 // Try mul.wide combining at OptLevel > 0
5787 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5788 return Ret;
5789 }
5790
5791 return SDValue();
5792}
5793
5794static SDValue PerformSETCCCombine(SDNode *N,
5795 TargetLowering::DAGCombinerInfo &DCI,
5796 unsigned int SmVersion) {
5797 EVT CCType = N->getValueType(0);
5798 SDValue A = N->getOperand(0);
5799 SDValue B = N->getOperand(1);
5800
5801 EVT AType = A.getValueType();
5802 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5803 return SDValue();
5804
5805 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5806 return SDValue();
5807
5808 SDLoc DL(N);
5809 // setp.f16x2 returns two scalar predicates, which we need to
5810 // convert back to v2i1. The returned result will be scalarized by
5811 // the legalizer, but the comparison will remain a single vector
5812 // instruction.
5813 SDValue CCNode = DCI.DAG.getNode(
5814 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5815 : NVPTXISD::SETP_BF16X2,
5816 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5817 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5818 CCNode.getValue(1));
5819}
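// For example, (v2i1 setcc v2f16 a, b, cc) becomes a single SETP_F16X2 node
// yielding two i1 results that are repacked with BUILD_VECTOR; the v2bf16 form
// additionally requires sm_90 or newer.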
5820
5821static SDValue PerformEXTRACTCombine(SDNode *N,
5822 TargetLowering::DAGCombinerInfo &DCI) {
5823 SDValue Vector = N->getOperand(0);
5824 SDLoc DL(N);
5825 EVT VectorVT = Vector.getValueType();
5826 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5827 IsPTXVectorType(VectorVT.getSimpleVT()))
5828 return SDValue(); // Native vector loads already combine nicely w/
5829 // extract_vector_elt.
5830 // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
5831 // handle them OK.
5832 if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5833 VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
5834 return SDValue();
5835
5836 // Don't mess with undef values as sra may be simplified to 0, not undef.
5837 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5838 return SDValue();
5839
5840 uint64_t VectorBits = VectorVT.getSizeInBits();
5841 // We only handle the types we can extract in-register.
5842 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5843 return SDValue();
5844
5845 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5846 // Index == 0 is handled by generic DAG combiner.
5847 if (!Index || Index->getZExtValue() == 0)
5848 return SDValue();
5849
5850 MVT IVT = MVT::getIntegerVT(VectorBits);
5851 EVT EltVT = VectorVT.getVectorElementType();
5852 EVT EltIVT = EltVT.changeTypeToInteger();
5853 uint64_t EltBits = EltVT.getScalarSizeInBits();
5854
5855 SDValue Result = DCI.DAG.getNode(
5856 ISD::TRUNCATE, DL, EltIVT,
5857 DCI.DAG.getNode(
5858 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5859 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5860
5861 // If element has non-integer type, bitcast it back to the expected type.
5862 if (EltVT != EltIVT)
5863 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5864 // Past legalizer, we may need to extend i8 -> i16 to match the register type.
5865 if (EltVT != N->getValueType(0))
5866 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5867
5868 return Result;
5869}
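// For example, extracting element 2 of a v4i8 becomes roughly
//   (trunc i8 (sra (bitcast i32 V), 16))
// keeping the extraction in ordinary integer registers.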
5870
5871static SDValue PerformVSELECTCombine(SDNode *N,
5872 TargetLowering::DAGCombinerInfo &DCI) {
5873 SDValue VA = N->getOperand(1);
5874 EVT VectorVT = VA.getValueType();
5875 if (VectorVT != MVT::v4i8)
5876 return SDValue();
5877
5878 // We need to split vselect into individual per-element operations. Because we
5879 // use BFE/BFI instruction for byte extraction/insertion, we do end up with
5880 // 32-bit values, so we may as well do comparison as i32 to avoid conversions
5881 // to/from i16 normally used for i8 values.
5882 SmallVector<SDValue, 4> E;
5883 SDLoc DL(N);
5884 SDValue VCond = N->getOperand(0);
5885 SDValue VB = N->getOperand(2);
5886 for (int I = 0; I < 4; ++I) {
5887 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
5888 DCI.DAG.getConstant(I, DL, MVT::i32));
5889 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
5890 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
5891 DCI.DAG.getConstant(I, DL, MVT::i32)),
5892 DL, MVT::i32);
5893 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
5894 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
5895 DCI.DAG.getConstant(I, DL, MVT::i32)),
5896 DL, MVT::i32);
5897 E.push_back(DCI.DAG.getAnyExtOrTrunc(
5898 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
5899 }
5900 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
5901}
5902
5903static SDValue PerformLOADCombine(SDNode *N,
5904 TargetLowering::DAGCombinerInfo &DCI) {
5905 SelectionDAG &DAG = DCI.DAG;
5906 LoadSDNode *LD = cast<LoadSDNode>(N);
5907
5908 // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
5909 // letting ReplaceLoadVector split it into smaller loads during legalization.
5910 // This is done at dag-combine1 time, so that vector operations with i8
5911 // elements can be optimised away instead of being needlessly split during
5912 // legalization, which involves storing to the stack and loading it back.
5913 EVT VT = N->getValueType(0);
5914 if (VT != MVT::v16i8)
5915 return SDValue();
5916
5917 SDLoc DL(N);
5918
5919 // Create a v4i32 vector load operation, effectively <4 x v4i8>.
5920 unsigned Opc = NVPTXISD::LoadV4;
5921 EVT NewVT = MVT::v4i32;
5922 EVT EltVT = NewVT.getVectorElementType();
5923 unsigned NumElts = NewVT.getVectorNumElements();
5924 EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
5925 SDVTList RetVTList = DAG.getVTList(RetVTs);
5926 SmallVector<SDValue, 8> Ops(N->ops());
5927 Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
5928 SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
5929 LD->getMemOperand());
5930 SDValue NewChain = NewLoad.getValue(NumElts);
5931
5932 // Create a vector of the same type returned by the original load.
5933 SmallVector<SDValue, 4> Elts;
5934 for (unsigned i = 0; i < NumElts; i++)
5935 Elts.push_back(NewLoad.getValue(i));
5936 return DCI.DAG.getMergeValues(
5937 {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
5938 NewChain},
5939 DL);
5940}
5941
5942SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
5943 DAGCombinerInfo &DCI) const {
5944 CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
5945 switch (N->getOpcode()) {
5946 default: break;
5947 case ISD::ADD:
5948 return PerformADDCombine(N, DCI, OptLevel);
5949 case ISD::FADD:
5950 return PerformFADDCombine(N, DCI, OptLevel);
5951 case ISD::MUL:
5952 return PerformMULCombine(N, DCI, OptLevel);
5953 case ISD::SHL:
5954 return PerformSHLCombine(N, DCI, OptLevel);
5955 case ISD::AND:
5956 return PerformANDCombine(N, DCI);
5957 case ISD::UREM:
5958 case ISD::SREM:
5959 return PerformREMCombine(N, DCI, OptLevel);
5960 case ISD::SETCC:
5961 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
5962 case ISD::LOAD:
5963 return PerformLOADCombine(N, DCI);
5964 case NVPTXISD::StoreRetval:
5965 case NVPTXISD::StoreRetvalV2:
5966 case NVPTXISD::StoreRetvalV4:
5967 return PerformStoreRetvalCombine(N);
5968 case NVPTXISD::StoreParam:
5969 case NVPTXISD::StoreParamV2:
5970 case NVPTXISD::StoreParamV4:
5971 return PerformStoreParamCombine(N);
5972 case ISD::EXTRACT_VECTOR_ELT:
5973 return PerformEXTRACTCombine(N, DCI);
5974 case ISD::VSELECT:
5975 return PerformVSELECTCombine(N, DCI);
5976 }
5977 return SDValue();
5978}
5979
5980/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
5981static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
5982 SmallVectorImpl<SDValue> &Results) {
5983 EVT ResVT = N->getValueType(0);
5984 SDLoc DL(N);
5985
5986 assert(ResVT.isVector() && "Vector load must have vector type");
5987
5988 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
5989 // legal. We can (and should) split that into 2 loads of <2 x double> here
5990 // but I'm leaving that as a TODO for now.
5991 assert(ResVT.isSimple() && "Can only handle simple types");
5992 switch (ResVT.getSimpleVT().SimpleTy) {
5993 default:
5994 return;
5995 case MVT::v2i8:
5996 case MVT::v2i16:
5997 case MVT::v2i32:
5998 case MVT::v2i64:
5999 case MVT::v2f16:
6000 case MVT::v2f32:
6001 case MVT::v2f64:
6002 case MVT::v4i8:
6003 case MVT::v4i16:
6004 case MVT::v4i32:
6005 case MVT::v4f16:
6006 case MVT::v4f32:
6007 case MVT::v8f16: // <4 x f16x2>
6008 case MVT::v8bf16: // <4 x bf16x2>
6009 case MVT::v8i16: // <4 x i16x2>
6010 // This is a "native" vector type
6011 break;
6012 }
6013
6014 LoadSDNode *LD = cast<LoadSDNode>(N);
6015
6016 Align Alignment = LD->getAlign();
6017 auto &TD = DAG.getDataLayout();
6018 Align PrefAlign =
6019 TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
6020 if (Alignment < PrefAlign) {
6021 // This load is not sufficiently aligned, so bail out and let this vector
6022 // load be scalarized. Note that we may still be able to emit smaller
6023 // vector loads. For example, if we are loading a <4 x float> with an
6024 // alignment of 8, this check will fail but the legalizer will try again
6025 // with 2 x <2 x float>, which will succeed with an alignment of 8.
6026 return;
6027 }
6028
6029 EVT EltVT = ResVT.getVectorElementType();
6030 unsigned NumElts = ResVT.getVectorNumElements();
6031
6032 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
6033 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6034 // loaded type to i16 and propagate the "real" type as the memory type.
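 // For example, a v2i8 load would be emitted as a LoadV2 whose two results are
 // i16 while the memory VT stays v2i8; NeedTrunc below truncates each result
 // back to i8.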
6035 bool NeedTrunc = false;
6036 if (EltVT.getSizeInBits() < 16) {
6037 EltVT = MVT::i16;
6038 NeedTrunc = true;
6039 }
6040
6041 unsigned Opcode = 0;
6042 SDVTList LdResVTs;
6043 bool Load16x2 = false;
6044
6045 switch (NumElts) {
6046 default:
6047 return;
6048 case 2:
6049 Opcode = NVPTXISD::LoadV2;
6050 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6051 break;
6052 case 4: {
6053 Opcode = NVPTXISD::LoadV4;
6054 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6055 LdResVTs = DAG.getVTList(ListVTs);
6056 break;
6057 }
6058 case 8: {
6059 // v8f16 is a special case. PTX doesn't have ld.v8.f16
6060 // instruction. Instead, we split the vector into v2f16 chunks and
6061 // load them with ld.v4.b32.
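 // For example, a v8f16 load becomes one NVPTXISD::LoadV4 returning four v2f16
 // values, which are split back into eight scalar elements further below.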
6062 assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
6063 Load16x2 = true;
6064 Opcode = NVPTXISD::LoadV4;
6065 EVT VVT;
6066 switch (EltVT.getSimpleVT().SimpleTy) {
6067 case MVT::f16:
6068 VVT = MVT::v2f16;
6069 break;
6070 case MVT::bf16:
6071 VVT = MVT::v2bf16;
6072 break;
6073 case MVT::i16:
6074 VVT = MVT::v2i16;
6075 break;
6076 default:
6077 llvm_unreachable("Unsupported v8 vector type.");
6078 }
6079 EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
6080 LdResVTs = DAG.getVTList(ListVTs);
6081 break;
6082 }
6083 }
6084
6085 // Copy regular operands
6086 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
6087
6088 // The select routine does not have access to the LoadSDNode instance, so
6089 // pass along the extension information
6090 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
6091
6092 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6093 LD->getMemoryVT(),
6094 LD->getMemOperand());
6095
6096 SmallVector<SDValue, 8> ScalarRes;
6097 if (Load16x2) {
6098 // Split v2f16 subvectors back into individual elements.
6099 NumElts /= 2;
6100 for (unsigned i = 0; i < NumElts; ++i) {
6101 SDValue SubVector = NewLD.getValue(i);
6102 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6103 DAG.getIntPtrConstant(0, DL));
6104 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6105 DAG.getIntPtrConstant(1, DL));
6106 ScalarRes.push_back(E0);
6107 ScalarRes.push_back(E1);
6108 }
6109 } else {
6110 for (unsigned i = 0; i < NumElts; ++i) {
6111 SDValue Res = NewLD.getValue(i);
6112 if (NeedTrunc)
6113 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6114 ScalarRes.push_back(Res);
6115 }
6116 }
6117
6118 SDValue LoadChain = NewLD.getValue(NumElts);
6119
6120 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
6121
6122 Results.push_back(BuildVec);
6123 Results.push_back(LoadChain);
6124}
6125
6126static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6127 SmallVectorImpl<SDValue> &Results) {
6128 SDValue Chain = N->getOperand(0);
6129 SDValue Intrin = N->getOperand(1);
6130 SDLoc DL(N);
6131
6132 // Get the intrinsic ID
6133 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6134 switch (IntrinNo) {
6135 default:
6136 return;
6137 case Intrinsic::nvvm_ldg_global_i:
6138 case Intrinsic::nvvm_ldg_global_f:
6139 case Intrinsic::nvvm_ldg_global_p:
6140 case Intrinsic::nvvm_ldu_global_i:
6141 case Intrinsic::nvvm_ldu_global_f:
6142 case Intrinsic::nvvm_ldu_global_p: {
6143 EVT ResVT = N->getValueType(0);
6144
6145 if (ResVT.isVector()) {
6146 // Vector LDG/LDU
6147
6148 unsigned NumElts = ResVT.getVectorNumElements();
6149 EVT EltVT = ResVT.getVectorElementType();
6150
6151 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6152 // legalization.
6153 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6154 // loaded type to i16 and propagate the "real" type as the memory type.
6155 bool NeedTrunc = false;
6156 if (EltVT.getSizeInBits() < 16) {
6157 EltVT = MVT::i16;
6158 NeedTrunc = true;
6159 }
6160
6161 unsigned Opcode = 0;
6162 SDVTList LdResVTs;
6163
6164 switch (NumElts) {
6165 default:
6166 return;
6167 case 2:
6168 switch (IntrinNo) {
6169 default:
6170 return;
6171 case Intrinsic::nvvm_ldg_global_i:
6172 case Intrinsic::nvvm_ldg_global_f:
6173 case Intrinsic::nvvm_ldg_global_p:
6174 Opcode = NVPTXISD::LDGV2;
6175 break;
6176 case Intrinsic::nvvm_ldu_global_i:
6177 case Intrinsic::nvvm_ldu_global_f:
6178 case Intrinsic::nvvm_ldu_global_p:
6179 Opcode = NVPTXISD::LDUV2;
6180 break;
6181 }
6182 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6183 break;
6184 case 4: {
6185 switch (IntrinNo) {
6186 default:
6187 return;
6188 case Intrinsic::nvvm_ldg_global_i:
6189 case Intrinsic::nvvm_ldg_global_f:
6190 case Intrinsic::nvvm_ldg_global_p:
6191 Opcode = NVPTXISD::LDGV4;
6192 break;
6193 case Intrinsic::nvvm_ldu_global_i:
6194 case Intrinsic::nvvm_ldu_global_f:
6195 case Intrinsic::nvvm_ldu_global_p:
6196 Opcode = NVPTXISD::LDUV4;
6197 break;
6198 }
6199 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6200 LdResVTs = DAG.getVTList(ListVTs);
6201 break;
6202 }
6203 }
6204
6205 SmallVector<SDValue, 8> OtherOps;
6206
6207 // Copy regular operands
6208
6209 OtherOps.push_back(Chain); // Chain
6210 // Skip operand 1 (intrinsic ID)
6211 // Others
6212 OtherOps.append(N->op_begin() + 2, N->op_end());
6213
6214 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6215
6216 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6217 MemSD->getMemoryVT(),
6218 MemSD->getMemOperand());
6219
6220 SmallVector<SDValue, 4> ScalarRes;
6221
6222 for (unsigned i = 0; i < NumElts; ++i) {
6223 SDValue Res = NewLD.getValue(i);
6224 if (NeedTrunc)
6225 Res =
6226 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6227 ScalarRes.push_back(Res);
6228 }
6229
6230 SDValue LoadChain = NewLD.getValue(NumElts);
6231
6232 SDValue BuildVec =
6233 DAG.getBuildVector(ResVT, DL, ScalarRes);
6234
6235 Results.push_back(BuildVec);
6236 Results.push_back(LoadChain);
6237 } else {
6238 // i8 LDG/LDU
6239 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6240 "Custom handling of non-i8 ldu/ldg?");
6241
6242 // Just copy all operands as-is
6243 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
6244
6245 // Force output to i16
6246 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6247
6248 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6249
6250 // We make sure the memory type is i8, which will be used during isel
6251 // to select the proper instruction.
6252 SDValue NewLD =
6253 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6254 MVT::i8, MemSD->getMemOperand());
6255
6256 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6257 NewLD.getValue(0)));
6258 Results.push_back(NewLD.getValue(1));
6259 }
6260 }
6261 }
6262}
6263
6264void NVPTXTargetLowering::ReplaceNodeResults(
6265 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6266 switch (N->getOpcode()) {
6267 default:
6268 report_fatal_error("Unhandled custom legalization");
6269 case ISD::LOAD:
6270 ReplaceLoadVector(N, DAG, Results);
6271 return;
6272 case ISD::INTRINSIC_W_CHAIN:
6273 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6274 return;
6275 }
6276}
6277
6278NVPTXTargetLowering::AtomicExpansionKind
6279NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6280 Type *Ty = AI->getValOperand()->getType();
6281
6282 if (AI->isFloatingPointOperation()) {
6283 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6284 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6285 STI.getPTXVersion() >= 63)
6286 return AtomicExpansionKind::None;
6287 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6288 STI.getPTXVersion() >= 78)
6289 return AtomicExpansionKind::None;
6290 if (Ty->isFloatTy())
6291 return AtomicExpansionKind::None;
6292 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6293 return AtomicExpansionKind::None;
6294 }
6295 return AtomicExpansionKind::CmpXChg;
6296 }
6297
6298 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6299 auto ITy = cast<llvm::IntegerType>(Ty);
6300
6301 switch (AI->getOperation()) {
6302 default:
6303 return AtomicExpansionKind::CmpXChg;
6304 case AtomicRMWInst::BinOp::And:
6305 case AtomicRMWInst::BinOp::Or:
6306 case AtomicRMWInst::BinOp::Xor:
6307 case AtomicRMWInst::BinOp::Xchg:
6308 switch (ITy->getBitWidth()) {
6309 case 8:
6310 case 16:
6311 return AtomicExpansionKind::CmpXChg;
6312 case 32:
6313 return AtomicExpansionKind::None;
6314 case 64:
6315 if (STI.hasAtomBitwise64())
6316 return AtomicExpansionKind::None;
6317 return AtomicExpansionKind::CmpXChg;
6318 default:
6319 llvm_unreachable("unsupported width encountered");
6320 }
6321 case AtomicRMWInst::BinOp::Add:
6322 case AtomicRMWInst::BinOp::Sub:
6323 case AtomicRMWInst::BinOp::Max:
6324 case AtomicRMWInst::BinOp::Min:
6325 case AtomicRMWInst::BinOp::UMax:
6326 case AtomicRMWInst::BinOp::UMin:
6327 switch (ITy->getBitWidth()) {
6328 case 8:
6329 case 16:
6330 return AtomicExpansionKind::CmpXChg;
6331 case 32:
6332 return AtomicExpansionKind::None;
6333 case 64:
6334 if (STI.hasAtomMinMax64())
6335 return AtomicExpansionKind::None;
6336 return AtomicExpansionKind::CmpXChg;
6337 default:
6338 llvm_unreachable("unsupported width encountered");
6339 }
6340 }
6341
6342 return AtomicExpansionKind::CmpXChg;
6343}
6344
6345// Pin NVPTXTargetObjectFile's vtables to this file.
6346NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
6347
6348MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6349 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6350 return getDataSection();
6351}
#define MAKE_CASE(V)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const LLT F32
amdgpu AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file contains the declarations of entities that describe floating point environment and related ...
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
Module.h This file contains the declarations for the Module class.
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
static SDValue PerformStoreParamCombine(SDNode *N)
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic)
static bool Is16bitsType(MVT VT)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static bool IsTypePassedAsArray(const Type *Ty)
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static unsigned getOpcForTextureInstr(unsigned Intrinsic)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformStoreRetvalCombine(SDNode *N)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, std::size_t Back)
static bool adjustElementType(EVT &ElementType)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue matchMADConstOnePattern(SDValue Add)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
ParamVectorizationFlags
@ PVF_FIRST
@ PVF_SCALAR
@ PVF_INNER
@ PVF_LAST
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
OperandSignedness
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
static std::atomic< unsigned > GlobalUniqueCallSite
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
unsigned SmVersion
Definition: NVVMReflect.cpp:81
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(VerifyEach)
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:77
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:414
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1109
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:411
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1216
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
@ Add
*p = old + v
Definition: Instructions.h:711
@ FAdd
*p = old + v
Definition: Instructions.h:732
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:725
@ Or
*p = old | v
Definition: Instructions.h:719
@ Sub
*p = old - v
Definition: Instructions.h:713
@ And
*p = old & v
Definition: Instructions.h:715
@ Xor
*p = old ^ v
Definition: Instructions.h:721
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:723
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:729
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:727
bool isFloatingPointOperation() const
Definition: Instructions.h:863
BinOp getOperation() const
Definition: Instructions.h:786
Value * getValOperand()
Definition: Instructions.h:855
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Definition: Attributes.h:805
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
This class represents a function call, abstracting a target machine's calling convention.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:600
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:212
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition: MCSection.h:36
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
Machine Value Type.
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
static auto fixedlen_vector_valuetypes()
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getPTXVersion() const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool hasBF16Math() const
bool allowFP16Math() const
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
bool useF32FTZ(const MachineFunction &MF) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool allowUnsafeFPMath(MachineFunction &MF) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
iterator_range< use_iterator > uses()
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the type of the node type undefined.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition: SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:736
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:565
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:492
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:486
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:481
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVMContext * getContext() const
Definition: SelectionDAG.h:499
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:574
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
Class to represent struct types.
Definition: DerivedTypes.h:216
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
@ VoidTyID
type with no size
Definition: Type.h:63
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:295
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
StringRef save(const char *S)
Definition: StringSaver.h:52
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:764
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:728
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1176
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1052
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:797
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:491
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:804
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:551
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:702
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:917
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:788
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1209
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:944
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1098
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1073
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1077
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:741
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1172
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:659
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:719
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:608
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:581
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:543
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:794
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:756
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:986
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition: ISDOpcodes.h:366
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1062
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:812
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:682
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:902
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:750
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:936
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1005
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:850
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1203
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:694
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1229
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:532
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:883
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:800
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1167
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:777
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:501
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:523
bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDEF.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
static bool isIndirectCall(const MachineInstr &MI)
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
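A small hedged example, assuming N is an SDNode*:
// True when every operand of N carries a floating-point value.
bool AllFP = llvm::all_of(N->op_values(), [](SDValue V) {
  return V.getValueType().isFloatingPoint();
});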
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool Isv2x16VT(EVT VT)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
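A short sketch, assuming OutVals is a range of SDValues (as in the call-lowering structures listed further below):
for (const auto &En : llvm::enumerate(OutVals)) {
  unsigned Idx = En.index();   // 0-based position
  SDValue Val = En.value();    // element at that position
  // ... use Idx and Val ...
}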
@ ADDRESS_SPACE_LOCAL
Definition: NVPTXBaseInfo.h:26
@ ADDRESS_SPACE_PARAM
Definition: NVPTXBaseInfo.h:29
MaybeAlign getAlign(const Function &F, unsigned Index)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
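A quick arithmetic illustration:
uint64_t A = PowerOf2Ceil(6);   // 8: next power of two at or above 6
uint64_t B = PowerOf2Ceil(8);   // 8: already a power of two, unchanged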
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
unsigned promoteScalarArgumentSize(unsigned size)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
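Two worked values:
uint64_t X = alignTo(10, Align(8));   // 16: rounded up to the next multiple of 8
uint64_t Y = alignTo(16, Align(8));   // 16: already a multiple of 8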
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
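A hedged sketch of the typical call pattern, assuming TLI, DL, and RetTy (an IR type) are in scope:
SmallVector<EVT, 4> ValueVTs;
SmallVector<TypeSize, 4> Offsets;
ComputeValueVTs(TLI, DL, RetTy, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
// For a struct such as {i32, float} this typically yields MVT::i32 at offset 0
// and MVT::f32 at offset 4 (exact offsets depend on the data layout).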
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
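A worked value, the alignment still guaranteed 4 bytes past a 16-byte-aligned base:
Align AtOffset = commonAlignment(Align(16), 4);   // Align(4)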
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:271
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)