NVPTXISelLowering.cpp
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/StringRef.h"
36#include "llvm/IR/Argument.h"
37#include "llvm/IR/Attributes.h"
38#include "llvm/IR/Constants.h"
39#include "llvm/IR/DataLayout.h"
42#include "llvm/IR/FPEnv.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/GlobalValue.h"
45#include "llvm/IR/Instruction.h"
47#include "llvm/IR/IntrinsicsNVPTX.h"
48#include "llvm/IR/Module.h"
49#include "llvm/IR/Type.h"
50#include "llvm/IR/Value.h"
59#include <algorithm>
60#include <cassert>
61#include <cmath>
62#include <cstdint>
63#include <iterator>
64#include <optional>
65#include <sstream>
66#include <string>
67#include <utility>
68#include <vector>
69
70#define DEBUG_TYPE "nvptx-lower"
71
72using namespace llvm;
73
74static std::atomic<unsigned> GlobalUniqueCallSite;
75
77 "nvptx-sched4reg",
78 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
79
81 "nvptx-fma-level", cl::Hidden,
82 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
83 " 1: do it 2: do it aggressively"),
84 cl::init(2));
85
87 "nvptx-prec-divf32", cl::Hidden,
88 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
89 " IEEE Compliant F32 div.rnd if available."),
90 cl::init(2));
91
93 "nvptx-prec-sqrtf32", cl::Hidden,
94 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
95 cl::init(true));
96
98 "nvptx-force-min-byval-param-align", cl::Hidden,
99 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
100 " params of device functions."),
101 cl::init(false));
102
103int NVPTXTargetLowering::getDivF32Level() const {
104 if (UsePrecDivF32.getNumOccurrences() > 0) {
105 // If nvptx-prec-divf32=N is used on the command-line, always honor it
106 return UsePrecDivF32;
107 } else {
108 // Otherwise, use div.approx if fast math is enabled
109 if (getTargetMachine().Options.UnsafeFPMath)
110 return 0;
111 else
112 return 2;
113 }
114}
115
116bool NVPTXTargetLowering::usePrecSqrtF32() const {
117 if (UsePrecSqrtF32.getNumOccurrences() > 0) {
118 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
119 return UsePrecSqrtF32;
120 } else {
121 // Otherwise, use sqrt.approx if fast math is enabled
122 return !getTargetMachine().Options.UnsafeFPMath;
123 }
124}
125
126bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
127 return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
128 DenormalMode::PreserveSign;
129}
130
131static bool IsPTXVectorType(MVT VT) {
132 switch (VT.SimpleTy) {
133 default:
134 return false;
135 case MVT::v2i1:
136 case MVT::v4i1:
137 case MVT::v2i8:
138 case MVT::v4i8:
139 case MVT::v2i16:
140 case MVT::v4i16:
141 case MVT::v8i16: // <4 x i16x2>
142 case MVT::v2i32:
143 case MVT::v4i32:
144 case MVT::v2i64:
145 case MVT::v2f16:
146 case MVT::v4f16:
147 case MVT::v8f16: // <4 x f16x2>
148 case MVT::v2bf16:
149 case MVT::v4bf16:
150 case MVT::v8bf16: // <4 x bf16x2>
151 case MVT::v2f32:
152 case MVT::v4f32:
153 case MVT::v2f64:
154 return true;
155 }
156}
157
158static bool Is16bitsType(MVT VT) {
159 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
160 VT.SimpleTy == MVT::i16);
161}
162
163/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
164/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
165/// into their primitive components.
166/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
167/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
168/// LowerCall, and LowerReturn.
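/// For example (illustrative): an i128 becomes {i64, i64} at offsets {0, 8};
/// a <4 x float> becomes {f32, f32, f32, f32} at offsets {0, 4, 8, 12}; and a
/// <4 x half> becomes {v2f16, v2f16} at offsets {0, 4}, matching how f16
/// vectors appear in the Ins/Outs arrays.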
169static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
170 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
171 SmallVectorImpl<uint64_t> *Offsets = nullptr,
172 uint64_t StartingOffset = 0) {
173 SmallVector<EVT, 16> TempVTs;
174 SmallVector<uint64_t, 16> TempOffsets;
175
176 // Special case for i128 - decompose to (i64, i64)
177 if (Ty->isIntegerTy(128)) {
178 ValueVTs.push_back(EVT(MVT::i64));
179 ValueVTs.push_back(EVT(MVT::i64));
180
181 if (Offsets) {
182 Offsets->push_back(StartingOffset + 0);
183 Offsets->push_back(StartingOffset + 8);
184 }
185
186 return;
187 }
188
189 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
190 if (StructType *STy = dyn_cast<StructType>(Ty)) {
191 auto const *SL = DL.getStructLayout(STy);
192 auto ElementNum = 0;
193 for(auto *EI : STy->elements()) {
194 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
195 StartingOffset + SL->getElementOffset(ElementNum));
196 ++ElementNum;
197 }
198 return;
199 }
200
201 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
202 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
203 EVT VT = TempVTs[i];
204 uint64_t Off = TempOffsets[i];
205 // Split vectors into individual elements, except for v2f16, which
206 // we will pass as a single scalar.
207 if (VT.isVector()) {
208 unsigned NumElts = VT.getVectorNumElements();
209 EVT EltVT = VT.getVectorElementType();
210 // Vectors with an even number of f16 elements will be passed to
211 // us as an array of v2f16/v2bf16 elements. We must match this so we
212 // stay in sync with Ins/Outs.
213 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
214 switch (EltVT.getSimpleVT().SimpleTy) {
215 case MVT::f16:
216 EltVT = MVT::v2f16;
217 break;
218 case MVT::bf16:
219 EltVT = MVT::v2bf16;
220 break;
221 case MVT::i16:
222 EltVT = MVT::v2i16;
223 break;
224 default:
225 llvm_unreachable("Unexpected type");
226 }
227 NumElts /= 2;
228 } else if (EltVT.getSimpleVT() == MVT::i8 &&
229 (NumElts % 4 == 0 || NumElts == 3)) {
230 // v*i8 are formally lowered as v4i8
231 EltVT = MVT::v4i8;
232 NumElts = (NumElts + 3) / 4;
233 }
234 for (unsigned j = 0; j != NumElts; ++j) {
235 ValueVTs.push_back(EltVT);
236 if (Offsets)
237 Offsets->push_back(Off + j * EltVT.getStoreSize());
238 }
239 } else {
240 ValueVTs.push_back(VT);
241 if (Offsets)
242 Offsets->push_back(Off);
243 }
244 }
245}
246
247/// PromoteScalarIntegerPTX
248/// Used to make sure the arguments/returns are suitable for passing
249/// and promote them to a larger size if they're not.
250///
251/// The promoted type is placed in \p PromotedVT if the function returns true.
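///
/// For example (illustrative): an i24 is promoted to i32 (PowerOf2Ceil of 24
/// is 32) and the function returns true; for an i32, *PromotedVT is set to
/// i32 but the function returns false because no promotion is needed.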
252static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
253 if (VT.isScalarInteger()) {
254 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
255 default:
257 "Promotion is not suitable for scalars of size larger than 64-bits");
258 case 1:
259 *PromotedVT = MVT::i1;
260 break;
261 case 2:
262 case 4:
263 case 8:
264 *PromotedVT = MVT::i8;
265 break;
266 case 16:
267 *PromotedVT = MVT::i16;
268 break;
269 case 32:
270 *PromotedVT = MVT::i32;
271 break;
272 case 64:
273 *PromotedVT = MVT::i64;
274 break;
275 }
276 return EVT(*PromotedVT) != VT;
277 }
278 return false;
279}
280
281// Check whether we can merge loads/stores of some of the pieces of a
282// flattened function parameter or return value into a single vector
283// load/store.
284//
285// The flattened parameter is represented as a list of EVTs and
286// offsets, and the whole structure is aligned to ParamAlignment. This
287// function determines whether we can load/store pieces of the
288// parameter starting at index Idx using a single vectorized op of
289// size AccessSize. If so, it returns the number of param pieces
290// covered by the vector op. Otherwise, it returns 1.
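//
// For example (illustrative): with ValueVTs = {f32, f32, f32, f32}, Offsets =
// {0, 4, 8, 12} and ParamAlignment = 16, a query at Idx = 0 with AccessSize =
// 16 returns 4 (a single 4-element access); with ParamAlignment = 4 the same
// query returns 1 and the pieces are accessed as scalars.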
291static unsigned CanMergeParamLoadStoresStartingAt(
292 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
293 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
294
295 // Can't vectorize if param alignment is not sufficient.
296 if (ParamAlignment < AccessSize)
297 return 1;
298 // Can't vectorize if offset is not aligned.
299 if (Offsets[Idx] & (AccessSize - 1))
300 return 1;
301
302 EVT EltVT = ValueVTs[Idx];
303 unsigned EltSize = EltVT.getStoreSize();
304
305 // Element is too large to vectorize.
306 if (EltSize >= AccessSize)
307 return 1;
308
309 unsigned NumElts = AccessSize / EltSize;
310 // Can't vectorize if AccessSize is not a multiple of EltSize.
311 if (AccessSize != EltSize * NumElts)
312 return 1;
313
314 // We don't have enough elements to vectorize.
315 if (Idx + NumElts > ValueVTs.size())
316 return 1;
317
318 // PTX ISA can only deal with 2- and 4-element vector ops.
319 if (NumElts != 4 && NumElts != 2)
320 return 1;
321
322 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
323 // Types do not match.
324 if (ValueVTs[j] != EltVT)
325 return 1;
326
327 // Elements are not contiguous.
328 if (Offsets[j] - Offsets[j - 1] != EltSize)
329 return 1;
330 }
331 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
332 return NumElts;
333}
334
335// Flags for tracking per-element vectorization state of loads/stores
336// of a flattened function parameter or return value.
337enum ParamVectorizationFlags {
338 PVF_INNER = 0x0, // Middle elements of a vector.
339 PVF_FIRST = 0x1, // First element of the vector.
340 PVF_LAST = 0x2, // Last element of the vector.
341 // Scalar is effectively a 1-element vector.
342 PVF_SCALAR = PVF_FIRST | PVF_LAST,
343};
344
345// Computes whether and how we can vectorize the loads/stores of a
346// flattened function parameter or return value.
347//
348// The flattened parameter is represented as the list of ValueVTs and
349// Offsets, and is aligned to ParamAlignment bytes. We return a vector
350// of the same size as ValueVTs indicating how each piece should be
351// loaded/stored (i.e. as a scalar, or as part of a vector
352// load/store).
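//
// For example (illustrative): four f32 pieces at offsets {0, 4, 8, 12} with
// 16-byte alignment come back as {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}
// (one 4-element access), while the same pieces with only 4-byte alignment
// come back as four PVF_SCALAR entries.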
353static SmallVector<ParamVectorizationFlags, 16>
354VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
355 const SmallVectorImpl<uint64_t> &Offsets,
356 Align ParamAlignment, bool IsVAArg = false) {
357 // Set vector size to match ValueVTs and mark all elements as
358 // scalars by default.
359 SmallVector<ParamVectorizationFlags, 16> VectorInfo;
360 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
361
362 if (IsVAArg)
363 return VectorInfo;
364
365 // Check what we can vectorize using 128/64/32-bit accesses.
366 for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
367 // Skip elements we've already processed.
368 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
369 for (unsigned AccessSize : {16, 8, 4, 2}) {
370 unsigned NumElts = CanMergeParamLoadStoresStartingAt(
371 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
372 // Mark vectorized elements.
373 switch (NumElts) {
374 default:
375 llvm_unreachable("Unexpected return value");
376 case 1:
377 // Can't vectorize using this size, try next smaller size.
378 continue;
379 case 2:
380 assert(I + 1 < E && "Not enough elements.");
381 VectorInfo[I] = PVF_FIRST;
382 VectorInfo[I + 1] = PVF_LAST;
383 I += 1;
384 break;
385 case 4:
386 assert(I + 3 < E && "Not enough elements.");
387 VectorInfo[I] = PVF_FIRST;
388 VectorInfo[I + 1] = PVF_INNER;
389 VectorInfo[I + 2] = PVF_INNER;
390 VectorInfo[I + 3] = PVF_LAST;
391 I += 3;
392 break;
393 }
394 // Break out of the inner loop because we've already succeeded
395 // using largest possible AccessSize.
396 break;
397 }
398 }
399 return VectorInfo;
400}
401
402// NVPTXTargetLowering Constructor.
403NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
404 const NVPTXSubtarget &STI)
405 : TargetLowering(TM), nvTM(&TM), STI(STI) {
406 // Always lower memset, memcpy, and memmove intrinsics to load/store
407 // instructions, rather
408 // than generating calls to memset, memcpy or memmove.
409 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
410 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
411 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;
412
415
416 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
417 // condition branches.
418 setJumpIsExpensive(true);
419
420 // Wide divides are _very_ slow. Try to reduce the width of the divide if
421 // possible.
422 addBypassSlowDiv(64, 32);
423
424 // By default, use the Source scheduling
425 if (sched4reg)
427 else
429
430 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
431 LegalizeAction NoF16Action) {
432 bool IsOpSupported = STI.allowFP16Math();
433 switch (Op) {
434 // Several FP16 instructions are available on sm_80 only.
435 case ISD::FMINNUM:
436 case ISD::FMAXNUM:
439 case ISD::FMAXIMUM:
440 case ISD::FMINIMUM:
441 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
442 break;
443 }
444 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
445 };
446
447 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
448 LegalizeAction NoBF16Action) {
449 bool IsOpSupported = STI.hasBF16Math();
450 switch (Op) {
451 // Several BF16 instructions are available on sm_90 only.
452 case ISD::FADD:
453 case ISD::FMUL:
454 case ISD::FSUB:
455 case ISD::SELECT:
456 case ISD::SELECT_CC:
457 case ISD::SETCC:
458 case ISD::FEXP2:
459 case ISD::FCEIL:
460 case ISD::FFLOOR:
461 case ISD::FNEARBYINT:
462 case ISD::FRINT:
463 case ISD::FROUNDEVEN:
464 case ISD::FTRUNC:
465 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
466 break;
467 // Several BF16 instructions are available on sm_80 only.
468 case ISD::FMINNUM:
469 case ISD::FMAXNUM:
472 case ISD::FMAXIMUM:
473 case ISD::FMINIMUM:
474 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
475 break;
476 }
478 Op, VT, IsOpSupported ? Action : NoBF16Action);
479 };
480
481 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
482 LegalizeAction NoI16x2Action) {
483 bool IsOpSupported = false;
484 // instructions are available on sm_90 only
485 switch (Op) {
486 case ISD::ADD:
487 case ISD::SMAX:
488 case ISD::SMIN:
489 case ISD::UMIN:
490 case ISD::UMAX:
491 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
492 break;
493 }
494 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
495 };
496
497 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
498 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
499 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
500 addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
501 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
502 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
503 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
504 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
505 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
506 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
507 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
508 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
509
510 // Conversion to/from FP16/FP16x2 is always legal.
515
517 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
519
520 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
521 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
522
523 // Conversion to/from BFP16/BFP16x2 is always legal.
528
529 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
530 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
531 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
532 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
533
534 // Conversion to/from i16/i16x2 is always legal.
539
544 // Only logical ops can be done on v4i8 directly, others must be done
545 // elementwise.
562 MVT::v4i8, Expand);
563
564 // Operations not directly supported by NVPTX.
565 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
566 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
567 MVT::i32, MVT::i64}) {
570 }
571
572 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
573 // For others we will expand to a SHL/SRA pair.
580
587
590
591 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
592 // that don't have h/w rotation we lower them to multi-instruction assembly.
593 // See ROT*_sw in NVPTXIntrInfo.td
598
600 setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
602 setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
606
609
612
613 // We want to legalize constant related memmove and memcpy
614 // intrinsics.
616
617 // Turn FP extload into load/fpextend
618 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
619 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
620 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
621 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
622 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
623 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
624 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
625 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
626 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
627 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
628 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
629 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
630 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
631 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
632 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
633 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
634 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
635 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
636 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
637 // Turn FP truncstore into trunc + store.
638 // FIXME: vector types should also be expanded
639 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
640 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
641 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
642 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
643 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
644
645 // PTX does not support load / store predicate registers
648
649 for (MVT VT : MVT::integer_valuetypes()) {
653 setTruncStoreAction(VT, MVT::i1, Expand);
654 }
655
656 // expand extload of vector of integers.
658 MVT::v2i8, Expand);
659 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
660
661 // This is legal in NVPTX
666
669
670 // TRAP can be lowered to PTX trap
671 setOperationAction(ISD::TRAP, MVT::Other, Legal);
672 // DEBUGTRAP can be lowered to PTX brkpt
674
675 // Register custom handling for vector loads/stores
677 if (IsPTXVectorType(VT)) {
681 }
682 }
683
684 // Support varargs.
689
690 // Custom handling for i8 intrinsics
692
693 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
699
702 }
703
704 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
705 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
706 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
707 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
708 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
709 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
710 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
711
712 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
713 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
714 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
715 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
716 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
717 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
718
719 // Other arithmetic and logic ops are unsupported.
723 MVT::v2i16, Expand);
724
729 if (STI.getPTXVersion() >= 43) {
734 }
735
737 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
740
741 // PTX does not directly support SELP of i1, so promote to i32 first
743
744 // PTX cannot multiply two i64s in a single instruction.
747
748 // We have some custom DAG combine patterns for these nodes
751 ISD::VSELECT});
752
753 // setcc for f16x2 and bf16x2 needs special handling to prevent
754 // legalizer's attempt to scalarize it due to v2i1 not being legal.
755 if (STI.allowFP16Math() || STI.hasBF16Math())
757
758 // Promote fp16 arithmetic if fp16 hardware isn't available or the
759 // user passed --nvptx-no-fp16-math. The flag is useful because,
760 // although sm_53+ GPUs have some sort of FP16 support in
761 // hardware, only sm_53 and sm_60 have full implementation. Others
762 // only have a token amount of hardware and are likely to run faster
763 // by using fp32 units instead.
764 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
765 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
766 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
767 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
768 // bf16 must be promoted to f32.
769 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
770 if (getOperationAction(Op, MVT::bf16) == Promote)
771 AddPromotedToType(Op, MVT::bf16, MVT::f32);
772 }
773
774 // f16/f16x2 neg was introduced in PTX 60, SM_53.
775 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
776 STI.getPTXVersion() >= 60 &&
777 STI.allowFP16Math();
778 for (const auto &VT : {MVT::f16, MVT::v2f16})
780 IsFP16FP16x2NegAvailable ? Legal : Expand);
781
782 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
783 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
784 // (would be) Library functions.
785
786 // These map to conversion instructions for scalar FP types.
787 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
789 setOperationAction(Op, MVT::f16, Legal);
790 setOperationAction(Op, MVT::f32, Legal);
791 setOperationAction(Op, MVT::f64, Legal);
792 setOperationAction(Op, MVT::v2f16, Expand);
793 setOperationAction(Op, MVT::v2bf16, Expand);
794 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
795 if (getOperationAction(Op, MVT::bf16) == Promote)
796 AddPromotedToType(Op, MVT::bf16, MVT::f32);
797 }
798
799 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
801 }
802 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
803 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
806 }
807 }
808
809 // sm_80 only has conversions between f32 and bf16. Custom lower all other
810 // bf16 conversions.
811 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
812 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
815 VT, Custom);
816 }
819 MVT::bf16, Custom);
820 }
821
828 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
829
830 // 'Expand' implements FCOPYSIGN without calling an external library.
837
838 // These map to corresponding instructions for f32/f64. f16 must be
839 // promoted to f32. v2f16 is expanded to f16, which is then promoted
840 // to f32.
841 for (const auto &Op :
843 setOperationAction(Op, MVT::f16, Promote);
844 setOperationAction(Op, MVT::f32, Legal);
845 setOperationAction(Op, MVT::f64, Legal);
846 setOperationAction(Op, MVT::v2f16, Expand);
847 setOperationAction(Op, MVT::v2bf16, Expand);
848 setOperationAction(Op, MVT::bf16, Promote);
849 AddPromotedToType(Op, MVT::bf16, MVT::f32);
850 }
851 for (const auto &Op : {ISD::FABS}) {
852 setOperationAction(Op, MVT::f16, Promote);
853 setOperationAction(Op, MVT::f32, Legal);
854 setOperationAction(Op, MVT::f64, Legal);
855 setOperationAction(Op, MVT::v2f16, Expand);
856 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
857 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
858 if (getOperationAction(Op, MVT::bf16) == Promote)
859 AddPromotedToType(Op, MVT::bf16, MVT::f32);
860 }
861
862 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
863 setOperationAction(Op, MVT::f32, Legal);
864 setOperationAction(Op, MVT::f64, Legal);
865 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
866 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
867 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
868 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
869 if (getOperationAction(Op, MVT::bf16) == Promote)
870 AddPromotedToType(Op, MVT::bf16, MVT::f32);
871 }
872 bool SupportsF32MinMaxNaN =
873 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
874 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
875 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
876 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
877 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
878 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
879 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
880 }
881
882 // Custom lowering for inline asm with 128-bit operands
885
886 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
887 // No FPOW or FREM in PTX.
888
889 // Now deduce the information based on the above mentioned
890 // actions
892
896}
897
898const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
899
900#define MAKE_CASE(V) \
901 case V: \
902 return #V;
903
904 switch ((NVPTXISD::NodeType)Opcode) {
906 break;
907
1054
1145
1157
1169
1181
1193
1205
1217
1229
1241
1253
1265
1277
1289
1301
1313
1325 }
1326 return nullptr;
1327
1328#undef MAKE_CASE
1329}
1330
1331TargetLoweringBase::LegalizeTypeAction
1332NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1333 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1334 VT.getScalarType() == MVT::i1)
1335 return TypeSplitVector;
1336 if (Isv2x16VT(VT))
1337 return TypeLegal;
1338 return TargetLoweringBase::getPreferredVectorAction(VT);
1339}
1340
1341SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1342 int Enabled, int &ExtraSteps,
1343 bool &UseOneConst,
1344 bool Reciprocal) const {
1345 if (!(Enabled == ReciprocalEstimate::Enabled ||
1346 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1347 return SDValue();
1348
1349 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1350 ExtraSteps = 0;
1351
1352 SDLoc DL(Operand);
1353 EVT VT = Operand.getValueType();
1354 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1355
1356 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1357 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1358 DAG.getConstant(IID, DL, MVT::i32), Operand);
1359 };
1360
1361 // The sqrt and rsqrt refinement processes assume we always start out with an
1362 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1363 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1364 // any refinement, we must return a regular sqrt.
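 //
 // For example (illustrative): an f32 sqrt with no refinement maps to
 // sqrt.approx.f32 (or sqrt.approx.ftz.f32 under FTZ), an f32 rsqrt maps to
 // rsqrt.approx.f32, and an f64 sqrt is emitted as rcp.approx.ftz.f64 applied
 // to rsqrt.approx.f64.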
1365 if (Reciprocal || ExtraSteps > 0) {
1366 if (VT == MVT::f32)
1367 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1368 : Intrinsic::nvvm_rsqrt_approx_f);
1369 else if (VT == MVT::f64)
1370 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1371 else
1372 return SDValue();
1373 } else {
1374 if (VT == MVT::f32)
1375 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1376 : Intrinsic::nvvm_sqrt_approx_f);
1377 else {
1378 // There's no sqrt.approx.f64 instruction, so we emit
1379 // reciprocal(rsqrt(x)). This is faster than
1380 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1381 // x * rsqrt(x).)
1382 return DAG.getNode(
1383 ISD::INTRINSIC_WO_CHAIN, DL, VT,
1384 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1385 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1386 }
1387 }
1388}
1389
1390SDValue
1391NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1392 SDLoc dl(Op);
1393 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1394 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1395 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1396 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1397}
1398
1399static bool IsTypePassedAsArray(const Type *Ty) {
1400 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
1401 Ty->isHalfTy() || Ty->isBFloatTy();
1402}
1403
1404std::string NVPTXTargetLowering::getPrototype(
1405 const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1406 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1407 std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1408 const CallBase &CB, unsigned UniqueCallSite) const {
1409 auto PtrVT = getPointerTy(DL);
1410
1411 bool isABI = (STI.getSmVersion() >= 20);
1412 assert(isABI && "Non-ABI compilation is not supported");
1413 if (!isABI)
1414 return "";
1415
1416 std::string Prototype;
1417 raw_string_ostream O(Prototype);
1418 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1419
1420 if (retTy->getTypeID() == Type::VoidTyID) {
1421 O << "()";
1422 } else {
1423 O << "(";
1424 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1425 !IsTypePassedAsArray(retTy)) {
1426 unsigned size = 0;
1427 if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1428 size = ITy->getBitWidth();
1429 } else {
1430 assert(retTy->isFloatingPointTy() &&
1431 "Floating point type expected here");
1432 size = retTy->getPrimitiveSizeInBits();
1433 }
1434 // PTX ABI requires all scalar return values to be at least 32
1435 // bits in size. fp16 normally uses .b16 as its storage type in
1436 // PTX, so its size must be adjusted here, too.
1437 size = promoteScalarArgumentSize(size);
1438
1439 O << ".param .b" << size << " _";
1440 } else if (isa<PointerType>(retTy)) {
1441 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1442 } else if (IsTypePassedAsArray(retTy)) {
1443 O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1444 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
1445 } else {
1446 llvm_unreachable("Unknown return type");
1447 }
1448 O << ") ";
1449 }
1450 O << "_ (";
1451
1452 bool first = true;
1453
1454 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1455 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1456 Type *Ty = Args[i].Ty;
1457 if (!first) {
1458 O << ", ";
1459 }
1460 first = false;
1461
1462 if (!Outs[OIdx].Flags.isByVal()) {
1463 if (IsTypePassedAsArray(Ty)) {
1464 Align ParamAlign =
1465 getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);
1466 O << ".param .align " << ParamAlign.value() << " .b8 ";
1467 O << "_";
1468 O << "[" << DL.getTypeAllocSize(Ty) << "]";
1469 // update the index for Outs
1470 SmallVector<EVT, 16> vtparts;
1471 ComputeValueVTs(*this, DL, Ty, vtparts);
1472 if (unsigned len = vtparts.size())
1473 OIdx += len - 1;
1474 continue;
1475 }
1476 // i8 types in IR will be i16 types in SDAG
1477 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1478 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1479 "type mismatch between callee prototype and arguments");
1480 // scalar type
1481 unsigned sz = 0;
1482 if (isa<IntegerType>(Ty)) {
1483 sz = cast<IntegerType>(Ty)->getBitWidth();
1484 sz = promoteScalarArgumentSize(sz);
1485 } else if (isa<PointerType>(Ty)) {
1486 sz = PtrVT.getSizeInBits();
1487 } else {
1488 sz = Ty->getPrimitiveSizeInBits();
1489 }
1490 O << ".param .b" << sz << " ";
1491 O << "_";
1492 continue;
1493 }
1494
1495 // Indirect calls need strict ABI alignment so we disable optimizations by
1496 // not providing a function to optimize.
1497 Type *ETy = Args[i].IndirectType;
1498 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1499 Align ParamByValAlign =
1500 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1501
1502 O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1503 O << "_";
1504 O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1505 }
1506
1507 if (VAInfo)
1508 O << (first ? "" : ",") << " .param .align " << VAInfo->second
1509 << " .b8 _[]\n";
1510 O << ")";
1512 O << " .noreturn";
1513 O << ";";
1514
1515 return Prototype;
1516}
1517
1518Align NVPTXTargetLowering::getFunctionArgumentAlignment(
1519 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1520 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1521}
1522
1523Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1524 unsigned Idx,
1525 const DataLayout &DL) const {
1526 if (!CB) {
1527 // CallSite is zero, fallback to ABI type alignment
1528 return DL.getABITypeAlign(Ty);
1529 }
1530
1531 const Function *DirectCallee = CB->getCalledFunction();
1532
1533 if (!DirectCallee) {
1534 // We don't have a direct function symbol, but that may be because of
1535 // constant cast instructions in the call.
1536
1537 // With bitcast'd call targets, the instruction will be the call
1538 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1539 // Check if we have call alignment metadata
1540 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1541 return StackAlign.value();
1542 }
1543 DirectCallee = getMaybeBitcastedCallee(CB);
1544 }
1545
1546 // Check for function alignment information if we found that the
1547 // ultimate target is a Function
1548 if (DirectCallee)
1549 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1550
1551 // Call is indirect, fall back to the ABI type alignment
1552 return DL.getABITypeAlign(Ty);
1553}
1554
1555static bool adjustElementType(EVT &ElementType) {
1556 switch (ElementType.getSimpleVT().SimpleTy) {
1557 default:
1558 return false;
1559 case MVT::f16:
1560 case MVT::bf16:
1561 ElementType = MVT::i16;
1562 return true;
1563 case MVT::f32:
1564 case MVT::v2f16:
1565 case MVT::v2bf16:
1566 ElementType = MVT::i32;
1567 return true;
1568 case MVT::f64:
1569 ElementType = MVT::i64;
1570 return true;
1571 }
1572}
1573
1574// Use byte-store when the param address of the argument value is unaligned.
1575// This may happen when the return value is a field of a packed structure.
1576//
1577// This is called in LowerCall() when passing the param values.
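//
// For example (illustrative): an f32 value at byte offset 1 of a packed struct
// is bitcast to i32 and written with four st.param.b8 stores at offsets 1..4,
// each storing (value >> (8 * i)) truncated to a single byte.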
1578static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
1579 uint64_t Offset, EVT ElementType,
1580 SDValue StVal, SDValue &InGlue,
1581 unsigned ArgID, const SDLoc &dl) {
1582 // Bit logic only works on integer types
1583 if (adjustElementType(ElementType))
1584 StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
1585
1586 // Store each byte
1587 SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1588 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1589 // Shift the byte to the last byte position
1590 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1591 DAG.getConstant(i * 8, dl, MVT::i32));
1592 SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1593 DAG.getConstant(Offset + i, dl, MVT::i32),
1594 ShiftVal, InGlue};
1595 // Trunc store only the last byte by using
1596 // st.param.b8
1597 // The register type can be larger than b8.
1598 Chain = DAG.getMemIntrinsicNode(
1599 NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1600 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
1601 InGlue = Chain.getValue(1);
1602 }
1603 return Chain;
1604}
1605
1606// Use byte-load when the param address of the returned value is unaligned.
1607// This may happen when the returned value is a field of a packed structure.
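//
// For example (illustrative): an f32 field at byte offset 1 is assembled from
// four ld.param.b8 loads; each byte is zero-extended, masked to 8 bits,
// shifted into position and OR-ed into an i32, which is then bitcast back to
// f32.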
1608static SDValue
1609LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
1610 EVT ElementType, SDValue &InGlue,
1611 SmallVectorImpl<SDValue> &TempProxyRegOps,
1612 const SDLoc &dl) {
1613 // Bit logic only works on integer types
1614 EVT MergedType = ElementType;
1615 adjustElementType(MergedType);
1616
1617 // Load each byte and construct the whole value. Initial value to 0
1618 SDValue RetVal = DAG.getConstant(0, dl, MergedType);
1619 // LoadParamMemI8 loads into i16 register only
1620 SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
1621 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1622 SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1623 DAG.getConstant(Offset + i, dl, MVT::i32),
1624 InGlue};
1625 // This will be selected to LoadParamMemI8
1626 SDValue LdVal =
1627 DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
1628 MVT::i8, MachinePointerInfo(), Align(1));
1629 SDValue TmpLdVal = LdVal.getValue(0);
1630 Chain = LdVal.getValue(1);
1631 InGlue = LdVal.getValue(2);
1632
1633 TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
1634 TmpLdVal.getSimpleValueType(), TmpLdVal);
1635 TempProxyRegOps.push_back(TmpLdVal);
1636
1637 SDValue CMask = DAG.getConstant(255, dl, MergedType);
1638 SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
1639 // Need to extend the i16 register to the whole width.
1640 TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
1641 // Mask off the high bits. Leave only the lower 8 bits.
1642 // Do this because we are using loadparam.b8.
1643 TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
1644 // Shift and merge
1645 TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
1646 RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
1647 }
1648 if (ElementType != MergedType)
1649 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
1650
1651 return RetVal;
1652}
1653
1654SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1655 SmallVectorImpl<SDValue> &InVals) const {
1656
1657 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1659 "Support for variadic functions (unsized array parameter) introduced "
1660 "in PTX ISA version 6.0 and requires target sm_30.");
1661
1662 SelectionDAG &DAG = CLI.DAG;
1663 SDLoc dl = CLI.DL;
1664 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1665 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1666 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1667 SDValue Chain = CLI.Chain;
1668 SDValue Callee = CLI.Callee;
1669 bool &isTailCall = CLI.IsTailCall;
1670 ArgListTy &Args = CLI.getArgs();
1671 Type *RetTy = CLI.RetTy;
1672 const CallBase *CB = CLI.CB;
1673 const DataLayout &DL = DAG.getDataLayout();
1674
1675 bool isABI = (STI.getSmVersion() >= 20);
1676 assert(isABI && "Non-ABI compilation is not supported");
1677 if (!isABI)
1678 return Chain;
1679
1680 // Variadic arguments.
1681 //
1682 // Normally, for each argument, we declare a param scalar or a param
1683 // byte array in the .param space, and store the argument value to that
1684 // param scalar or array starting at offset 0.
1685 //
1686 // In the case of the first variadic argument, we declare a vararg byte array
1687 // with size 0. The exact size of this array isn't known at this point, so
1688 // it'll be patched later. All the variadic arguments will be stored to this
1689 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1690 // initially set to 0, so it can be used for non-variadic arguments (which use
1691 // 0 offset) to simplify the code.
1692 //
1693 // After all vararg is processed, 'VAOffset' holds the size of the
1694 // vararg byte array.
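 //
 // For example (illustrative), a call to foo(int, ...) with two extra i32
 // arguments ends up declaring roughly:
 //   .param .b32 param0;
 //   .param .align 4 .b8 param1[];  // size patched to 8 once VAOffset is known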
1695
1696 SDValue VADeclareParam; // vararg byte array
1697 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1698 unsigned VAOffset = 0; // current offset in the param array
1699
1700 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1);
1701 SDValue TempChain = Chain;
1702 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1703 SDValue InGlue = Chain.getValue(1);
1704
1705 unsigned ParamCount = 0;
1706 // Args.size() and Outs.size() need not match.
1707 // Outs.size() will be larger
1708 // * if there is an aggregate argument with multiple fields (each field
1709 // showing up separately in Outs)
1710 // * if there is a vector argument with more than typical vector-length
1711 // elements (generally if more than 4) where each vector element is
1712 // individually present in Outs.
1713 // So a different index should be used for indexing into Outs/OutVals.
1714 // See similar issue in LowerFormalArguments.
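 // For example (illustrative): a single struct argument {i32, i32} is one
 // entry in Args but two entries in Outs/OutVals, so OIdx advances twice for
 // it.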
1715 unsigned OIdx = 0;
1716 // Declare the .param or .reg variables needed to pass values
1717 // to the function
1718 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1719 EVT VT = Outs[OIdx].VT;
1720 Type *Ty = Args[i].Ty;
1721 bool IsVAArg = (i >= CLI.NumFixedArgs);
1722 bool IsByVal = Outs[OIdx].Flags.isByVal();
1723
1724 SmallVector<EVT, 16> VTs;
1725 SmallVector<uint64_t, 16> Offsets;
1726
1727 assert((!IsByVal || Args[i].IndirectType) &&
1728 "byval arg must have indirect type");
1729 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1730 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1731
1732 Align ArgAlign;
1733 if (IsByVal) {
1734 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1735 // so we don't need to worry whether it's naturally aligned or not.
1736 // See TargetLowering::LowerCallTo().
1737 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1738 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1739 InitialAlign, DL);
1740 if (IsVAArg)
1741 VAOffset = alignTo(VAOffset, ArgAlign);
1742 } else {
1743 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
1744 }
1745
1746 unsigned TypeSize =
1747 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1748 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1749
1750 bool NeedAlign; // Does argument declaration specify alignment?
1751 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1752 if (IsVAArg) {
1753 if (ParamCount == FirstVAArg) {
1754 SDValue DeclareParamOps[] = {
1755 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1756 DAG.getConstant(ParamCount, dl, MVT::i32),
1757 DAG.getConstant(1, dl, MVT::i32), InGlue};
1758 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1759 DeclareParamVTs, DeclareParamOps);
1760 }
1761 NeedAlign = PassAsArray;
1762 } else if (PassAsArray) {
1763 // declare .param .align <align> .b8 .param<n>[<size>];
1764 SDValue DeclareParamOps[] = {
1765 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1766 DAG.getConstant(ParamCount, dl, MVT::i32),
1767 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1768 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1769 DeclareParamOps);
1770 NeedAlign = true;
1771 } else {
1772 // declare .param .b<size> .param<n>;
1773 if (VT.isInteger() || VT.isFloatingPoint()) {
1774 // PTX ABI requires integral types to be at least 32 bits in
1775 // size. FP16 is loaded/stored using i16, so it's handled
1776 // here as well.
1778 }
1779 SDValue DeclareScalarParamOps[] = {
1780 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1781 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1782 DAG.getConstant(0, dl, MVT::i32), InGlue};
1783 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1784 DeclareScalarParamOps);
1785 NeedAlign = false;
1786 }
1787 InGlue = Chain.getValue(1);
1788
1789 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1790 // than 32-bits are sign extended or zero extended, depending on
1791 // whether they are signed or unsigned types. This case applies
1792 // only to scalar parameters and not to aggregate values.
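 // For example (illustrative): an i8 scalar argument is declared as
 // .param .b32 and its value is sign- or zero-extended to i32 before the
 // st.param below.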
1793 bool ExtendIntegerParam =
1794 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1795
1796 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1797 SmallVector<SDValue, 6> StoreOperands;
1798 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1799 EVT EltVT = VTs[j];
1800 int CurOffset = Offsets[j];
1801 MaybeAlign PartAlign;
1802 if (NeedAlign)
1803 PartAlign = commonAlignment(ArgAlign, CurOffset);
1804
1805 SDValue StVal = OutVals[OIdx];
1806
1807 MVT PromotedVT;
1808 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1809 EltVT = EVT(PromotedVT);
1810 }
1811 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1812 llvm::ISD::NodeType Ext =
1813 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1814 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1815 }
1816
1817 if (IsByVal) {
1818 auto PtrVT = getPointerTy(DL);
1819 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1820 DAG.getConstant(CurOffset, dl, PtrVT));
1821 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1822 PartAlign);
1823 } else if (ExtendIntegerParam) {
1824 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1825 // zext/sext to i32
1826 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1828 dl, MVT::i32, StVal);
1829 }
1830
1831 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1832 // Use 16-bit registers for small stores as it's the
1833 // smallest general purpose register size supported by NVPTX.
1834 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1835 }
1836
1837 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1838 // scalar store. In such cases, fall back to byte stores.
1839 if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1840 PartAlign.value() <
1841 DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1842 assert(StoreOperands.empty() && "Unfinished preceding store.");
1843 Chain = LowerUnalignedStoreParam(
1844 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1845 StVal, InGlue, ParamCount, dl);
1846
1847 // LowerUnalignedStoreParam took care of inserting the necessary nodes
1848 // into the SDAG, so just move on to the next element.
1849 if (!IsByVal)
1850 ++OIdx;
1851 continue;
1852 }
1853
1854 // New store.
1855 if (VectorInfo[j] & PVF_FIRST) {
1856 assert(StoreOperands.empty() && "Unfinished preceding store.");
1857 StoreOperands.push_back(Chain);
1858 StoreOperands.push_back(
1859 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1860
1861 StoreOperands.push_back(DAG.getConstant(
1862 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1863 dl, MVT::i32));
1864 }
1865
1866 // Record the value to store.
1867 StoreOperands.push_back(StVal);
1868
1869 if (VectorInfo[j] & PVF_LAST) {
1870 unsigned NumElts = StoreOperands.size() - 3;
1871 NVPTXISD::NodeType Op;
1872 switch (NumElts) {
1873 case 1:
1874 Op = NVPTXISD::StoreParam;
1875 break;
1876 case 2:
1877 Op = NVPTXISD::StoreParamV2;
1878 break;
1879 case 4:
1880 Op = NVPTXISD::StoreParamV4;
1881 break;
1882 default:
1883 llvm_unreachable("Invalid vector info.");
1884 }
1885
1886 StoreOperands.push_back(InGlue);
1887
1888 // Adjust type of the store op if we've extended the scalar
1889 // return value.
1890 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1891
1892 Chain = DAG.getMemIntrinsicNode(
1893 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1894 TheStoreType, MachinePointerInfo(), PartAlign,
1895 MachineMemOperand::MOStore);
1896 InGlue = Chain.getValue(1);
1897
1898 // Cleanup.
1899 StoreOperands.clear();
1900
1901 // TODO: We may need to support vector types that can be passed
1902 // as scalars in variadic arguments.
1903 if (!IsByVal && IsVAArg) {
1904 assert(NumElts == 1 &&
1905 "Vectorization is expected to be disabled for variadics.");
1906 VAOffset += DL.getTypeAllocSize(
1907 TheStoreType.getTypeForEVT(*DAG.getContext()));
1908 }
1909 }
1910 if (!IsByVal)
1911 ++OIdx;
1912 }
1913 assert(StoreOperands.empty() && "Unfinished parameter store.");
1914 if (!IsByVal && VTs.size() > 0)
1915 --OIdx;
1916 ++ParamCount;
1917 if (IsByVal && IsVAArg)
1918 VAOffset += TypeSize;
1919 }
1920
1921 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1922 MaybeAlign retAlignment = std::nullopt;
1923
1924 // Handle Result
1925 if (Ins.size() > 0) {
1926 SmallVector<EVT, 16> resvtparts;
1927 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1928
1929 // Declare
1930 // .param .align N .b8 retval0[<size-in-bytes>], or
1931 // .param .b<size-in-bits> retval0
1932 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1933 if (!IsTypePassedAsArray(RetTy)) {
1934 resultsz = promoteScalarArgumentSize(resultsz);
1935 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1936 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1937 DAG.getConstant(resultsz, dl, MVT::i32),
1938 DAG.getConstant(0, dl, MVT::i32), InGlue };
1939 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1940 DeclareRetOps);
1941 InGlue = Chain.getValue(1);
1942 } else {
1943 retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
1944 assert(retAlignment && "retAlignment is guaranteed to be set");
1945 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1946 SDValue DeclareRetOps[] = {
1947 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1948 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1949 DAG.getConstant(0, dl, MVT::i32), InGlue};
1950 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1951 DeclareRetOps);
1952 InGlue = Chain.getValue(1);
1953 }
1954 }
1955
1956 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1957 // Set the size of the vararg param byte array if the callee is a variadic
1958 // function and the variadic part is not empty.
1959 if (HasVAArgs) {
1960 SDValue DeclareParamOps[] = {
1961 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1962 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1963 VADeclareParam.getOperand(4)};
1964 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1965 VADeclareParam->getVTList(), DeclareParamOps);
1966 }
1967
1968 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1969 // between them we must rely on the call site value which is valid for
1970 // indirect calls but is always null for libcalls.
1971 bool isIndirectCall = !Func && CB;
1972
1973 if (isa<ExternalSymbolSDNode>(Callee)) {
1974 Function* CalleeFunc = nullptr;
1975
1976 // Try to find the callee in the current module.
1977 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1978 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1979
1980 // Set the "libcall callee" attribute to indicate that the function
1981 // must always have a declaration.
1982 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1983 }
1984
1985 if (isIndirectCall) {
1986 // This is indirect function call case : PTX requires a prototype of the
1987 // form
1988 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1989 // to be emitted, and the label has to used as the last arg of call
1990 // instruction.
1991 // The prototype is embedded in a string and put as the operand for a
1992 // CallPrototype SDNode which will print out to the value of the string.
1993 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1994 std::string Proto = getPrototype(
1995 DL, RetTy, Args, Outs, retAlignment,
1996 HasVAArgs
1997 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1998 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1999 : std::nullopt,
2000 *CB, UniqueCallSite);
2001 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
2002 SDValue ProtoOps[] = {
2003 Chain,
2004 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
2005 InGlue,
2006 };
2007 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
2008 InGlue = Chain.getValue(1);
2009 }
2010 // Op to just print "call"
2011 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2012 SDValue PrintCallOps[] = {
2013 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
2014 };
2015 // We model convergent calls as separate opcodes.
2017 if (CLI.IsConvergent)
2020 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
2021 InGlue = Chain.getValue(1);
2022
2023 // Ops to print out the function name
2024 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2025 SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2026 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2027 InGlue = Chain.getValue(1);
2028
2029 // Ops to print out the param list
2030 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2031 SDValue CallArgBeginOps[] = { Chain, InGlue };
2032 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2033 CallArgBeginOps);
2034 InGlue = Chain.getValue(1);
2035
2036 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2037 ++i) {
2038 unsigned opcode;
2039 if (i == (e - 1))
2040 opcode = NVPTXISD::LastCallArg;
2041 else
2042 opcode = NVPTXISD::CallArg;
2043 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2044 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2045 DAG.getConstant(i, dl, MVT::i32), InGlue };
2046 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2047 InGlue = Chain.getValue(1);
2048 }
2049 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2050 SDValue CallArgEndOps[] = { Chain,
2051 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2052 InGlue };
2053 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2054 InGlue = Chain.getValue(1);
2055
2056 if (isIndirectCall) {
2057 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2058 SDValue PrototypeOps[] = {
2059 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2060 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2061 InGlue = Chain.getValue(1);
2062 }
2063
2064 SmallVector<SDValue, 16> ProxyRegOps;
2065 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2066 // An item of the vector is filled if the element does not need a ProxyReg
2067 // operation on it and should be added to InVals as is. ProxyRegOps and
2068 // ProxyRegTruncates contain empty/none items at the same index.
2069 SmallVector<SDValue, 16> RetElts;
2070 // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
2071 // to use the values of `LoadParam`s; they are replaced later, once
2072 // `CALLSEQ_END` has been added.
2073 SmallVector<SDValue, 16> TempProxyRegOps;
2074
2075 // Generate loads from param memory/moves from registers for result
2076 if (Ins.size() > 0) {
2077 SmallVector<EVT, 16> VTs;
2078 SmallVector<uint64_t, 16> Offsets;
2079 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2080 assert(VTs.size() == Ins.size() && "Bad value decomposition");
2081
2082 Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
2083 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2084
2085 SmallVector<EVT, 6> LoadVTs;
2086 int VecIdx = -1; // Index of the first element of the vector.
2087
2088 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2089 // 32-bits are sign extended or zero extended, depending on whether
2090 // they are signed or unsigned types.
2091 bool ExtendIntegerRetVal =
2092 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
2093
2094 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2095 bool needTruncate = false;
2096 EVT TheLoadType = VTs[i];
2097 EVT EltType = Ins[i].VT;
2098 Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
2099 MVT PromotedVT;
2100
2101 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
2102 TheLoadType = EVT(PromotedVT);
2103 EltType = EVT(PromotedVT);
2104 needTruncate = true;
2105 }
2106
2107 if (ExtendIntegerRetVal) {
2108 TheLoadType = MVT::i32;
2109 EltType = MVT::i32;
2110 needTruncate = true;
2111 } else if (TheLoadType.getSizeInBits() < 16) {
2112 if (VTs[i].isInteger())
2113 needTruncate = true;
2114 EltType = MVT::i16;
2115 }
2116
2117 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
2118 // scalar load. In such cases, fall back to byte loads.
2119 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
2120 EltAlign < DL.getABITypeAlign(
2121 TheLoadType.getTypeForEVT(*DAG.getContext()))) {
2122 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2123 SDValue Ret = LowerUnalignedLoadRetParam(
2124 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
2125 ProxyRegOps.push_back(SDValue());
2126 ProxyRegTruncates.push_back(std::optional<MVT>());
2127 RetElts.resize(i);
2128 RetElts.push_back(Ret);
2129
2130 continue;
2131 }
2132
2133 // Record index of the very first element of the vector.
2134 if (VectorInfo[i] & PVF_FIRST) {
2135 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2136 VecIdx = i;
2137 }
2138
2139 LoadVTs.push_back(EltType);
2140
2141 if (VectorInfo[i] & PVF_LAST) {
2142 unsigned NumElts = LoadVTs.size();
2143 LoadVTs.push_back(MVT::Other);
2144 LoadVTs.push_back(MVT::Glue);
2145 NVPTXISD::NodeType Op;
2146 switch (NumElts) {
2147 case 1:
2148 Op = NVPTXISD::LoadParam;
2149 break;
2150 case 2:
2151 Op = NVPTXISD::LoadParamV2;
2152 break;
2153 case 4:
2154 Op = NVPTXISD::LoadParamV4;
2155 break;
2156 default:
2157 llvm_unreachable("Invalid vector info.");
2158 }
2159
2160 SDValue LoadOperands[] = {
2161 Chain, DAG.getConstant(1, dl, MVT::i32),
2162 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2163 SDValue RetVal = DAG.getMemIntrinsicNode(
2164 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
2165 MachinePointerInfo(), EltAlign,
2166 MachineMemOperand::MOLoad);
2167
2168 for (unsigned j = 0; j < NumElts; ++j) {
2169 ProxyRegOps.push_back(RetVal.getValue(j));
2170
2171 if (needTruncate)
2172 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
2173 else
2174 ProxyRegTruncates.push_back(std::optional<MVT>());
2175 }
2176
2177 Chain = RetVal.getValue(NumElts);
2178 InGlue = RetVal.getValue(NumElts + 1);
2179
2180 // Cleanup
2181 VecIdx = -1;
2182 LoadVTs.clear();
2183 }
2184 }
2185 }
2186
2187 Chain =
2188 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
2189 InGlue = Chain.getValue(1);
2190
2191 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2192 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
2193 // dangling.
2194 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2195 if (i < RetElts.size() && RetElts[i]) {
2196 InVals.push_back(RetElts[i]);
2197 continue;
2198 }
2199
2200 SDValue Ret = DAG.getNode(
2202 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2203 { Chain, ProxyRegOps[i], InGlue }
2204 );
2205
2206 Chain = Ret.getValue(1);
2207 InGlue = Ret.getValue(2);
2208
2209 if (ProxyRegTruncates[i]) {
2210 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
2211 }
2212
2213 InVals.push_back(Ret);
2214 }
2215
2216 for (SDValue &T : TempProxyRegOps) {
2217 SDValue Repl = DAG.getNode(
2219 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
2220 {Chain, T.getOperand(0), InGlue});
2221 DAG.ReplaceAllUsesWith(T, Repl);
2222 DAG.RemoveDeadNode(T.getNode());
2223
2224 Chain = Repl.getValue(1);
2225 InGlue = Repl.getValue(2);
2226 }
2227
2228 // set isTailCall to false for now, until we figure out how to express
2229 // tail call optimization in PTX
2230 isTailCall = false;
2231 return Chain;
2232}
2233
2234SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2235                                                     SelectionDAG &DAG) const {
2236
2237 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2238 const Function &Fn = DAG.getMachineFunction().getFunction();
2239
2240 DiagnosticInfoUnsupported NoDynamicAlloca(
2241 Fn,
2242        "Support for dynamic alloca was introduced in PTX ISA version 7.3 and "
2243        "requires target sm_52.",
2244 SDLoc(Op).getDebugLoc());
2245 DAG.getContext()->diagnose(NoDynamicAlloca);
2246 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
2247 Op.getOperand(0)};
2248 return DAG.getMergeValues(Ops, SDLoc());
2249 }
2250
2251 SDValue Chain = Op.getOperand(0);
2252 SDValue Size = Op.getOperand(1);
2253 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2254 SDLoc DL(Op.getNode());
2255
2256 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
2257 MVT ValueSizeTy = nvTM->is64Bit() ? MVT::i64 : MVT::i32;
2258
2259 SDValue AllocOps[] = {Chain, DAG.getZExtOrTrunc(Size, DL, ValueSizeTy),
2260 DAG.getTargetConstant(Align, DL, MVT::i32)};
2261 EVT RetTypes[] = {ValueSizeTy, MVT::Other};
2262 return DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, RetTypes, AllocOps);
2263}
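// Example (a sketch of the expected mapping, not emitted here verbatim): an
// IR-level dynamic alloca such as
//   %buf = alloca i8, i64 %n, align 16
// reaches this hook as a DYNAMIC_STACKALLOC node; the size is canonicalized to
// i64 (m64) or i32 (m32) as above, and the resulting NVPTXISD node is selected
// to the PTX stack-allocation instruction available on PTX ISA 7.3+ / sm_52+.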
2264
2265// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2266// (see LegalizeDAG.cpp). This is slow and uses local memory.
2267// We use extract/insert/build vector instead, just as LegalizeOp() did in llvm 2.5.
2268SDValue
2269NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2270 SDNode *Node = Op.getNode();
2271 SDLoc dl(Node);
2273 unsigned NumOperands = Node->getNumOperands();
2274 for (unsigned i = 0; i < NumOperands; ++i) {
2275 SDValue SubOp = Node->getOperand(i);
2276 EVT VVT = SubOp.getNode()->getValueType(0);
2277 EVT EltVT = VVT.getVectorElementType();
2278 unsigned NumSubElem = VVT.getVectorNumElements();
2279 for (unsigned j = 0; j < NumSubElem; ++j) {
2280 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2281 DAG.getIntPtrConstant(j, dl)));
2282 }
2283 }
2284 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2285}
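// Example: concatenating two v2f32 operands this way produces four
// EXTRACT_VECTOR_ELT nodes feeding a single v4f32 BUILD_VECTOR, keeping the
// values in registers instead of spilling them through local memory.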
2286
2287// We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move. Normally
2288// it would be lowered as two constant loads and a vector-packing move.
2289// Instead we want just a constant move:
2290// mov.b32 %r2, 0x40003C00
2291SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2292 SelectionDAG &DAG) const {
2293 EVT VT = Op->getValueType(0);
2294 if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2295 return Op;
2296
2297 SDLoc DL(Op);
2298
2299 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2300 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2301 isa<ConstantFPSDNode>(Operand);
2302 })) {
2303    // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2304    // allows us to optimize the calculation of the constant parts.
2305 if (VT == MVT::v4i8) {
2306 SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2307 SDValue E01 = DAG.getNode(
2308 NVPTXISD::BFI, DL, MVT::i32,
2309 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2310 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2311 SDValue E012 =
2312 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2313 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2314 E01, DAG.getConstant(16, DL, MVT::i32), C8);
2315 SDValue E0123 =
2316 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2317 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2318 E012, DAG.getConstant(24, DL, MVT::i32), C8);
2319 return DAG.getNode(ISD::BITCAST, DL, VT, E0123);
2320 }
2321 return Op;
2322 }
2323
2324  // Get the value of the Nth operand as an APInt(32). Undef values are treated as 0.
2325 auto GetOperand = [](SDValue Op, int N) -> APInt {
2326 const SDValue &Operand = Op->getOperand(N);
2327 EVT VT = Op->getValueType(0);
2328 if (Operand->isUndef())
2329 return APInt(32, 0);
2330 APInt Value;
2331 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2332 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2333 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2334 Value = Operand->getAsAPIntVal();
2335 else
2336 llvm_unreachable("Unsupported type");
2337    // i8 values are carried around as i16, so we need to zero out the upper
2338    // bits so they do not get in the way of combining individual byte values.
2339 if (VT == MVT::v4i8)
2340 Value = Value.trunc(8);
2341 return Value.zext(32);
2342 };
2343 APInt Value;
2344 if (Isv2x16VT(VT)) {
2345 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2346 } else if (VT == MVT::v4i8) {
2347 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2348 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2349 } else {
2350 llvm_unreachable("Unsupported type");
2351 }
2352 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2353 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
2354}
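// Worked example: BUILD_VECTOR <half 1.0, half 2.0> packs to
//   0x3C00 | (0x4000 << 16) == 0x40003C00,
// which becomes the single `mov.b32` shown in the comment above. For v4i8 the
// operands are first truncated to 8 bits so the shifted lanes cannot overlap
// when OR'ed together.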
2355
2356SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2357 SelectionDAG &DAG) const {
2358 SDValue Index = Op->getOperand(1);
2359 SDValue Vector = Op->getOperand(0);
2360 SDLoc DL(Op);
2361 EVT VectorVT = Vector.getValueType();
2362
2363 if (VectorVT == MVT::v4i8) {
2364 SDValue BFE =
2365 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2366 {Vector,
2367 DAG.getNode(ISD::MUL, DL, MVT::i32,
2368 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2369 DAG.getConstant(8, DL, MVT::i32)),
2370 DAG.getConstant(8, DL, MVT::i32)});
2371 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0));
2372 }
2373
2374 // Constant index will be matched by tablegen.
2375 if (isa<ConstantSDNode>(Index.getNode()))
2376 return Op;
2377
2378 // Extract individual elements and select one of them.
2379 assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2380 EVT EltVT = VectorVT.getVectorElementType();
2381
2382 SDLoc dl(Op.getNode());
2383 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2384 DAG.getIntPtrConstant(0, dl));
2385 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2386 DAG.getIntPtrConstant(1, dl));
2387 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2388                         ISD::CondCode::SETEQ);
2389}
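// Example: extracting element %i of a v4i8 becomes BFE(vec, %i * 8, 8), i.e.
// an 8-bit field starting at bit 8 * %i of the packed i32. For v2f16/v2bf16/
// v2i16 with a non-constant index, both halves are extracted and a
// (index == 0) select picks the result.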
2390
2391SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2392 SelectionDAG &DAG) const {
2393 SDValue Vector = Op->getOperand(0);
2394 EVT VectorVT = Vector.getValueType();
2395
2396 if (VectorVT != MVT::v4i8)
2397 return Op;
2398 SDLoc DL(Op);
2399 SDValue Value = Op->getOperand(1);
2400 if (Value->isUndef())
2401 return Vector;
2402
2403 SDValue Index = Op->getOperand(2);
2404
2405 SDValue BFI =
2406 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2407 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2408 DAG.getNode(ISD::MUL, DL, MVT::i32,
2409 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2410 DAG.getConstant(8, DL, MVT::i32)),
2411 DAG.getConstant(8, DL, MVT::i32)});
2412 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2413}
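// Example: inserting value %v at index %i of a v4i8 becomes
// BFI(zext(%v), vec, %i * 8, 8), overwriting the 8-bit field at bit offset
// 8 * %i and leaving the other three bytes of the packed i32 untouched.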
2414
2415SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2416 SelectionDAG &DAG) const {
2417 SDValue V1 = Op.getOperand(0);
2418 EVT VectorVT = V1.getValueType();
2419 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2420 return Op;
2421
2422 // Lower shuffle to PRMT instruction.
2423 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2424 SDValue V2 = Op.getOperand(1);
2425 uint32_t Selector = 0;
2426 for (auto I : llvm::enumerate(SVN->getMask())) {
2427 if (I.value() != -1) // -1 is a placeholder for undef.
2428 Selector |= (I.value() << (I.index() * 4));
2429 }
2430
2431 SDLoc DL(Op);
2432 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2433 DAG.getConstant(Selector, DL, MVT::i32),
2434 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2435}
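// Example: shuffling two v4i8 values with mask <4, 1, 6, 3> packs the mask
// entries into the PRMT selector 0x3614 (element k contributes mask[k] at bit
// k * 4); byte indices 0-3 select from V1 and 4-7 from V2, so the whole
// shuffle is a single prmt.b32.
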
2436/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2437/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2438/// amount, or
2439/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2440/// amount.
2441SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2442 SelectionDAG &DAG) const {
2443 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2444 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2445
2446 EVT VT = Op.getValueType();
2447 unsigned VTBits = VT.getSizeInBits();
2448 SDLoc dl(Op);
2449 SDValue ShOpLo = Op.getOperand(0);
2450 SDValue ShOpHi = Op.getOperand(1);
2451 SDValue ShAmt = Op.getOperand(2);
2452 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2453
2454 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2455 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2456 // {dHi, dLo} = {aHi, aLo} >> Amt
2457 // dHi = aHi >> Amt
2458 // dLo = shf.r.clamp aLo, aHi, Amt
2459
2460 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2461 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2462 ShAmt);
2463
2464 SDValue Ops[2] = { Lo, Hi };
2465 return DAG.getMergeValues(Ops, dl);
2466 }
2467 else {
2468 // {dHi, dLo} = {aHi, aLo} >> Amt
2469 // - if (Amt>=size) then
2470 // dLo = aHi >> (Amt-size)
2471 // dHi = aHi >> Amt (this is either all 0 or all 1)
2472 // else
2473 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2474 // dHi = aHi >> Amt
2475
2476 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2477 DAG.getConstant(VTBits, dl, MVT::i32),
2478 ShAmt);
2479 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2480 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2481 DAG.getConstant(VTBits, dl, MVT::i32));
2482 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2483 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2484 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2485
2486 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2487 DAG.getConstant(VTBits, dl, MVT::i32),
2488 ISD::SETGE);
2489 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2490 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2491
2492 SDValue Ops[2] = { Lo, Hi };
2493 return DAG.getMergeValues(Ops, dl);
2494 }
2495}
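// Worked example for the generic path (VTBits == 64, Amt == 8):
//   RevShAmt = 56, Tmp1 = aLo >>u 8, Tmp2 = aHi << 56,
//   dLo = Tmp1 | Tmp2, dHi = aHi >> 8 (arithmetic for SRA_PARTS),
// matching the "Amt < size" branch of the comment above; once Amt >= size the
// select routes aHi >> (Amt - size) into dLo instead.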
2496
2497/// LowerShiftLeftParts - Lower SHL_PARTS, which
2498/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2499/// amount, or
2500/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2501/// amount.
2502SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2503 SelectionDAG &DAG) const {
2504 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2505 assert(Op.getOpcode() == ISD::SHL_PARTS);
2506
2507 EVT VT = Op.getValueType();
2508 unsigned VTBits = VT.getSizeInBits();
2509 SDLoc dl(Op);
2510 SDValue ShOpLo = Op.getOperand(0);
2511 SDValue ShOpHi = Op.getOperand(1);
2512 SDValue ShAmt = Op.getOperand(2);
2513
2514 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2515 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2516 // {dHi, dLo} = {aHi, aLo} << Amt
2517 // dHi = shf.l.clamp aLo, aHi, Amt
2518 // dLo = aLo << Amt
2519
2520 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2521 ShAmt);
2522 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2523
2524 SDValue Ops[2] = { Lo, Hi };
2525 return DAG.getMergeValues(Ops, dl);
2526 }
2527 else {
2528 // {dHi, dLo} = {aHi, aLo} << Amt
2529 // - if (Amt>=size) then
2530 // dLo = aLo << Amt (all 0)
2531    //      dHi = aLo << (Amt-size)
2532 // else
2533 // dLo = aLo << Amt
2534 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2535
2536 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2537 DAG.getConstant(VTBits, dl, MVT::i32),
2538 ShAmt);
2539 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2540 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2541 DAG.getConstant(VTBits, dl, MVT::i32));
2542 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2543 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2544 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2545
2546 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2547 DAG.getConstant(VTBits, dl, MVT::i32),
2548 ISD::SETGE);
2549 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2550 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2551
2552 SDValue Ops[2] = { Lo, Hi };
2553 return DAG.getMergeValues(Ops, dl);
2554 }
2555}
2556
2557SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2558 EVT VT = Op.getValueType();
2559
2560 if (VT == MVT::f32)
2561 return LowerFROUND32(Op, DAG);
2562
2563 if (VT == MVT::f64)
2564 return LowerFROUND64(Op, DAG);
2565
2566 llvm_unreachable("unhandled type");
2567}
2568
2569// This is the rounding method used in CUDA libdevice, shown here as C-like code:
2570// float roundf(float A)
2571// {
2572// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2573// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2574// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2575// }
2576SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2577 SelectionDAG &DAG) const {
2578 SDLoc SL(Op);
2579 SDValue A = Op.getOperand(0);
2580 EVT VT = Op.getValueType();
2581
2582 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2583
2584 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2585 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2586 const int SignBitMask = 0x80000000;
2587 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2588 DAG.getConstant(SignBitMask, SL, MVT::i32));
2589 const int PointFiveInBits = 0x3F000000;
2590 SDValue PointFiveWithSignRaw =
2591 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2592 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2593 SDValue PointFiveWithSign =
2594 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2595 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2596 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2597
2598 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2599 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2600 SDValue IsLarge =
2601 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2602 ISD::SETOGT);
2603 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2604
2605 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2606  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2607 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2608 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2609 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2610}
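// Worked example: A = 2.3 gives AdjustedA = 2.8 and FTRUNC -> 2.0; A = 2.5
// gives 3.0 (ties round away from zero); |A| < 0.5 returns FTRUNC(A) = +/-0.0;
// and |A| > 2^23 is already integral, so A is returned unchanged.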
2611
2612// The implementation of round(double) is similar to that of round(float) in
2613// that they both separate the value range into three regions and use a method
2614// specific to the region to round the values. However, round(double) first
2615// calculates the round of the absolute value and then adds the sign back while
2616// round(float) directly rounds the value with sign.
2617SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2618 SelectionDAG &DAG) const {
2619 SDLoc SL(Op);
2620 SDValue A = Op.getOperand(0);
2621 EVT VT = Op.getValueType();
2622
2623 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2624
2625 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2626 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2627 DAG.getConstantFP(0.5, SL, VT));
2628 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2629
2630 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2631 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2632  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2633 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2634 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2635 DAG.getConstantFP(0, SL, VT),
2636 RoundedA);
2637
2638 // Add sign to rounded_A
2639 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2640 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2641
2642 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2643 SDValue IsLarge =
2644 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2645 ISD::SETOGT);
2646 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2647}
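// Worked example: A = -2.5 gives |A| + 0.5 = 3.0, FTRUNC -> 3.0, and
// FCOPYSIGN -> -3.0; A = -0.3 selects 0.0 and the copysign restores -0.0;
// |A| > 2^52 is already integral and is returned as-is.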
2648
2649SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2650 SelectionDAG &DAG) const {
2651 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2652
2653 if (Op.getValueType() == MVT::bf16) {
2654 SDLoc Loc(Op);
2655 return DAG.getNode(
2656 ISD::FP_ROUND, Loc, MVT::bf16,
2657 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2658 DAG.getIntPtrConstant(0, Loc));
2659 }
2660
2661 // Everything else is considered legal.
2662 return Op;
2663}
2664
2665SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2666 SelectionDAG &DAG) const {
2667 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2668
2669 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2670 SDLoc Loc(Op);
2671 return DAG.getNode(
2672 Op.getOpcode(), Loc, Op.getValueType(),
2673 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2674 }
2675
2676 // Everything else is considered legal.
2677 return Op;
2678}
2679
2680SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2681 SelectionDAG &DAG) const {
2682 EVT NarrowVT = Op.getValueType();
2683 SDValue Wide = Op.getOperand(0);
2684 EVT WideVT = Wide.getValueType();
2685 if (NarrowVT.getScalarType() == MVT::bf16) {
2686 const TargetLowering *TLI = STI.getTargetLowering();
2687 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2688 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2689 }
2690 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2691 // This combination was the first to support f32 -> bf16.
2692 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2693 if (WideVT.getScalarType() == MVT::f32) {
2694 return Op;
2695 }
2696 if (WideVT.getScalarType() == MVT::f64) {
2697 SDLoc Loc(Op);
2698 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2699 // the hardware f32 -> bf16 instruction.
2700      SDValue rod = TLI->expandRoundInexactToOdd(
2701          WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2702 : MVT::f32,
2703 Wide, Loc, DAG);
2704 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2705 }
2706 }
2707 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2708 }
2709 }
2710
2711 // Everything else is considered legal.
2712 return Op;
2713}
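// Why round-to-odd above: rounding f64 -> f32 -> bf16 with round-to-nearest
// twice can round in the same direction twice and differ from a single
// correctly rounded f64 -> bf16 conversion. Rounding the f64 -> f32 step to
// odd preserves the sticky information in the f32's last bit, so the final
// f32 -> bf16 round-to-nearest step yields the correctly rounded result.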
2714
2715SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2716 SelectionDAG &DAG) const {
2717 SDValue Narrow = Op.getOperand(0);
2718 EVT NarrowVT = Narrow.getValueType();
2719 EVT WideVT = Op.getValueType();
2720 if (NarrowVT.getScalarType() == MVT::bf16) {
2721 if (WideVT.getScalarType() == MVT::f32 &&
2722 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2723 SDLoc Loc(Op);
2724 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2725 }
2726 if (WideVT.getScalarType() == MVT::f64 &&
2727 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2728 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2729 : MVT::f32;
2730 SDLoc Loc(Op);
2731 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2732 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2733 } else {
2734 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2735 }
2736 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2737 }
2738 }
2739
2740 // Everything else is considered legal.
2741 return Op;
2742}
2743
2744static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2745  SDLoc DL(Op);
2746 if (Op.getValueType() != MVT::v2i16)
2747 return Op;
2748 EVT EltVT = Op.getValueType().getVectorElementType();
2749 SmallVector<SDValue> VecElements;
2750 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2751 SmallVector<SDValue> ScalarArgs;
2752 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2753 [&](const SDUse &O) {
2754 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2755 O.get(), DAG.getIntPtrConstant(I, DL));
2756 });
2757 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2758 }
2759 SDValue V =
2760 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2761 return V;
2762}
2763
2764SDValue
2765NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2766  switch (Op.getOpcode()) {
2767 case ISD::RETURNADDR:
2768 return SDValue();
2769 case ISD::FRAMEADDR:
2770 return SDValue();
2771 case ISD::GlobalAddress:
2772 return LowerGlobalAddress(Op, DAG);
2774 return Op;
2775 case ISD::BUILD_VECTOR:
2776 return LowerBUILD_VECTOR(Op, DAG);
2778 return Op;
2779  case ISD::EXTRACT_VECTOR_ELT:
2780    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2781  case ISD::INSERT_VECTOR_ELT:
2782    return LowerINSERT_VECTOR_ELT(Op, DAG);
2783  case ISD::VECTOR_SHUFFLE:
2784    return LowerVECTOR_SHUFFLE(Op, DAG);
2785  case ISD::CONCAT_VECTORS:
2786    return LowerCONCAT_VECTORS(Op, DAG);
2787 case ISD::STORE:
2788 return LowerSTORE(Op, DAG);
2789 case ISD::LOAD:
2790 return LowerLOAD(Op, DAG);
2791 case ISD::SHL_PARTS:
2792 return LowerShiftLeftParts(Op, DAG);
2793 case ISD::SRA_PARTS:
2794 case ISD::SRL_PARTS:
2795 return LowerShiftRightParts(Op, DAG);
2796 case ISD::SELECT:
2797 return LowerSelect(Op, DAG);
2798 case ISD::FROUND:
2799 return LowerFROUND(Op, DAG);
2800 case ISD::SINT_TO_FP:
2801 case ISD::UINT_TO_FP:
2802 return LowerINT_TO_FP(Op, DAG);
2803 case ISD::FP_TO_SINT:
2804 case ISD::FP_TO_UINT:
2805 return LowerFP_TO_INT(Op, DAG);
2806 case ISD::FP_ROUND:
2807 return LowerFP_ROUND(Op, DAG);
2808 case ISD::FP_EXTEND:
2809 return LowerFP_EXTEND(Op, DAG);
2810 case ISD::BR_JT:
2811 return LowerBR_JT(Op, DAG);
2812 case ISD::VAARG:
2813 return LowerVAARG(Op, DAG);
2814 case ISD::VASTART:
2815 return LowerVASTART(Op, DAG);
2816 case ISD::ABS:
2817 case ISD::SMIN:
2818 case ISD::SMAX:
2819 case ISD::UMIN:
2820 case ISD::UMAX:
2821 case ISD::ADD:
2822 case ISD::SUB:
2823 case ISD::MUL:
2824 case ISD::SHL:
2825 case ISD::SREM:
2826 case ISD::UREM:
2827 return LowerVectorArith(Op, DAG);
2828  case ISD::DYNAMIC_STACKALLOC:
2829    return LowerDYNAMIC_STACKALLOC(Op, DAG);
2830 case ISD::CopyToReg:
2831 return LowerCopyToReg_128(Op, DAG);
2832 default:
2833 llvm_unreachable("Custom lowering not defined for operation");
2834 }
2835}
2836
2837SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
2838 SDLoc DL(Op);
2839 SDValue Chain = Op.getOperand(0);
2840 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
2841 SDValue Index = Op.getOperand(2);
2842
2843 unsigned JId = JT->getIndex();
2844  MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
2845  ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
2846
2847 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
2848
2849 // Generate BrxStart node
2850 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2851 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
2852
2853 // Generate BrxItem nodes
2854 assert(!MBBs.empty());
2855 for (MachineBasicBlock *MBB : MBBs.drop_back())
2856 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
2857 DAG.getBasicBlock(MBB), Chain.getValue(1));
2858
2859 // Generate BrxEnd nodes
2860 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
2861 IdV, Chain.getValue(1)};
2862 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
2863
2864 return BrxEnd;
2865}
2866
2867// This will prevent AsmPrinter from trying to print the jump tables itself.
2868unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
2869  return MachineJumpTableInfo::EK_Inline;
2870}
2871
2872// This function is almost a copy of SelectionDAG::expandVAArg().
2873// The only difference is that this one produces loads from the local address space.
2874SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2875 const TargetLowering *TLI = STI.getTargetLowering();
2876 SDLoc DL(Op);
2877
2878 SDNode *Node = Op.getNode();
2879 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2880 EVT VT = Node->getValueType(0);
2881 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2882 SDValue Tmp1 = Node->getOperand(0);
2883 SDValue Tmp2 = Node->getOperand(1);
2884 const MaybeAlign MA(Node->getConstantOperandVal(3));
2885
2886 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
2887 Tmp1, Tmp2, MachinePointerInfo(V));
2888 SDValue VAList = VAListLoad;
2889
2890 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2891 VAList = DAG.getNode(
2892 ISD::ADD, DL, VAList.getValueType(), VAList,
2893 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
2894
2895 VAList = DAG.getNode(
2896 ISD::AND, DL, VAList.getValueType(), VAList,
2897 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType()));
2898 }
2899
2900 // Increment the pointer, VAList, to the next vaarg
2901 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
2903 DL, VAList.getValueType()));
2904
2905 // Store the incremented VAList to the legalized pointer
2906 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
2907                      MachinePointerInfo(V));
2908
2909 const Value *SrcV =
2911
2912 // Load the actual argument out of the pointer VAList
2913 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
2914}
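// Example: for an i32 vararg with 8-byte alignment the code above rounds the
// va_list pointer up, VAList = (VAList + 7) & ~7, loads the argument from the
// local address space at that address, and writes VAList plus the argument
// size back as the new va_list value.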
2915
2916SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2917 const TargetLowering *TLI = STI.getTargetLowering();
2918 SDLoc DL(Op);
2919 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2920
2921 // Store the address of unsized array <function>_vararg[] in the ap object.
2922 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
2923 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg);
2924
2925 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2926 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
2927 MachinePointerInfo(SV));
2928}
2929
2930SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2931 SDValue Op0 = Op->getOperand(0);
2932 SDValue Op1 = Op->getOperand(1);
2933 SDValue Op2 = Op->getOperand(2);
2934 SDLoc DL(Op.getNode());
2935
2936 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2937
2938 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2939 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2940 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2941 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2942
2943 return Trunc;
2944}
2945
2946SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2947 if (Op.getValueType() == MVT::i1)
2948 return LowerLOADi1(Op, DAG);
2949
2950 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
2951 // unaligned loads and have to handle it here.
2952 EVT VT = Op.getValueType();
2953 if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2954 LoadSDNode *Load = cast<LoadSDNode>(Op);
2955 EVT MemVT = Load->getMemoryVT();
2956    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2957                                        MemVT, *Load->getMemOperand())) {
2958 SDValue Ops[2];
2959 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2960 return DAG.getMergeValues(Ops, SDLoc(Op));
2961 }
2962 }
2963
2964 return SDValue();
2965}
2966
2967// v = ld i1* addr
2968// =>
2969// v1 = ld i8* addr (-> i16)
2970// v = trunc i16 to i1
2971SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2972 SDNode *Node = Op.getNode();
2973 LoadSDNode *LD = cast<LoadSDNode>(Node);
2974 SDLoc dl(Node);
2975 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2976 assert(Node->getValueType(0) == MVT::i1 &&
2977 "Custom lowering for i1 load only");
2978 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
2979 LD->getBasePtr(), LD->getPointerInfo(),
2980 MVT::i8, LD->getAlign(),
2981 LD->getMemOperand()->getFlags());
2982 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2983 // The legalizer (the caller) is expecting two values from the legalized
2984 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2985 // in LegalizeDAG.cpp which also uses MergeValues.
2986 SDValue Ops[] = { result, LD->getChain() };
2987 return DAG.getMergeValues(Ops, dl);
2988}
2989
2990SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2991 StoreSDNode *Store = cast<StoreSDNode>(Op);
2992 EVT VT = Store->getMemoryVT();
2993
2994 if (VT == MVT::i1)
2995 return LowerSTOREi1(Op, DAG);
2996
2997 // v2f16 is legal, so we can't rely on legalizer to handle unaligned
2998 // stores and have to handle it here.
2999 if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
3000      !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
3001                                      VT, *Store->getMemOperand()))
3002 return expandUnalignedStore(Store, DAG);
3003
3004 // v2f16, v2bf16 and v2i16 don't need special handling.
3005 if (Isv2x16VT(VT) || VT == MVT::v4i8)
3006 return SDValue();
3007
3008 if (VT.isVector())
3009 return LowerSTOREVector(Op, DAG);
3010
3011 return SDValue();
3012}
3013
3014SDValue
3015NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
3016 SDNode *N = Op.getNode();
3017 SDValue Val = N->getOperand(1);
3018 SDLoc DL(N);
3019 EVT ValVT = Val.getValueType();
3020
3021 if (ValVT.isVector()) {
3022 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
3023 // legal. We can (and should) split that into 2 stores of <2 x double> here
3024 // but I'm leaving that as a TODO for now.
3025 if (!ValVT.isSimple())
3026 return SDValue();
3027 switch (ValVT.getSimpleVT().SimpleTy) {
3028 default:
3029 return SDValue();
3030 case MVT::v2i8:
3031 case MVT::v2i16:
3032 case MVT::v2i32:
3033 case MVT::v2i64:
3034 case MVT::v2f16:
3035 case MVT::v2bf16:
3036 case MVT::v2f32:
3037 case MVT::v2f64:
3038 case MVT::v4i8:
3039 case MVT::v4i16:
3040 case MVT::v4i32:
3041 case MVT::v4f16:
3042 case MVT::v4bf16:
3043 case MVT::v4f32:
3044 case MVT::v8f16: // <4 x f16x2>
3045 case MVT::v8bf16: // <4 x bf16x2>
3046 case MVT::v8i16: // <4 x i16x2>
3047 // This is a "native" vector type
3048 break;
3049 }
3050
3051 MemSDNode *MemSD = cast<MemSDNode>(N);
3052 const DataLayout &TD = DAG.getDataLayout();
3053
3054 Align Alignment = MemSD->getAlign();
3055 Align PrefAlign =
3056 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3057 if (Alignment < PrefAlign) {
3058 // This store is not sufficiently aligned, so bail out and let this vector
3059 // store be scalarized. Note that we may still be able to emit smaller
3060 // vector stores. For example, if we are storing a <4 x float> with an
3061 // alignment of 8, this check will fail but the legalizer will try again
3062 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3063 return SDValue();
3064 }
3065
3066 unsigned Opcode = 0;
3067 EVT EltVT = ValVT.getVectorElementType();
3068 unsigned NumElts = ValVT.getVectorNumElements();
3069
3070 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
3071 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3072 // stored type to i16 and propagate the "real" type as the memory type.
3073 bool NeedExt = false;
3074 if (EltVT.getSizeInBits() < 16)
3075 NeedExt = true;
3076
3077 bool StoreF16x2 = false;
3078 switch (NumElts) {
3079 default:
3080 return SDValue();
3081 case 2:
3082 Opcode = NVPTXISD::StoreV2;
3083 break;
3084 case 4:
3085 Opcode = NVPTXISD::StoreV4;
3086 break;
3087 case 8:
3088 // v8f16 is a special case. PTX doesn't have st.v8.f16
3089 // instruction. Instead, we split the vector into v2f16 chunks and
3090 // store them with st.v4.b32.
3091 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
3092 Opcode = NVPTXISD::StoreV4;
3093 StoreF16x2 = true;
3094 break;
3095 }
3096
3098
3099 // First is the chain
3100 Ops.push_back(N->getOperand(0));
3101
3102 if (StoreF16x2) {
3103 // Combine f16,f16 -> v2f16
3104 NumElts /= 2;
3105 for (unsigned i = 0; i < NumElts; ++i) {
3106 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3107 DAG.getIntPtrConstant(i * 2, DL));
3108 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3109 DAG.getIntPtrConstant(i * 2 + 1, DL));
3110 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
3111 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
3112 Ops.push_back(V2);
3113 }
3114 } else {
3115 // Then the split values
3116 for (unsigned i = 0; i < NumElts; ++i) {
3117 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
3118 DAG.getIntPtrConstant(i, DL));
3119 if (NeedExt)
3120 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3121 Ops.push_back(ExtVal);
3122 }
3123 }
3124
3125 // Then any remaining arguments
3126 Ops.append(N->op_begin() + 2, N->op_end());
3127
3128 SDValue NewSt =
3129 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3130 MemSD->getMemoryVT(), MemSD->getMemOperand());
3131
3132 // return DCI.CombineTo(N, NewSt, true);
3133 return NewSt;
3134 }
3135
3136 return SDValue();
3137}
3138
3139// st i1 v, addr
3140// =>
3141// v1 = zxt v to i16
3142// st.u8 i16, addr
3143SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3144 SDNode *Node = Op.getNode();
3145 SDLoc dl(Node);
3146 StoreSDNode *ST = cast<StoreSDNode>(Node);
3147 SDValue Tmp1 = ST->getChain();
3148 SDValue Tmp2 = ST->getBasePtr();
3149 SDValue Tmp3 = ST->getValue();
3150 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3151 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3152 SDValue Result =
3153 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3154 ST->getAlign(), ST->getMemOperand()->getFlags());
3155 return Result;
3156}
3157
3158SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3159 SelectionDAG &DAG) const {
3160 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3161 // operand so that it can pass the legalization.
3162
3163 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3164 "Custom lowering for 128-bit CopyToReg only");
3165
3166 SDNode *Node = Op.getNode();
3167 SDLoc DL(Node);
3168
3169 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3170 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3171 DAG.getIntPtrConstant(0, DL));
3172 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3173 DAG.getIntPtrConstant(1, DL));
3174
3176 SmallVector<EVT, 3> ResultsType(Node->values());
3177
3178 NewOps[0] = Op->getOperand(0); // Chain
3179 NewOps[1] = Op->getOperand(1); // Dst Reg
3180 NewOps[2] = Lo; // Lower 64-bit
3181 NewOps[3] = Hi; // Higher 64-bit
3182 if (Op.getNumOperands() == 4)
3183 NewOps[4] = Op->getOperand(3); // Glue if exists
3184
3185 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3186}
3187
3188unsigned NVPTXTargetLowering::getNumRegisters(
3189 LLVMContext &Context, EVT VT,
3190 std::optional<MVT> RegisterVT = std::nullopt) const {
3191 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3192 return 1;
3193 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3194}
3195
3196bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3197 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3198 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3199 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3200 Parts[0] = Val;
3201 return true;
3202 }
3203 return false;
3204}
3205
3206// This creates a target external symbol for a function parameter.
3207// The name of the symbol is composed from its index and the function name.
3208// A negative index corresponds to the special parameter (unsized array) used
3209// for passing variable arguments.
3210SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3211 EVT v) const {
3212 StringRef SavedStr = nvTM->getStrPool().save(
3214 return DAG.getTargetExternalSymbol(SavedStr.data(), v);
3215}
3216
3217SDValue NVPTXTargetLowering::LowerFormalArguments(
3218    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3219 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3220 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3221  MachineFunction &MF = DAG.getMachineFunction();
3222  const DataLayout &DL = DAG.getDataLayout();
3223 auto PtrVT = getPointerTy(DAG.getDataLayout());
3224
3225 const Function *F = &MF.getFunction();
3226 const AttributeList &PAL = F->getAttributes();
3227 const TargetLowering *TLI = STI.getTargetLowering();
3228
3229 SDValue Root = DAG.getRoot();
3230 std::vector<SDValue> OutChains;
3231
3232 bool isABI = (STI.getSmVersion() >= 20);
3233 assert(isABI && "Non-ABI compilation is not supported");
3234 if (!isABI)
3235 return Chain;
3236
3237 std::vector<Type *> argTypes;
3238 std::vector<const Argument *> theArgs;
3239 for (const Argument &I : F->args()) {
3240 theArgs.push_back(&I);
3241 argTypes.push_back(I.getType());
3242 }
3243 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3244 // Ins.size() will be larger
3245 // * if there is an aggregate argument with multiple fields (each field
3246 // showing up separately in Ins)
3247 // * if there is a vector argument with more than typical vector-length
3248 // elements (generally if more than 4) where each vector element is
3249 // individually present in Ins.
3250 // So a different index should be used for indexing into Ins.
3251 // See similar issue in LowerCall.
3252 unsigned InsIdx = 0;
3253
3254 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3255 Type *Ty = argTypes[i];
3256
3257 if (theArgs[i]->use_empty()) {
3258 // argument is dead
3259 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3260 SmallVector<EVT, 16> vtparts;
3261
3262 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
3263 if (vtparts.empty())
3264 report_fatal_error("Empty parameter types are not supported");
3265
3266 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3267 ++parti) {
3268 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3269 ++InsIdx;
3270 }
3271 if (vtparts.size() > 0)
3272 --InsIdx;
3273 continue;
3274 }
3275 if (Ty->isVectorTy()) {
3276 EVT ObjectVT = getValueType(DL, Ty);
3277 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
3278 for (unsigned parti = 0; parti < NumRegs; ++parti) {
3279 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3280 ++InsIdx;
3281 }
3282 if (NumRegs > 0)
3283 --InsIdx;
3284 continue;
3285 }
3286 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3287 continue;
3288 }
3289
3290 // In the following cases, assign a node order of "i+1"
3291 // to newly created nodes. The SDNodes for params have to
3292 // appear in the same order as their order of appearance
3293 // in the original function. "i+1" holds that order.
3294 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3295 bool aggregateIsPacked = false;
3296 if (StructType *STy = dyn_cast<StructType>(Ty))
3297 aggregateIsPacked = STy->isPacked();
3298
3301 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
3302 if (VTs.empty())
3303 report_fatal_error("Empty parameter types are not supported");
3304
3307 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3308
3309 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3310 int VecIdx = -1; // Index of the first element of the current vector.
3311 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3312 if (VectorInfo[parti] & PVF_FIRST) {
3313 assert(VecIdx == -1 && "Orphaned vector.");
3314 VecIdx = parti;
3315 }
3316
3317 // That's the last element of this store op.
3318 if (VectorInfo[parti] & PVF_LAST) {
3319 unsigned NumElts = parti - VecIdx + 1;
3320 EVT EltVT = VTs[parti];
3321 // i1 is loaded/stored as i8.
3322 EVT LoadVT = EltVT;
3323 if (EltVT == MVT::i1)
3324 LoadVT = MVT::i8;
3325 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3326 // getLoad needs a vector type, but it can't handle
3327 // vectors which contain v2f16 or v2bf16 elements. So we must load
3328 // using i32 here and then bitcast back.
3329 LoadVT = MVT::i32;
3330
3331 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3332 SDValue VecAddr =
3333 DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3334 DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3336 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3337
3338 const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3339 if (aggregateIsPacked)
3340 return Align(1);
3341 if (NumElts != 1)
3342 return std::nullopt;
3343 Align PartAlign =
3344 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
3345 return commonAlignment(PartAlign, Offsets[parti]);
3346 }();
3347 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3348 MachinePointerInfo(srcValue), PartAlign,
3351 if (P.getNode())
3352 P.getNode()->setIROrder(i + 1);
3353 for (unsigned j = 0; j < NumElts; ++j) {
3354 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3355 DAG.getIntPtrConstant(j, dl));
3356 // We've loaded i1 as an i8 and now must truncate it back to i1
3357 if (EltVT == MVT::i1)
3358 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3359 // v2f16 was loaded as an i32. Now we must bitcast it back.
3360 else if (EltVT != LoadVT)
3361 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3362
3363 // If a promoted integer type is used, truncate down to the original
3364 MVT PromotedVT;
3365 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3366 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3367 }
3368
3369 // Extend the element if necessary (e.g. an i8 is loaded
3370 // into an i16 register)
3371 if (Ins[InsIdx].VT.isInteger() &&
3372 Ins[InsIdx].VT.getFixedSizeInBits() >
3373 LoadVT.getFixedSizeInBits()) {
3374 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3375                                                           : ISD::ZERO_EXTEND;
3376              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3377 }
3378 InVals.push_back(Elt);
3379 }
3380
3381 // Reset vector tracking state.
3382 VecIdx = -1;
3383 }
3384 ++InsIdx;
3385 }
3386 if (VTs.size() > 0)
3387 --InsIdx;
3388 continue;
3389 }
3390
3391 // Param has ByVal attribute
3392 // Return MoveParam(param symbol).
3393 // Ideally, the param symbol can be returned directly,
3394 // but when SDNode builder decides to use it in a CopyToReg(),
3395 // machine instruction fails because TargetExternalSymbol
3396 // (not lowered) is target dependent, and CopyToReg assumes
3397 // the source is lowered.
3398 EVT ObjectVT = getValueType(DL, Ty);
3399 assert(ObjectVT == Ins[InsIdx].VT &&
3400 "Ins type did not match function type");
3401 SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3402 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3403 if (p.getNode())
3404 p.getNode()->setIROrder(i + 1);
3405 InVals.push_back(p);
3406 }
3407
3408 if (!OutChains.empty())
3409 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3410
3411 return Chain;
3412}
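// Example: a kernel taking an int4 parameter forms one PVF_FIRST..PVF_LAST
// group in the loop above, so all four i32 elements are fetched with a single
// vector load from the param address space (typically selected to
// ld.param.v4.u32 [<kernel>_param_0]) and then extracted into separate values.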
3413
3414// Use byte-stores when the param address of the return value is unaligned.
3415// This may happen when the return value is a field of a packed structure.
3416static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
3417                                      uint64_t Offset, EVT ElementType,
3418 SDValue RetVal, const SDLoc &dl) {
3419 // Bit logic only works on integer types
3420 if (adjustElementType(ElementType))
3421 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);
3422
3423 // Store each byte
3424 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3425 // Shift the byte to the last byte position
3426 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
3427 DAG.getConstant(i * 8, dl, MVT::i32));
3428 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
3429 ShiftVal};
3430 // Trunc store only the last byte by using
3431 // st.param.b8
3432 // The register type can be larger than b8.
3433    Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
3434                                    DAG.getVTList(MVT::Other), StoreOperands,
3435                                    MVT::i8, MachinePointerInfo(), std::nullopt,
3436                                    MachineMemOperand::MOStore);
3437  }
3438 return Chain;
3439}
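// Example: an i32 return value placed at byte offset 2 of a packed struct has
// only 2-byte alignment, so the loop above emits four byte-wide StoreRetval
// nodes (st.param.b8 into the return parameter) at offsets 2..5, each storing
// RetVal >> (i * 8) truncated to 8 bits.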
3440
3441SDValue
3442NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3443                                 bool isVarArg,
3444                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
3445                                 const SmallVectorImpl<SDValue> &OutVals,
3446 const SDLoc &dl, SelectionDAG &DAG) const {
3447 const MachineFunction &MF = DAG.getMachineFunction();
3448 const Function &F = MF.getFunction();
3449  Type *RetTy = MF.getFunction().getReturnType();
3450
3451 bool isABI = (STI.getSmVersion() >= 20);
3452 assert(isABI && "Non-ABI compilation is not supported");
3453 if (!isABI)
3454 return Chain;
3455
3456 const DataLayout &DL = DAG.getDataLayout();
3457 SmallVector<SDValue, 16> PromotedOutVals;
3460 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
3461 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3462
3463 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3464 SDValue PromotedOutVal = OutVals[i];
3465 MVT PromotedVT;
3466 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
3467 VTs[i] = EVT(PromotedVT);
3468 }
3469 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
3471 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3472 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
3473 }
3474 PromotedOutVals.push_back(PromotedOutVal);
3475 }
3476
3477 auto VectorInfo = VectorizePTXValueVTs(
3478 VTs, Offsets,
3480 : Align(1));
3481
3482 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3483 // 32-bits are sign extended or zero extended, depending on whether
3484 // they are signed or unsigned types.
3485 bool ExtendIntegerRetVal =
3486 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3487
3488 SmallVector<SDValue, 6> StoreOperands;
3489 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3490 SDValue OutVal = OutVals[i];
3491 SDValue RetVal = PromotedOutVals[i];
3492
3493 if (ExtendIntegerRetVal) {
3494 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3495                                                   : ISD::ZERO_EXTEND,
3496                           dl, MVT::i32, RetVal);
3497 } else if (OutVal.getValueSizeInBits() < 16) {
3498 // Use 16-bit registers for small load-stores as it's the
3499 // smallest general purpose register size supported by NVPTX.
3500 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3501 }
3502
3503 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3504 // for a scalar store. In such cases, fall back to byte stores.
3505 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
3506 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3507 Align ElementTypeAlign =
3508 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
3509 Align ElementAlign =
3510 commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
3511 if (ElementAlign < ElementTypeAlign) {
3512 assert(StoreOperands.empty() && "Orphaned operand list.");
3513 Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
3514 RetVal, dl);
3515
3516 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3517 // into the graph, so just move on to the next element.
3518 continue;
3519 }
3520 }
3521
3522 // New load/store. Record chain and offset operands.
3523 if (VectorInfo[i] & PVF_FIRST) {
3524 assert(StoreOperands.empty() && "Orphaned operand list.");
3525 StoreOperands.push_back(Chain);
3526 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3527 }
3528
3529 // Record the value to return.
3530 StoreOperands.push_back(RetVal);
3531
3532 // That's the last element of this store op.
3533 if (VectorInfo[i] & PVF_LAST) {
3534      NVPTXISD::NodeType Op;
3535      unsigned NumElts = StoreOperands.size() - 2;
3536      switch (NumElts) {
3537      case 1:
3538        Op = NVPTXISD::StoreRetval;
3539        break;
3540      case 2:
3541        Op = NVPTXISD::StoreRetvalV2;
3542        break;
3543      case 4:
3544        Op = NVPTXISD::StoreRetvalV4;
3545        break;
3546 default:
3547 llvm_unreachable("Invalid vector info.");
3548 }
3549
3550 // Adjust type of load/store op if we've extended the scalar
3551 // return value.
3552 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3553 Chain = DAG.getMemIntrinsicNode(
3554          Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3555          MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3556 // Cleanup vector state.
3557 StoreOperands.clear();
3558 }
3559 }
3560
3561 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3562}
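// Example: returning a single float from a device function emits one
// StoreRetval (st.param.f32 [func_retval0+0]); a return value that is an
// under-aligned field of a packed struct instead goes through
// LowerUnalignedStoreRet above, one byte at a time.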
3563
3564void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3565    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3566 SelectionDAG &DAG) const {
3567 if (Constraint.size() > 1)
3568 return;
3569 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3570}
3571
3572static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3573 switch (Intrinsic) {
3574 default:
3575 return 0;
3576
3577 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3579 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3581 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3583 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3585 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3586 return NVPTXISD::Tex1DS32S32;
3587 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3589 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3591 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3593 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3594 return NVPTXISD::Tex1DU32S32;
3595 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3597 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3599 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3601
3602 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3604 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3606 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3608 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3610 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3612 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3614 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3616 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3618 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3620 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3622 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3624 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3626
3627 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3629 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3631 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3633 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3635 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3636 return NVPTXISD::Tex2DS32S32;
3637 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3639 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3641 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3643 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3644 return NVPTXISD::Tex2DU32S32;
3645 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3647 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3649 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3651
3652 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3654 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3656 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3658 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3660 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3662 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3664 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3666 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3668 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3670 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3672 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3674 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3676
3677 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3679 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3681 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3683 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3685 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3686 return NVPTXISD::Tex3DS32S32;
3687 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3689 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3691 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3693 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3694 return NVPTXISD::Tex3DU32S32;
3695 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3697 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3699 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3701
3702 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3704 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3706 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3708 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3710 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3712 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3714
3715 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3717 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3719 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3721 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3723 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3725 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3727
3728 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3730 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3732 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3734 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3736 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3738 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3740 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3742 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3744 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3746 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3748 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3750 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3752
3753 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3755 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3757 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3759 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3761 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3763 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3765 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3767 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3769 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3771 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3773 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3775 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3777
3778 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3780 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3782 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3784 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3786 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3788 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3790 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3792 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3794 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3796 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3798 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3800 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3802
3803 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3805 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3807 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3809 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3811 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3813 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3815 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3817 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3819 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3821 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3823 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3825 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3827
3828 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3830 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3832 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3834 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3836 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3838 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3840 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3842 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3844 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3846 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3848 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3850 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3852
3853 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3855 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3857 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3859 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3861 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3863 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3865 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3867 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3869 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3871 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3873 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3875 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3877
3878 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3880 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3882 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3884 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3886 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3888 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3890
3891 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3893 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3895 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3897 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3899 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3901 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3903
3904 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3906 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3908 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3910 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3912 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3914 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3916
3917 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3919 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3921 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3923 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3925 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3927 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3929 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3931 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3933 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3935 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3937 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3939 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3941 }
3942}
3943
3944static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3945 switch (Intrinsic) {
3946 default:
3947 return 0;
3948 case Intrinsic::nvvm_suld_1d_i8_clamp:
3950 case Intrinsic::nvvm_suld_1d_i16_clamp:
3952 case Intrinsic::nvvm_suld_1d_i32_clamp:
3954 case Intrinsic::nvvm_suld_1d_i64_clamp:
3956 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3958 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3960 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3962 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3964 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3966 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3968 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3970 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3972 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3974 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3976 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3978 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3980 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3982 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3984 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3986 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3988 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3990 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3992 case Intrinsic::nvvm_suld_2d_i8_clamp:
3994 case Intrinsic::nvvm_suld_2d_i16_clamp:
3996 case Intrinsic::nvvm_suld_2d_i32_clamp:
3998 case Intrinsic::nvvm_suld_2d_i64_clamp:
4000 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4002 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4004 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4006 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4008 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4010 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4012 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4014 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4016 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4018 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4020 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4022 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4024 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4026 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4028 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4030 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4032 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4034 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4036 case Intrinsic::nvvm_suld_3d_i8_clamp:
4038 case Intrinsic::nvvm_suld_3d_i16_clamp:
4040 case Intrinsic::nvvm_suld_3d_i32_clamp:
4042 case Intrinsic::nvvm_suld_3d_i64_clamp:
4044 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4046 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4048 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4050 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4052 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4054 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4056 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4058 case Intrinsic::nvvm_suld_1d_i8_trap:
4060 case Intrinsic::nvvm_suld_1d_i16_trap:
4062 case Intrinsic::nvvm_suld_1d_i32_trap:
4064 case Intrinsic::nvvm_suld_1d_i64_trap:
4066 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4068 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4070 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4072 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4074 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4076 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4078 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4080 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4082 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4084 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4086 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4088 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4090 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4092 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4094 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4096 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4098 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4100 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4102 case Intrinsic::nvvm_suld_2d_i8_trap:
4104 case Intrinsic::nvvm_suld_2d_i16_trap:
4106 case Intrinsic::nvvm_suld_2d_i32_trap:
4108 case Intrinsic::nvvm_suld_2d_i64_trap:
4110 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4112 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4114 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4116 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4118 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4120 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4122 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4124 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4126 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4128 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4130 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4132 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4134 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4136 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4138 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4140 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4142 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4144 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4146 case Intrinsic::nvvm_suld_3d_i8_trap:
4148 case Intrinsic::nvvm_suld_3d_i16_trap:
4150 case Intrinsic::nvvm_suld_3d_i32_trap:
4152 case Intrinsic::nvvm_suld_3d_i64_trap:
4154 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4156 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4158 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4160 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4162 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4164 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4166 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4168 case Intrinsic::nvvm_suld_1d_i8_zero:
4170 case Intrinsic::nvvm_suld_1d_i16_zero:
4172 case Intrinsic::nvvm_suld_1d_i32_zero:
4174 case Intrinsic::nvvm_suld_1d_i64_zero:
4176 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4178 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4180 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4182 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4184 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4186 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4188 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4190 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4192 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4194 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4196 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4198 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4200 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4202 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4204 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4206 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4208 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4210 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4212 case Intrinsic::nvvm_suld_2d_i8_zero:
4214 case Intrinsic::nvvm_suld_2d_i16_zero:
4216 case Intrinsic::nvvm_suld_2d_i32_zero:
4218 case Intrinsic::nvvm_suld_2d_i64_zero:
4220 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4222 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4224 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4226 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4228 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4230 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4232 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4234 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4236 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4238 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4240 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4242 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4244 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4246 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4248 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4250 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4252 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4254 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4256 case Intrinsic::nvvm_suld_3d_i8_zero:
4258 case Intrinsic::nvvm_suld_3d_i16_zero:
4260 case Intrinsic::nvvm_suld_3d_i32_zero:
4262 case Intrinsic::nvvm_suld_3d_i64_zero:
4264 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4266 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4268 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4270 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4272 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4274 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4276 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4278 }
4279}
4280
4281// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4282// TgtMemIntrinsic because we need the information that is only available
4283// in the "Value" type of the destination pointer. In particular, we need
4284// the address space information.
4286bool NVPTXTargetLowering::getTgtMemIntrinsic(
4287 IntrinsicInfo &Info, const CallInst &I,
4288 MachineFunction &MF, unsigned Intrinsic) const {
4289 switch (Intrinsic) {
4290 default:
4291 return false;
4292 case Intrinsic::nvvm_match_all_sync_i32p:
4293 case Intrinsic::nvvm_match_all_sync_i64p:
4295 // memVT is bogus. These intrinsics have the IntrInaccessibleMemOnly
4296 // attribute in order to model data exchange with other threads, but they
4297 // perform no real memory accesses.
4298 Info.memVT = MVT::i1;
4299
4300 // Our result depends on both our own and the other threads' arguments.
4302 return true;
4303 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4304 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4305 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4306 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4307 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4308 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4309 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4310 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4311 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4312 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4313 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4314 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4315 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4316 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4317 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4318 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4319 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4320 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4321 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4322 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4323 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4324 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4325 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4326 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4328 Info.memVT = MVT::v8f16;
4329 Info.ptrVal = I.getArgOperand(0);
4330 Info.offset = 0;
4332 Info.align = Align(16);
4333 return true;
4334 }
4335 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4336 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4337 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4338 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4339 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4340 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4341 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4342 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4343 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4344 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4345 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4346 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4347 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4348 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4349 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4350 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4351 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4352 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4353 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4354 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4355 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4356 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4357 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4358 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4360 Info.memVT = MVT::v2i32;
4361 Info.ptrVal = I.getArgOperand(0);
4362 Info.offset = 0;
4364 Info.align = Align(8);
4365 return true;
4366 }
4367
4368 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4369 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4370 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4371 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4372 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4373 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4374 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4375 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4376 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4377 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4378 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4379 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4380 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4381 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4382 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4383 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4384
4385 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4386 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4387 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4388 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4389 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4390 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4391 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4392 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4393 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4394 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4395 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4396 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4397 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4398 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4399 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4400 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4401 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4402 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4404 Info.memVT = MVT::v4i32;
4405 Info.ptrVal = I.getArgOperand(0);
4406 Info.offset = 0;
4408 Info.align = Align(16);
4409 return true;
4410 }
4411
4412 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4413 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4414 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4415 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4416 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4417 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4418 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4419 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4420
4421 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4422 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4423 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4424 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4425 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4426 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4427 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4428 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4429 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4430 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4431 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4432 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4433 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4434 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4435 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4436 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4437 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4438 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4439 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4440 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4441 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4442 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4444 Info.memVT = MVT::i32;
4445 Info.ptrVal = I.getArgOperand(0);
4446 Info.offset = 0;
4448 Info.align = Align(4);
4449 return true;
4450 }
4451
4452 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4453 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4454 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4455 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4456 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4457 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4458 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4459 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4460 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4461 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4462 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4463 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4465 Info.memVT = MVT::v4f16;
4466 Info.ptrVal = I.getArgOperand(0);
4467 Info.offset = 0;
4469 Info.align = Align(16);
4470 return true;
4471 }
4472
4473 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4474 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4475 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4476 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4477 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4478 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4479 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4480 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4481 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4482 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4483 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4484 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4485 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4486 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4487 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4488 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4490 Info.memVT = MVT::v8f32;
4491 Info.ptrVal = I.getArgOperand(0);
4492 Info.offset = 0;
4494 Info.align = Align(16);
4495 return true;
4496 }
4497
4498 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4499 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4500 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4501 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4502
4503 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4504 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4505 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4506 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4507
4508 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4509 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4510 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4511 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4512 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4513 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4514 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4515 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4516 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4517 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4518 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4519 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4521 Info.memVT = MVT::v8i32;
4522 Info.ptrVal = I.getArgOperand(0);
4523 Info.offset = 0;
4525 Info.align = Align(16);
4526 return true;
4527 }
4528
4529 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4530 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4531 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4532 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4533 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4534 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4535 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4536 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4537 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4538 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4540 Info.memVT = MVT::v2i32;
4541 Info.ptrVal = I.getArgOperand(0);
4542 Info.offset = 0;
4544 Info.align = Align(8);
4545 return true;
4546 }
4547
4548 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4549 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4550 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4551 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4552
4553 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4554 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4555 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4556 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4558 Info.memVT = MVT::f64;
4559 Info.ptrVal = I.getArgOperand(0);
4560 Info.offset = 0;
4562 Info.align = Align(8);
4563 return true;
4564 }
4565
4566 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4567 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4568 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4569 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4571 Info.memVT = MVT::v2f64;
4572 Info.ptrVal = I.getArgOperand(0);
4573 Info.offset = 0;
4575 Info.align = Align(16);
4576 return true;
4577 }
4578
4579 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4580 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4581 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4582 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4583 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4584 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4585 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4586 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4587 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4588 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4589 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4590 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4592 Info.memVT = MVT::v4f16;
4593 Info.ptrVal = I.getArgOperand(0);
4594 Info.offset = 0;
4596 Info.align = Align(16);
4597 return true;
4598 }
4599
4600 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4601 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4602 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4603 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4604 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4605 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4606 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4607 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4608 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4609 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4610 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4611 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4612 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4613 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4614 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4615 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4617 Info.memVT = MVT::v8f32;
4618 Info.ptrVal = I.getArgOperand(0);
4619 Info.offset = 0;
4621 Info.align = Align(16);
4622 return true;
4623 }
4624
4625 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4626 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4627 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4628 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4629 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4630 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4631 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4632 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4633 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4634 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4635 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4636 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4638 Info.memVT = MVT::v8i32;
4639 Info.ptrVal = I.getArgOperand(0);
4640 Info.offset = 0;
4642 Info.align = Align(16);
4643 return true;
4644 }
4645
4646 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4647 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4648 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4649 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4650 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4651 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4652 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4653 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4655 Info.memVT = MVT::v2i32;
4656 Info.ptrVal = I.getArgOperand(0);
4657 Info.offset = 0;
4659 Info.align = Align(8);
4660 return true;
4661 }
4662
4663 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4664 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4665 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4666 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4668 Info.memVT = MVT::v2f64;
4669 Info.ptrVal = I.getArgOperand(0);
4670 Info.offset = 0;
4672 Info.align = Align(16);
4673 return true;
4674 }
4675
4676 case Intrinsic::nvvm_atomic_load_inc_32:
4677 case Intrinsic::nvvm_atomic_load_dec_32:
4678
4679 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4680 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4681 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4682 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4683 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4684 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4685 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4686 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4687 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4688 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4689 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4690 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4691 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4692 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4693 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4694 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4695 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4696 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4697 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4698 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4699 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4700 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4701 auto &DL = I.getDataLayout();
4703 Info.memVT = getValueType(DL, I.getType());
4704 Info.ptrVal = I.getArgOperand(0);
4705 Info.offset = 0;
4707 Info.align.reset();
4708 return true;
4709 }
4710
4711 case Intrinsic::nvvm_ldu_global_i:
4712 case Intrinsic::nvvm_ldu_global_f:
4713 case Intrinsic::nvvm_ldu_global_p: {
4714 auto &DL = I.getDataLayout();
4716 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4717 Info.memVT = getValueType(DL, I.getType());
4718 else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
4719 Info.memVT = getPointerTy(DL);
4720 else
4721 Info.memVT = getValueType(DL, I.getType());
4722 Info.ptrVal = I.getArgOperand(0);
4723 Info.offset = 0;
4725 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4726
4727 return true;
4728 }
4729 case Intrinsic::nvvm_ldg_global_i:
4730 case Intrinsic::nvvm_ldg_global_f:
4731 case Intrinsic::nvvm_ldg_global_p: {
4732 auto &DL = I.getDataLayout();
4733
4735 if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4736 Info.memVT = getValueType(DL, I.getType());
4738 else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
4738 Info.memVT = getPointerTy(DL);
4739 else
4740 Info.memVT = getValueType(DL, I.getType());
4741 Info.ptrVal = I.getArgOperand(0);
4742 Info.offset = 0;
4744 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4745
4746 return true;
4747 }
4748
4749 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4750 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4751 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4752 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4753 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4754 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4755 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4756 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4757 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4758 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4759 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4760 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4761 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4762 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4763 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4764 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4765 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4766 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4767 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4768 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4769 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4770 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4771 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4772 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4773 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4774 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4775 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4776 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4777 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4778 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4779 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4780 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4781 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4782 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4783 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4784 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4785 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4786 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4787 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4788 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4789 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4790 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4791 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4792 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4793 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4794 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4795 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4796 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4797 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4798 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4799 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4800 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4801 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4802 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4803 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4804 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4805 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4806 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4807 Info.opc = getOpcForTextureInstr(Intrinsic);
4808 Info.memVT = MVT::v4f32;
4809 Info.ptrVal = nullptr;
4810 Info.offset = 0;
4812 Info.align = Align(16);
4813 return true;
4814
4815 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4816 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4817 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4818 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4819 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4820 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4821 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4822 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4823 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4824 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4825 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4826 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4827 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4828 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4829 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4830 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4831 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4832 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4833 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4834 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4835 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4836 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4837 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4838 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4839 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4840 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4841 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4842 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4843 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4844 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4845 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4846 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4847 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4848 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4849 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4850 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4851 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4852 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4853 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4854 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4855 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4856 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4857 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4858 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4859 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4860 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4861 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4862 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4863 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4864 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4865 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4866 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4867 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4868 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4869 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4870 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4871 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4872 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4873 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4874 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4875 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4876 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4877 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4878 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4879 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4880 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4881 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4882 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4883 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4884 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4885 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4886 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4887 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4888 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4889 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4890 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4891 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4892 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4893 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4894 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4895 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4896 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4897 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4898 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4899 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4900 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4901 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4902 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4903 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4904 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4905 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4906 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4907 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4908 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4909 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4910 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4911 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4912 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4913 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4914 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4915 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4916 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4917 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4918 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4919 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4920 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4921 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4922 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4923 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4924 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4925 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4926 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4927 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4928 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4929 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4930 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4931 Info.opc = getOpcForTextureInstr(Intrinsic);
4932 Info.memVT = MVT::v4i32;
4933 Info.ptrVal = nullptr;
4934 Info.offset = 0;
4936 Info.align = Align(16);
4937 return true;
4938
4939 case Intrinsic::nvvm_suld_1d_i8_clamp:
4940 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4941 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4942 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4943 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4944 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4945 case Intrinsic::nvvm_suld_2d_i8_clamp:
4946 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4947 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4948 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4949 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4950 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4951 case Intrinsic::nvvm_suld_3d_i8_clamp:
4952 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4953 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4954 case Intrinsic::nvvm_suld_1d_i8_trap:
4955 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4956 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4957 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4958 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4959 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4960 case Intrinsic::nvvm_suld_2d_i8_trap:
4961 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4962 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4963 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4964 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4965 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4966 case Intrinsic::nvvm_suld_3d_i8_trap:
4967 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4968 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4969 case Intrinsic::nvvm_suld_1d_i8_zero:
4970 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4971 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4972 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4973 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4974 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4975 case Intrinsic::nvvm_suld_2d_i8_zero:
4976 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4977 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4978 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4979 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4980 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4981 case Intrinsic::nvvm_suld_3d_i8_zero:
4982 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4983 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4984 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4985 Info.memVT = MVT::i8;
4986 Info.ptrVal = nullptr;
4987 Info.offset = 0;
4989 Info.align = Align(16);
4990 return true;
4991
4992 case Intrinsic::nvvm_suld_1d_i16_clamp:
4993 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4994 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4995 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4996 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4997 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4998 case Intrinsic::nvvm_suld_2d_i16_clamp:
4999 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
5000 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
5001 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
5002 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
5003 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
5004 case Intrinsic::nvvm_suld_3d_i16_clamp:
5005 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
5006 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
5007 case Intrinsic::nvvm_suld_1d_i16_trap:
5008 case Intrinsic::nvvm_suld_1d_v2i16_trap:
5009 case Intrinsic::nvvm_suld_1d_v4i16_trap:
5010 case Intrinsic::nvvm_suld_1d_array_i16_trap:
5011 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
5012 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
5013 case Intrinsic::nvvm_suld_2d_i16_trap:
5014 case Intrinsic::nvvm_suld_2d_v2i16_trap:
5015 case Intrinsic::nvvm_suld_2d_v4i16_trap:
5016 case Intrinsic::nvvm_suld_2d_array_i16_trap:
5017 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
5018 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
5019 case Intrinsic::nvvm_suld_3d_i16_trap:
5020 case Intrinsic::nvvm_suld_3d_v2i16_trap:
5021 case Intrinsic::nvvm_suld_3d_v4i16_trap:
5022 case Intrinsic::nvvm_suld_1d_i16_zero:
5023 case Intrinsic::nvvm_suld_1d_v2i16_zero:
5024 case Intrinsic::nvvm_suld_1d_v4i16_zero:
5025 case Intrinsic::nvvm_suld_1d_array_i16_zero:
5026 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
5027 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
5028 case Intrinsic::nvvm_suld_2d_i16_zero:
5029 case Intrinsic::nvvm_suld_2d_v2i16_zero:
5030 case Intrinsic::nvvm_suld_2d_v4i16_zero:
5031 case Intrinsic::nvvm_suld_2d_array_i16_zero:
5032 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
5033 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
5034 case Intrinsic::nvvm_suld_3d_i16_zero:
5035 case Intrinsic::nvvm_suld_3d_v2i16_zero:
5036 case Intrinsic::nvvm_suld_3d_v4i16_zero:
5037 Info.opc = getOpcForSurfaceInstr(Intrinsic);
5038 Info.memVT = MVT::i16;
5039 Info.ptrVal = nullptr;
5040 Info.offset = 0;
5042 Info.align = Align(16);
5043 return true;
5044
5045 case Intrinsic::nvvm_suld_1d_i32_clamp:
5046 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
5047 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
5048 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
5049 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
5050 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
5051 case Intrinsic::nvvm_suld_2d_i32_clamp:
5052 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
5053 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
5054 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
5055 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
5056 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
5057 case Intrinsic::nvvm_suld_3d_i32_clamp:
5058 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
5059 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
5060 case Intrinsic::nvvm_suld_1d_i32_trap:
5061 case Intrinsic::nvvm_suld_1d_v2i32_trap:
5062 case Intrinsic::nvvm_suld_1d_v4i32_trap:
5063 case Intrinsic::nvvm_suld_1d_array_i32_trap:
5064 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
5065 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
5066 case Intrinsic::nvvm_suld_2d_i32_trap:
5067 case Intrinsic::nvvm_suld_2d_v2i32_trap:
5068 case Intrinsic::nvvm_suld_2d_v4i32_trap:
5069 case Intrinsic::nvvm_suld_2d_array_i32_trap:
5070 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
5071 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
5072 case Intrinsic::nvvm_suld_3d_i32_trap:
5073 case Intrinsic::nvvm_suld_3d_v2i32_trap:
5074 case Intrinsic::nvvm_suld_3d_v4i32_trap:
5075 case Intrinsic::nvvm_suld_1d_i32_zero:
5076 case Intrinsic::nvvm_suld_1d_v2i32_zero:
5077 case Intrinsic::nvvm_suld_1d_v4i32_zero:
5078 case Intrinsic::nvvm_suld_1d_array_i32_zero:
5079 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
5080 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
5081 case Intrinsic::nvvm_suld_2d_i32_zero:
5082 case Intrinsic::nvvm_suld_2d_v2i32_zero:
5083 case Intrinsic::nvvm_suld_2d_v4i32_zero:
5084 case Intrinsic::nvvm_suld_2d_array_i32_zero:
5085 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
5086 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
5087 case Intrinsic::nvvm_suld_3d_i32_zero:
5088 case Intrinsic::nvvm_suld_3d_v2i32_zero:
5089 case Intrinsic::nvvm_suld_3d_v4i32_zero:
5090 Info.opc = getOpcForSurfaceInstr(Intrinsic);
5091 Info.memVT = MVT::i32;
5092 Info.ptrVal = nullptr;
5093 Info.offset = 0;
5095 Info.align = Align(16);
5096 return true;
5097
5098 case Intrinsic::nvvm_suld_1d_i64_clamp:
5099 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
5100 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
5101 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
5102 case Intrinsic::nvvm_suld_2d_i64_clamp:
5103 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
5104 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
5105 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5106 case Intrinsic::nvvm_suld_3d_i64_clamp:
5107 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5108 case Intrinsic::nvvm_suld_1d_i64_trap:
5109 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5110 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5111 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5112 case Intrinsic::nvvm_suld_2d_i64_trap:
5113 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5114 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5115 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5116 case Intrinsic::nvvm_suld_3d_i64_trap:
5117 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5118 case Intrinsic::nvvm_suld_1d_i64_zero:
5119 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5120 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5121 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5122 case Intrinsic::nvvm_suld_2d_i64_zero:
5123 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5124 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5125 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5126 case Intrinsic::nvvm_suld_3d_i64_zero:
5127 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5128 Info.opc = getOpcForSurfaceInstr(Intrinsic);
5129 Info.memVT = MVT::i64;
5130 Info.ptrVal = nullptr;
5131 Info.offset = 0;
5133 Info.align = Align(16);
5134 return true;
5135 }
5136 return false;
5137}
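For orientation, here is a worked example of what this hook reports (an editorial annotation, not part of the LLVM source): the case labels above group intrinsics by the memory shape they touch, and one of the f16 WMMA loads ends up described roughly as follows.

// Illustrative sketch only, mirroring the visible assignments for
// Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col above:
//   Info.memVT  = MVT::v8f16;          // per-thread fragment of 8 x f16
//   Info.ptrVal = I.getArgOperand(0);  // pointer the fragment is loaded from
//   Info.offset = 0;
//   Info.align  = Align(16);           // fragments are 16-byte aligned
// i.e. SelectionDAG treats the intrinsic as a 16-byte-aligned v8f16 access.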
5138
5139/// getFunctionParamOptimizedAlign - Since function arguments are passed via
5140/// .param space, we may want to increase their alignment in a way that
5141/// ensures that we can effectively vectorize their loads & stores. We can
5142/// increase alignment only if the function has internal or private
5143/// linkage, as for other linkage types callers may already rely on the
5144/// default alignment. To allow using 128-bit vectorized loads/stores, this
5145/// function ensures that alignment is 16 or greater.
5147 const Function *F, Type *ArgTy, const DataLayout &DL) const {
5148 // Capping the alignment to 128 bytes as that is the maximum alignment
5149 // supported by PTX.
5150 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5151
5152 // If a function has linkage different from internal or private, we
5153 // must use the default ABI alignment, as external users rely on it. The
5154 // same holds for a function that may be called through a function pointer.
5155 if (!F || !F->hasLocalLinkage() ||
5156 F->hasAddressTaken(/*Users=*/nullptr,
5157 /*IgnoreCallbackUses=*/false,
5158 /*IgnoreAssumeLikeCalls=*/true,
5159 /*IgnoreLLVMUsed=*/true))
5160 return ABITypeAlign;
5161
5162 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5163 return std::max(Align(16), ABITypeAlign);
5164}
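A small worked example of the rule above (editorial annotation; assumes a typical 64-bit NVPTX data layout):

// - Internal (local-linkage) function whose address is not taken, i32
//   argument: ABI alignment is 4, so the returned alignment is
//   max(16, 4) = 16, which permits 128-bit vectorized parameter accesses.
// - Externally visible function with the same argument: the ABI alignment
//   of 4 is returned unchanged, because callers may rely on it.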
5165
5166/// Helper for computing alignment of a device function byval parameter.
5168 const Function *F, Type *ArgTy, Align InitialAlign,
5169 const DataLayout &DL) const {
5170 Align ArgAlign = InitialAlign;
5171 // Try to increase alignment to enhance vectorization options.
5172 if (F)
5173 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5174
5175 // Old ptxas versions have a bug: when PTX code takes the address of a
5176 // byval parameter with alignment < 4, ptxas generates code to spill the
5177 // argument into memory, and on sm_50+ it emits SASS code that fails with
5178 // a misaligned access. To work around the problem, make sure that byval
5179 // parameters are aligned to at least 4. This bug seems to be fixed
5180 // starting from ptxas > 9.0.
5181 // TODO: remove this after verifying that the bug does not reproduce on
5182 // non-deprecated ptxas versions.
5185 ArgAlign = std::max(ArgAlign, Align(4));
5186
5187 return ArgAlign;
5188}
5189
5190// Helper for getting a function parameter name. The name is composed from
5191// its index and the function name. A negative index corresponds to the
5192// special parameter (unsized array) used for passing variable arguments.
5194 int Idx) const {
5195 std::string ParamName;
5196 raw_string_ostream ParamStr(ParamName);
5197
5198 ParamStr << getTargetMachine().getSymbol(F)->getName();
5199 if (Idx < 0)
5200 ParamStr << "_vararg";
5201 else
5202 ParamStr << "_param_" << Idx;
5203
5204 return ParamName;
5205}
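To make the naming scheme concrete, here is a hedged sketch of the names produced for a hypothetical function whose symbol is "foo":

//   getParamName(F, 0)  -> "foo_param_0"
//   getParamName(F, 3)  -> "foo_param_3"
//   getParamName(F, -1) -> "foo_vararg"   // unsized array used for varargs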
5206
5207/// isLegalAddressingMode - Return true if the addressing mode represented
5208/// by AM is legal for this target, for a load/store of the specified type.
5209/// Used to guide target specific optimizations, like loop strength reduction
5210/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5211/// (CodeGenPrepare.cpp)
5213 const AddrMode &AM, Type *Ty,
5214 unsigned AS, Instruction *I) const {
5215 // AddrMode - This represents an addressing mode of:
5216 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5217 //
5218 // The legal address modes are
5219 // - [avar]
5220 // - [areg]
5221 // - [areg+immoff]
5222 // - [immAddr]
5223
5224 // immoff must fit in a signed 32-bit int
5225 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5226 return false;
5227
5228 if (AM.BaseGV)
5229 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5230
5231 switch (AM.Scale) {
5232 case 0: // "r", "r+i" or "i" is allowed
5233 break;
5234 case 1:
5235 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5236 return false;
5237 // Otherwise we have r+i.
5238 break;
5239 default:
5240 // No scale > 1 is allowed
5241 return false;
5242 }
5243 return true;
5244}
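As a quick reference (editorial annotation), the checks above classify typical addressing modes as follows:

//   BaseGV only                              -> legal   ([avar])
//   BaseReg only (Scale == 0)                -> legal   ([areg])
//   BaseReg + immediate fitting in i32       -> legal   ([areg+immoff])
//   Immediate only                           -> legal   ([immAddr])
//   BaseGV combined with an offset or a reg  -> illegal
//   Reg + reg (Scale == 1 with a base reg)   -> illegal
//   Any Scale > 1                            -> illegal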
5245
5246//===----------------------------------------------------------------------===//
5247// NVPTX Inline Assembly Support
5248//===----------------------------------------------------------------------===//
5249
5250/// getConstraintType - Given a constraint letter, return the type of
5251/// constraint it is for this target.
5254 if (Constraint.size() == 1) {
5255 switch (Constraint[0]) {
5256 default:
5257 break;
5258 case 'b':
5259 case 'r':
5260 case 'h':
5261 case 'c':
5262 case 'l':
5263 case 'f':
5264 case 'd':
5265 case 'q':
5266 case '0':
5267 case 'N':
5268 return C_RegisterClass;
5269 }
5270 }
5271 return TargetLowering::getConstraintType(Constraint);
5272}
5273
5274std::pair<unsigned, const TargetRegisterClass *>
5276 StringRef Constraint,
5277 MVT VT) const {
5278 if (Constraint.size() == 1) {
5279 switch (Constraint[0]) {
5280 case 'b':
5281 return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5282 case 'c':
5283 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5284 case 'h':
5285 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5286 case 'r':
5287 return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5288 case 'l':
5289 case 'N':
5290 return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5291 case 'q': {
5292 if (STI.getSmVersion() < 70)
5293 report_fatal_error("Inline asm with 128 bit operands is only "
5294 "supported for sm_70 and higher!");
5295 return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
5296 }
5297 case 'f':
5298 return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5299 case 'd':
5300 return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5301 }
5302 }
5303 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5304}
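For context (editorial annotation), the letters handled above correspond to NVPTX register classes, and a hypothetical CUDA-level use of the common 'r' constraint might look like this:

//   'b' -> Int1Regs, 'c'/'h' -> Int16Regs, 'r' -> Int32Regs,
//   'l'/'N' -> Int64Regs, 'q' -> Int128Regs (sm_70+),
//   'f' -> Float32Regs, 'd' -> Float64Regs.
//
//   int d, a = 1, b = 2;
//   asm("add.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));  // illustrative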
5305
5306//===----------------------------------------------------------------------===//
5307// NVPTX DAG Combining
5308//===----------------------------------------------------------------------===//
5309
5311 CodeGenOptLevel OptLevel) const {
5312 // Always honor command-line argument
5313 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5314 return FMAContractLevelOpt > 0;
5315
5316 // Do not contract if we're not optimizing the code.
5317 if (OptLevel == CodeGenOptLevel::None)
5318 return false;
5319
5320 // Honor TargetOptions flags that explicitly say fusion is okay.
5322 return true;
5323
5324 return allowUnsafeFPMath(MF);
5325}
5326
5328 // Honor TargetOptions flags that explicitly say unsafe math is okay.
5330 return true;
5331
5332 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5333 const Function &F = MF.getFunction();
5334 return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
5335}
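A brief illustration of how the attribute check above is driven from the IR (editorial annotation; the attribute group below is a hypothetical example):

//   attributes #0 = { "unsafe-fp-math"="true" }
// A function carrying this attribute makes allowUnsafeFPMath() return true
// even when TargetOptions.UnsafeFPMath is off, which in turn lets allowFMA()
// contract fmul+fadd into fma when optimization is enabled.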
5336
5337static bool isConstZero(const SDValue &Operand) {
5338 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5339 return Const && Const->getZExtValue() == 0;
5340}
5341
5342/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5343/// operands N0 and N1. This is a helper for PerformADDCombine that is
5344/// called with the default operands, and if that fails, with commuted
5345/// operands.
5346static SDValue
5349 EVT VT = N0.getValueType();
5350
5351 // Since integer multiply-add costs the same as integer multiply
5352 // but is more costly than integer add, do the fusion only when
5353 // the mul is only used in the add.
5354 // TODO: this may not be true for later architectures, consider relaxing this
5355 if (!N0.getNode()->hasOneUse())
5356 return SDValue();
5357
5358 // fold (add (mul a, b), c) -> (mad a, b, c)
5359 //
5360 if (N0.getOpcode() == ISD::MUL)
5361 return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
5362 N0.getOperand(1), N1);
5363
5364 // fold (add (select cond, 0, (mul a, b)), c)
5365 // -> (select cond, c, (mad a, b, c))
5366 //
5367 if (N0.getOpcode() == ISD::SELECT) {
5368 unsigned ZeroOpNum;
5369 if (isConstZero(N0->getOperand(1)))
5370 ZeroOpNum = 1;
5371 else if (isConstZero(N0->getOperand(2)))
5372 ZeroOpNum = 2;
5373 else
5374 return SDValue();
5375
5376 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5377 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5378 return SDValue();
5379
5380 SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
5381 M->getOperand(0), M->getOperand(1), N1);
5382 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5383 ((ZeroOpNum == 1) ? N1 : MAD),
5384 ((ZeroOpNum == 1) ? MAD : N1));
5385 }
5386
5387 return SDValue();
5388}
5389
5390static SDValue
5393 CodeGenOptLevel OptLevel) {
5394 EVT VT = N0.getValueType();
5395 if (N0.getOpcode() == ISD::FMUL) {
5396 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5397 &DCI.DAG.getTargetLoweringInfo());
5398 if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
5399 return SDValue();
5400
5401 // For floating point:
5402 // Do the fusion only when the mul has fewer than 5 uses and all of
5403 // them are adds.
5404 // The heuristic is that if a use is not an add, then that use cannot
5405 // be fused into an fma, so the mul is still needed anyway.
5406 // If there are more than 4 uses, even if they are all adds, fusing
5407 // them will increase register pressure.
5408 //
5409 int numUses = 0;
5410 int nonAddCount = 0;
5411 for (const SDNode *User : N0.getNode()->uses()) {
5412 numUses++;
5413 if (User->getOpcode() != ISD::FADD)
5414 ++nonAddCount;
5415 if (numUses >= 5)
5416 return SDValue();
5417 }
5418 if (nonAddCount) {
5419 int orderNo = N->getIROrder();
5420 int orderNo2 = N0.getNode()->getIROrder();
5421 // Simple heuristic here for considering potential register
5422 // pressure: the IR-order difference is used to measure the distance
5423 // between def and use; the longer the distance, the more likely it is
5424 // to cause register pressure.
5425 if (orderNo - orderNo2 < 500)
5426 return SDValue();
5427
5428 // Now, check if at least one of the FMUL's operands is live beyond the
5429 // node N, which guarantees that the FMA will not increase register
5430 // pressure at node N.
5431 bool opIsLive = false;
5432 const SDNode *left = N0.getOperand(0).getNode();
5433 const SDNode *right = N0.getOperand(1).getNode();
5434
5435 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5436 opIsLive = true;
5437
5438 if (!opIsLive)
5439 for (const SDNode *User : left->uses()) {
5440 int orderNo3 = User->getIROrder();
5441 if (orderNo3 > orderNo) {
5442 opIsLive = true;
5443 break;
5444 }
5445 }
5446
5447 if (!opIsLive)
5448 for (const SDNode *User : right->uses()) {
5449 int orderNo3 = User->getIROrder();
5450 if (orderNo3 > orderNo) {
5451 opIsLive = true;
5452 break;
5453 }
5454 }
5455
5456 if (!opIsLive)
5457 return SDValue();
5458 }
5459
5460 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5461 N0.getOperand(1), N1);
5462 }
5463
5464 return SDValue();
5465}
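// Illustrative example (editorial, not part of the original source): with
// FMA contraction allowed, the pair
//   %t = fmul float %x, %y
//   %r = fadd float %t, %z
// becomes ISD::FMA(%x, %y, %z), i.e. a single fma.rn.f32 instruction. The
// use-count and IR-order checks above are heuristics to skip the fold when
// the FMUL must stay live anyway and the overlap would only add register
// pressure.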
5466
5467static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
5468 std::size_t Back) {
5469 if (all_of(N->ops().drop_front(Front).drop_back(Back),
5470 [](const SDUse &U) { return U.get()->isUndef(); }))
5471 // Operand 0 is the previous value in the chain. Cannot return EntryToken
5472 // as the previous value will become unused and eliminated later.
5473 return N->getOperand(0);
5474
5475 return SDValue();
5476}
5477
5478static SDValue PerformStoreParamCombine(SDNode *N) {
5479 // Operands from the 3rd to the 2nd last one are the values to be stored.
5480 // {Chain, ArgID, Offset, Val, Glue}
5481 return PerformStoreCombineHelper(N, 3, 1);
5482}
5483
5484static SDValue PerformStoreRetvalCombine(SDNode *N) {
5485 // Operands from the 2nd to the last one are the values to be stored.
5486 return PerformStoreCombineHelper(N, 2, 0);
5487}
5488
5489/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5490///
5491static SDValue PerformADDCombine(SDNode *N,
5492 TargetLowering::DAGCombinerInfo &DCI,
5493 CodeGenOptLevel OptLevel) {
5494 if (OptLevel == CodeGenOptLevel::None)
5495 return SDValue();
5496
5497 SDValue N0 = N->getOperand(0);
5498 SDValue N1 = N->getOperand(1);
5499
5500 // Skip non-integer, non-scalar case
5501 EVT VT = N0.getValueType();
5502 if (VT.isVector() || VT != MVT::i32)
5503 return SDValue();
5504
5505 // First try with the default operand order.
5506 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5507 return Result;
5508
5509 // If that didn't work, try again with the operands commuted.
5510 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5511}
5512
5513/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5514///
5515static SDValue PerformFADDCombine(SDNode *N,
5516 TargetLowering::DAGCombinerInfo &DCI,
5517 CodeGenOptLevel OptLevel) {
5518 SDValue N0 = N->getOperand(0);
5519 SDValue N1 = N->getOperand(1);
5520
5521 EVT VT = N0.getValueType();
5522 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5523 return SDValue();
5524
5525 // First try with the default operand order.
5526 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5527 return Result;
5528
5529 // If that didn't work, try again with the operands commuted.
5530 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5531}
5532
5533static SDValue PerformANDCombine(SDNode *N,
5534 TargetLowering::DAGCombinerInfo &DCI) {
5535 // The type legalizer turns a vector load of i8 values into a zextload to i16
5536 // registers, optionally ANY_EXTENDs it (if target type is integer),
5537 // and ANDs off the high 8 bits. Since we turn this load into a
5538 // target-specific DAG node, the DAG combiner fails to eliminate these AND
5539 // nodes. Do that here.
5540 SDValue Val = N->getOperand(0);
5541 SDValue Mask = N->getOperand(1);
5542
5543 if (isa<ConstantSDNode>(Val)) {
5544 std::swap(Val, Mask);
5545 }
5546
5547 SDValue AExt;
5548
5549 // Convert BFE-> truncate i16 -> and 255
5550 // To just BFE-> truncate i16, as the value already has all the bits in the
5551 // right places.
5552 if (Val.getOpcode() == ISD::TRUNCATE) {
5553 SDValue BFE = Val.getOperand(0);
5554 if (BFE.getOpcode() != NVPTXISD::BFE)
5555 return SDValue();
5556
5557 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
5558 if (!BFEBits)
5559 return SDValue();
5560 uint64_t BFEBitsVal = BFEBits->getZExtValue();
5561
5562 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5563 if (!MaskCnst) {
5564 // Not an AND with a constant
5565 return SDValue();
5566 }
5567 uint64_t MaskVal = MaskCnst->getZExtValue();
5568
5569 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5570 return SDValue();
5571 // If we get here, the AND is unnecessary. Just replace it with the trunc
5572 DCI.CombineTo(N, Val, false);
5573 }
5574 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5575 if (Val.getOpcode() == ISD::ANY_EXTEND) {
5576 AExt = Val;
5577 Val = Val->getOperand(0);
5578 }
5579
5580 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5581 Val = Val->getOperand(0);
5582 }
5583
5584 if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5585 Val->getOpcode() == NVPTXISD::LoadV4) {
5586 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
5587 if (!MaskCnst) {
5588 // Not an AND with a constant
5589 return SDValue();
5590 }
5591
5592 uint64_t MaskVal = MaskCnst->getZExtValue();
5593 if (MaskVal != 0xff) {
5594 // Not an AND that chops off top 8 bits
5595 return SDValue();
5596 }
5597
5598 MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5599 if (!Mem) {
5600 // Not a MemSDNode?!?
5601 return SDValue();
5602 }
5603
5604 EVT MemVT = Mem->getMemoryVT();
5605 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5606 // We only handle the i8 case
5607 return SDValue();
5608 }
5609
5610 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
5611 if (ExtType == ISD::SEXTLOAD) {
5612 // If for some reason the load is a sextload, the and is needed to zero
5613 // out the high 8 bits
5614 return SDValue();
5615 }
5616
5617 bool AddTo = false;
5618 if (AExt.getNode() != nullptr) {
5619 // Re-insert the ext as a zext.
5620 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
5621 AExt.getValueType(), Val);
5622 AddTo = true;
5623 }
5624
5625 // If we get here, the AND is unnecessary. Just replace it with the load
5626 DCI.CombineTo(N, Val, AddTo);
5627 }
5628
5629 return SDValue();
5630}
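// Illustrative example (editorial, not part of the original source): after
// type legalization an element of a <4 x i8> load can appear as
//   (and (any_extend (LoadV4 ..., i8)), 255)
// Because the target load already zero-fills the high bits, the AND is
// redundant and is replaced here with the (zero-extended) load result.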
5631
5632static SDValue PerformREMCombine(SDNode *N,
5633 TargetLowering::DAGCombinerInfo &DCI,
5634 CodeGenOptLevel OptLevel) {
5635 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5636
5637 // Don't do anything at less than -O2.
5638 if (OptLevel < CodeGenOptLevel::Default)
5639 return SDValue();
5640
5641 SelectionDAG &DAG = DCI.DAG;
5642 SDLoc DL(N);
5643 EVT VT = N->getValueType(0);
5644 bool IsSigned = N->getOpcode() == ISD::SREM;
5645 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5646
5647 const SDValue &Num = N->getOperand(0);
5648 const SDValue &Den = N->getOperand(1);
5649
5650 for (const SDNode *U : Num->uses()) {
5651 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5652 U->getOperand(1) == Den) {
5653 // Num % Den -> Num - (Num / Den) * Den
5654 return DAG.getNode(ISD::SUB, DL, VT, Num,
5655 DAG.getNode(ISD::MUL, DL, VT,
5656 DAG.getNode(DivOpc, DL, VT, Num, Den),
5657 Den));
5658 }
5659 }
5660 return SDValue();
5661}
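// Illustrative example (editorial, not part of the original source): when
// both
//   %q = udiv i32 %n, %d
//   %r = urem i32 %n, %d
// are present, the rem is rewritten as %n - (%q * %d), so a single divide
// is emitted and the remainder only costs an extra mul and sub.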
5662
5663enum OperandSignedness {
5664 Signed = 0,
5665 Unsigned,
5666 Unknown
5667};
5668
5669/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5670/// that can be demoted to \p OptSize bits without loss of information. The
5671/// signedness of the operand, if determinable, is placed in \p S.
5672static bool IsMulWideOperandDemotable(SDValue Op,
5673 unsigned OptSize,
5674 OperandSignedness &S) {
5675 S = Unknown;
5676
5677 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5678 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5679 EVT OrigVT = Op.getOperand(0).getValueType();
5680 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5681 S = Signed;
5682 return true;
5683 }
5684 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5685 EVT OrigVT = Op.getOperand(0).getValueType();
5686 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5687 S = Unsigned;
5688 return true;
5689 }
5690 }
5691
5692 return false;
5693}
5694
5695/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5696/// be demoted to \p OptSize bits without loss of information. If the operands
5697/// contain a constant, it should appear as the RHS operand. The signedness of
5698/// the operands is placed in \p IsSigned.
5699static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5700 unsigned OptSize,
5701 bool &IsSigned) {
5702 OperandSignedness LHSSign;
5703
5704 // The LHS operand must be a demotable op
5705 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5706 return false;
5707
5708 // We should have been able to determine the signedness from the LHS
5709 if (LHSSign == Unknown)
5710 return false;
5711
5712 IsSigned = (LHSSign == Signed);
5713
5714 // The RHS can be a demotable op or a constant
5715 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5716 const APInt &Val = CI->getAPIntValue();
5717 if (LHSSign == Unsigned) {
5718 return Val.isIntN(OptSize);
5719 } else {
5720 return Val.isSignedIntN(OptSize);
5721 }
5722 } else {
5723 OperandSignedness RHSSign;
5724 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5725 return false;
5726
5727 return LHSSign == RHSSign;
5728 }
5729}
5730
5731/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5732/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5733/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5734/// amount.
5735static SDValue TryMULWIDECombine(SDNode *N,
5736 TargetLowering::DAGCombinerInfo &DCI) {
5737 EVT MulType = N->getValueType(0);
5738 if (MulType != MVT::i32 && MulType != MVT::i64) {
5739 return SDValue();
5740 }
5741
5742 SDLoc DL(N);
5743 unsigned OptSize = MulType.getSizeInBits() >> 1;
5744 SDValue LHS = N->getOperand(0);
5745 SDValue RHS = N->getOperand(1);
5746
5747 // Canonicalize the multiply so the constant (if any) is on the right
5748 if (N->getOpcode() == ISD::MUL) {
5749 if (isa<ConstantSDNode>(LHS)) {
5750 std::swap(LHS, RHS);
5751 }
5752 }
5753
5754 // If we have a SHL, determine the actual multiply amount
5755 if (N->getOpcode() == ISD::SHL) {
5756 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
5757 if (!ShlRHS) {
5758 return SDValue();
5759 }
5760
5761 APInt ShiftAmt = ShlRHS->getAPIntValue();
5762 unsigned BitWidth = MulType.getSizeInBits();
5763 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5764 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5765 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5766 } else {
5767 return SDValue();
5768 }
5769 }
5770
5771 bool Signed;
5772 // Verify that our operands are demotable
5773 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5774 return SDValue();
5775 }
5776
5777 EVT DemotedVT;
5778 if (MulType == MVT::i32) {
5779 DemotedVT = MVT::i16;
5780 } else {
5781 DemotedVT = MVT::i32;
5782 }
5783
5784 // Truncate the operands to the correct size. Note that these are just for
5785 // type consistency and will (likely) be eliminated in later phases.
5786 SDValue TruncLHS =
5787 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5788 SDValue TruncRHS =
5789 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5790
5791 unsigned Opc;
5792 if (Signed) {
5793 Opc = NVPTXISD::MUL_WIDE_SIGNED;
5794 } else {
5795 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5796 }
5797
5798 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5799}
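// Illustrative example (editorial, not part of the original source): an
// i32 multiply of operands sign-extended from i16, e.g.
//   (mul (sext i16 %a to i32), (sext i16 %b to i32))
// becomes NVPTXISD::MUL_WIDE_SIGNED on the truncated operands, which
// selects to mul.wide.s16 and produces the full 32-bit product directly.
// A left shift by a constant is handled the same way by first rewriting it
// as a multiply by a power of two.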
5800
5801static bool isConstOne(const SDValue &Operand) {
5802 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5803 return Const && Const->getZExtValue() == 1;
5804}
5805
5806static SDValue matchMADConstOnePattern(SDValue Add) {
5807 if (Add->getOpcode() != ISD::ADD)
5808 return SDValue();
5809
5810 if (isConstOne(Add->getOperand(0)))
5811 return Add->getOperand(1);
5812
5813 if (isConstOne(Add->getOperand(1)))
5814 return Add->getOperand(0);
5815
5816 return SDValue();
5817}
5818
5819static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5820 TargetLowering::DAGCombinerInfo &DCI) {
5821
5822 if (SDValue Y = matchMADConstOnePattern(Add))
5823 return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);
5824
5825 return SDValue();
5826}
5827
5828static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5829 SDLoc DL,
5830 TargetLowering::DAGCombinerInfo &DCI) {
5831 if (Select->getOpcode() != ISD::SELECT)
5832 return SDValue();
5833
5834 SDValue Cond = Select->getOperand(0);
5835
5836 unsigned ConstOpNo;
5837 if (isConstOne(Select->getOperand(1)))
5838 ConstOpNo = 1;
5839 else if (isConstOne(Select->getOperand(2)))
5840 ConstOpNo = 2;
5841 else
5842 return SDValue();
5843
5844 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5845
5846 // Do not combine if the resulting sequence is not obviously profitable.
5847 if (!matchMADConstOnePattern(Y))
5848 return SDValue();
5849
5850 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5851
5852 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5853 (ConstOpNo == 1) ? X : NewMul,
5854 (ConstOpNo == 1) ? NewMul : X);
5855}
5856
5857static SDValue
5858PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5859 TargetLowering::DAGCombinerInfo &DCI) {
5860
5861 EVT VT = N0.getValueType();
5862 if (VT.isVector())
5863 return SDValue();
5864
5865 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5866 return SDValue();
5867
5868 SDLoc DL(N);
5869
5870 // (mul x, (add y, 1)) -> (mad x, y, x)
5871 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5872 return Res;
5873 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5874 return Res;
5875
5876 // (mul x, (select y, 1)) -> (select (mul x, y), x)
5877 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5878 return Res;
5879 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5880 return Res;
5881
5882 return SDValue();
5883}
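// Illustrative example (editorial, not part of the original source): the
// const-one patterns above rewrite
//   (mul x, (add y, 1))                -> (mad x, y, x)
//   (mul x, (select p, 1, (add y, 1))) -> (select p, x, (mul x, (add y, 1)))
// where the inner multiply is then itself folded into a mad, so multiplying
// by "y + 1" ends up as a single mad instead of an add plus a mul.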
5884
5885/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5886static SDValue PerformMULCombine(SDNode *N,
5887 TargetLowering::DAGCombinerInfo &DCI,
5888 CodeGenOptLevel OptLevel) {
5889 if (OptLevel == CodeGenOptLevel::None)
5890 return SDValue();
5891
5892 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5893 return Ret;
5894
5895 SDValue N0 = N->getOperand(0);
5896 SDValue N1 = N->getOperand(1);
5897 return PerformMULCombineWithOperands(N, N0, N1, DCI);
5898}
5899
5900/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5901static SDValue PerformSHLCombine(SDNode *N,
5902 TargetLowering::DAGCombinerInfo &DCI,
5903 CodeGenOptLevel OptLevel) {
5904 if (OptLevel > CodeGenOptLevel::None) {
5905 // Try mul.wide combining at OptLevel > 0
5906 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5907 return Ret;
5908 }
5909
5910 return SDValue();
5911}
5912
5913static SDValue PerformSETCCCombine(SDNode *N,
5914 TargetLowering::DAGCombinerInfo &DCI,
5915 unsigned int SmVersion) {
5916 EVT CCType = N->getValueType(0);
5917 SDValue A = N->getOperand(0);
5918 SDValue B = N->getOperand(1);
5919
5920 EVT AType = A.getValueType();
5921 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5922 return SDValue();
5923
5924 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5925 return SDValue();
5926
5927 SDLoc DL(N);
5928 // setp.f16x2 returns two scalar predicates, which we need to
5929 // convert back to v2i1. The returned result will be scalarized by
5930 // the legalizer, but the comparison will remain a single vector
5931 // instruction.
5932 SDValue CCNode = DCI.DAG.getNode(
5933 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5934 : NVPTXISD::SETP_BF16X2,
5935 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5936 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5937 CCNode.getValue(1));
5938}
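// Illustrative example (editorial, not part of the original source): a
// (setcc v2f16 %a, %b, setolt) producing v2i1 is rewritten into one
// NVPTXISD::SETP_F16X2 node returning two i1 results, i.e. a single
// setp.lt.f16x2 instruction rather than two scalar f16 compares.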
5939
5940static SDValue PerformEXTRACTCombine(SDNode *N,
5941 TargetLowering::DAGCombinerInfo &DCI) {
5942 SDValue Vector = N->getOperand(0);
5943 if (Vector->getOpcode() == ISD::FREEZE)
5944 Vector = Vector->getOperand(0);
5945 SDLoc DL(N);
5946 EVT VectorVT = Vector.getValueType();
5947 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
5948 IsPTXVectorType(VectorVT.getSimpleVT()))
5949 return SDValue(); // Native vector loads already combine nicely w/
5950 // extract_vector_elt.
5951 // Don't mess with singletons or v2*16, v4i8 and v8i8 types; we already
5952 // handle them OK.
5953 if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
5954 VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
5955 return SDValue();
5956
5957 // Don't mess with undef values as sra may be simplified to 0, not undef.
5958 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
5959 return SDValue();
5960
5961 uint64_t VectorBits = VectorVT.getSizeInBits();
5962 // We only handle the types we can extract in-register.
5963 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
5964 return SDValue();
5965
5966 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
5967 // Index == 0 is handled by generic DAG combiner.
5968 if (!Index || Index->getZExtValue() == 0)
5969 return SDValue();
5970
5971 MVT IVT = MVT::getIntegerVT(VectorBits);
5972 EVT EltVT = VectorVT.getVectorElementType();
5973 EVT EltIVT = EltVT.changeTypeToInteger();
5974 uint64_t EltBits = EltVT.getScalarSizeInBits();
5975
5976 SDValue Result = DCI.DAG.getNode(
5977 ISD::TRUNCATE, DL, EltIVT,
5978 DCI.DAG.getNode(
5979 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
5980 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
5981
5982 // If element has non-integer type, bitcast it back to the expected type.
5983 if (EltVT != EltIVT)
5984 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
5985 // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
5986 if (EltVT != N->getValueType(0))
5987 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
5988
5989 return Result;
5990}
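// Illustrative example (editorial, not part of the original source):
// extracting element 1 of a v2i32 value becomes
//   (trunc i32 (sra (bitcast v2i32 to i64), 32))
// i.e. a shift-and-truncate on the packed 64-bit register instead of a
// separate extract operation.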
5991
5992static SDValue PerformVSELECTCombine(SDNode *N,
5993 TargetLowering::DAGCombinerInfo &DCI) {
5994 SDValue VA = N->getOperand(1);
5995 EVT VectorVT = VA.getValueType();
5996 if (VectorVT != MVT::v4i8)
5997 return SDValue();
5998
5999 // We need to split the vselect into individual per-element operations.
6000 // Because we use BFE/BFI instructions for byte extraction/insertion, we
6001 // end up with 32-bit values anyway, so we may as well do the comparison
6002 // as i32 to avoid the conversions to/from i16 normally used for i8 values.
6003 SmallVector<SDValue, 4> E;
6004 SDLoc DL(N);
6005 SDValue VCond = N->getOperand(0);
6006 SDValue VB = N->getOperand(2);
6007 for (int I = 0; I < 4; ++I) {
6008 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6009 DCI.DAG.getConstant(I, DL, MVT::i32));
6010 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6011 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6012 DCI.DAG.getConstant(I, DL, MVT::i32)),
6013 DL, MVT::i32);
6014 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6015 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6016 DCI.DAG.getConstant(I, DL, MVT::i32)),
6017 DL, MVT::i32);
6018 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6019 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6020 }
6021 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6022}
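// Illustrative example (editorial, not part of the original source): a
// (vselect v4i1 %c, v4i8 %a, v4i8 %b) is rebuilt as four i32 selects over
// the individually extracted bytes, followed by a BUILD_VECTOR of the
// truncated results, matching the BFE/BFI-based byte handling noted above.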
6023
6024static SDValue PerformLOADCombine(SDNode *N,
6025 TargetLowering::DAGCombinerInfo &DCI) {
6026 SelectionDAG &DAG = DCI.DAG;
6027 LoadSDNode *LD = cast<LoadSDNode>(N);
6028
6029 // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
6030 // letting ReplaceLoadVector split it into smaller loads during legalization.
6031 // This is done at dag-combine1 time, so that vector operations with i8
6032 // elements can be optimised away instead of being needlessly split during
6033 // legalization, which involves storing to the stack and loading it back.
6034 EVT VT = N->getValueType(0);
6035 if (VT != MVT::v16i8)
6036 return SDValue();
6037
6038 SDLoc DL(N);
6039
6040 // Create a v4i32 vector load operation, effectively <4 x v4i8>.
6041 unsigned Opc = NVPTXISD::LoadV4;
6042 EVT NewVT = MVT::v4i32;
6043 EVT EltVT = NewVT.getVectorElementType();
6044 unsigned NumElts = NewVT.getVectorNumElements();
6045 EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
6046 SDVTList RetVTList = DAG.getVTList(RetVTs);
6047 SmallVector<SDValue, 8> Ops(N->ops());
6048 Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
6049 SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
6050 LD->getMemOperand());
6051 SDValue NewChain = NewLoad.getValue(NumElts);
6052
6053 // Create a vector of the same type returned by the original load.
6054 SmallVector<SDValue, 4> Elts;
6055 for (unsigned i = 0; i < NumElts; i++)
6056 Elts.push_back(NewLoad.getValue(i));
6057 return DCI.DAG.getMergeValues(
6058 {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
6059 NewChain},
6060 DL);
6061}
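// Illustrative example (editorial, not part of the original source): a
// load of <16 x i8> is emitted as one NVPTXISD::LoadV4 with four i32
// results (an ld.v4.b32 in PTX); they are packed with BUILD_VECTOR and
// bitcast back to v16i8, so i8 vector operations can be simplified instead
// of being split during legalization via a stack round-trip.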
6062
6063SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6064 DAGCombinerInfo &DCI) const {
6065 CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6066 switch (N->getOpcode()) {
6067 default: break;
6068 case ISD::ADD:
6069 return PerformADDCombine(N, DCI, OptLevel);
6070 case ISD::FADD:
6071 return PerformFADDCombine(N, DCI, OptLevel);
6072 case ISD::MUL:
6073 return PerformMULCombine(N, DCI, OptLevel);
6074 case ISD::SHL:
6075 return PerformSHLCombine(N, DCI, OptLevel);
6076 case ISD::AND:
6077 return PerformANDCombine(N, DCI);
6078 case ISD::UREM:
6079 case ISD::SREM:
6080 return PerformREMCombine(N, DCI, OptLevel);
6081 case ISD::SETCC:
6082 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6083 case ISD::LOAD:
6084 return PerformLOADCombine(N, DCI);
6085 case NVPTXISD::StoreRetval:
6086 case NVPTXISD::StoreRetvalV2:
6087 case NVPTXISD::StoreRetvalV4:
6088 return PerformStoreRetvalCombine(N);
6089 case NVPTXISD::StoreParam:
6090 case NVPTXISD::StoreParamV2:
6091 case NVPTXISD::StoreParamV4:
6092 return PerformStoreParamCombine(N);
6093 case ISD::EXTRACT_VECTOR_ELT:
6094 return PerformEXTRACTCombine(N, DCI);
6095 case ISD::VSELECT:
6096 return PerformVSELECTCombine(N, DCI);
6097 }
6098 return SDValue();
6099}
6100
6101/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
6102static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
6103 SmallVectorImpl<SDValue> &Results) {
6104 EVT ResVT = N->getValueType(0);
6105 SDLoc DL(N);
6106
6107 assert(ResVT.isVector() && "Vector load must have vector type");
6108
6109 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
6110 // legal. We can (and should) split that into 2 loads of <2 x double> here
6111 // but I'm leaving that as a TODO for now.
6112 assert(ResVT.isSimple() && "Can only handle simple types");
6113 switch (ResVT.getSimpleVT().SimpleTy) {
6114 default:
6115 return;
6116 case MVT::v2i8:
6117 case MVT::v2i16:
6118 case MVT::v2i32:
6119 case MVT::v2i64:
6120 case MVT::v2f16:
6121 case MVT::v2f32:
6122 case MVT::v2f64:
6123 case MVT::v4i8:
6124 case MVT::v4i16:
6125 case MVT::v4i32:
6126 case MVT::v4f16:
6127 case MVT::v4f32:
6128 case MVT::v8f16: // <4 x f16x2>
6129 case MVT::v8bf16: // <4 x bf16x2>
6130 case MVT::v8i16: // <4 x i16x2>
6131 // This is a "native" vector type
6132 break;
6133 }
6134
6135 LoadSDNode *LD = cast<LoadSDNode>(N);
6136
6137 Align Alignment = LD->getAlign();
6138 auto &TD = DAG.getDataLayout();
6139 Align PrefAlign =
6140 TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
6141 if (Alignment < PrefAlign) {
6142 // This load is not sufficiently aligned, so bail out and let this vector
6143 // load be scalarized. Note that we may still be able to emit smaller
6144 // vector loads. For example, if we are loading a <4 x float> with an
6145 // alignment of 8, this check will fail but the legalizer will try again
6146 // with 2 x <2 x float>, which will succeed with an alignment of 8.
6147 return;
6148 }
6149
6150 EVT EltVT = ResVT.getVectorElementType();
6151 unsigned NumElts = ResVT.getVectorNumElements();
6152
6153 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
6154 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6155 // loaded type to i16 and propagate the "real" type as the memory type.
6156 bool NeedTrunc = false;
6157 if (EltVT.getSizeInBits() < 16) {
6158 EltVT = MVT::i16;
6159 NeedTrunc = true;
6160 }
6161
6162 unsigned Opcode = 0;
6163 SDVTList LdResVTs;
6164 bool Load16x2 = false;
6165
6166 switch (NumElts) {
6167 default:
6168 return;
6169 case 2:
6170 Opcode = NVPTXISD::LoadV2;
6171 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6172 break;
6173 case 4: {
6174 Opcode = NVPTXISD::LoadV4;
6175 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6176 LdResVTs = DAG.getVTList(ListVTs);
6177 break;
6178 }
6179 case 8: {
6180 // v8f16 is a special case. PTX doesn't have an ld.v8.f16
6181 // instruction. Instead, we split the vector into v2f16 chunks and
6182 // load them with ld.v4.b32.
6183 assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
6184 Load16x2 = true;
6185 Opcode = NVPTXISD::LoadV4;
6186 EVT VVT;
6187 switch (EltVT.getSimpleVT().SimpleTy) {
6188 case MVT::f16:
6189 VVT = MVT::v2f16;
6190 break;
6191 case MVT::bf16:
6192 VVT = MVT::v2bf16;
6193 break;
6194 case MVT::i16:
6195 VVT = MVT::v2i16;
6196 break;
6197 default:
6198 llvm_unreachable("Unsupported v8 vector type.");
6199 }
6200 EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
6201 LdResVTs = DAG.getVTList(ListVTs);
6202 break;
6203 }
6204 }
6205
6206 // Copy regular operands
6207 SmallVector<SDValue, 8> OtherOps(N->ops());
6208
6209 // The select routine does not have access to the LoadSDNode instance, so
6210 // pass along the extension information
6211 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
6212
6213 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6214 LD->getMemoryVT(),
6215 LD->getMemOperand());
6216
6217 SmallVector<SDValue, 8> ScalarRes;
6218 if (Load16x2) {
6219 // Split v2f16 subvectors back into individual elements.
6220 NumElts /= 2;
6221 for (unsigned i = 0; i < NumElts; ++i) {
6222 SDValue SubVector = NewLD.getValue(i);
6223 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6224 DAG.getIntPtrConstant(0, DL));
6225 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
6226 DAG.getIntPtrConstant(1, DL));
6227 ScalarRes.push_back(E0);
6228 ScalarRes.push_back(E1);
6229 }
6230 } else {
6231 for (unsigned i = 0; i < NumElts; ++i) {
6232 SDValue Res = NewLD.getValue(i);
6233 if (NeedTrunc)
6234 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6235 ScalarRes.push_back(Res);
6236 }
6237 }
6238
6239 SDValue LoadChain = NewLD.getValue(NumElts);
6240
6241 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
6242
6243 Results.push_back(BuildVec);
6244 Results.push_back(LoadChain);
6245}
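// Illustrative example (editorial, not part of the original source): a
// sufficiently aligned load of <4 x float> becomes one NVPTXISD::LoadV4
// with four f32 results plus a chain (ld.v4.f32 in PTX), while <8 x half>
// is loaded as four v2f16 chunks via ld.v4.b32 and split back into scalars
// as shown above.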
6246
6247static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6248 SmallVectorImpl<SDValue> &Results) {
6249 SDValue Chain = N->getOperand(0);
6250 SDValue Intrin = N->getOperand(1);
6251 SDLoc DL(N);
6252
6253 // Get the intrinsic ID
6254 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6255 switch (IntrinNo) {
6256 default:
6257 return;
6258 case Intrinsic::nvvm_ldg_global_i:
6259 case Intrinsic::nvvm_ldg_global_f:
6260 case Intrinsic::nvvm_ldg_global_p:
6261 case Intrinsic::nvvm_ldu_global_i:
6262 case Intrinsic::nvvm_ldu_global_f:
6263 case Intrinsic::nvvm_ldu_global_p: {
6264 EVT ResVT = N->getValueType(0);
6265
6266 if (ResVT.isVector()) {
6267 // Vector LDG/LDU
6268
6269 unsigned NumElts = ResVT.getVectorNumElements();
6270 EVT EltVT = ResVT.getVectorElementType();
6271
6272 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6273 // legalization.
6274 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6275 // loaded type to i16 and propagate the "real" type as the memory type.
6276 bool NeedTrunc = false;
6277 if (EltVT.getSizeInBits() < 16) {
6278 EltVT = MVT::i16;
6279 NeedTrunc = true;
6280 }
6281
6282 unsigned Opcode = 0;
6283 SDVTList LdResVTs;
6284
6285 switch (NumElts) {
6286 default:
6287 return;
6288 case 2:
6289 switch (IntrinNo) {
6290 default:
6291 return;
6292 case Intrinsic::nvvm_ldg_global_i:
6293 case Intrinsic::nvvm_ldg_global_f:
6294 case Intrinsic::nvvm_ldg_global_p:
6295 Opcode = NVPTXISD::LDGV2;
6296 break;
6297 case Intrinsic::nvvm_ldu_global_i:
6298 case Intrinsic::nvvm_ldu_global_f:
6299 case Intrinsic::nvvm_ldu_global_p:
6300 Opcode = NVPTXISD::LDUV2;
6301 break;
6302 }
6303 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6304 break;
6305 case 4: {
6306 switch (IntrinNo) {
6307 default:
6308 return;
6309 case Intrinsic::nvvm_ldg_global_i:
6310 case Intrinsic::nvvm_ldg_global_f:
6311 case Intrinsic::nvvm_ldg_global_p:
6312 Opcode = NVPTXISD::LDGV4;
6313 break;
6314 case Intrinsic::nvvm_ldu_global_i:
6315 case Intrinsic::nvvm_ldu_global_f:
6316 case Intrinsic::nvvm_ldu_global_p:
6317 Opcode = NVPTXISD::LDUV4;
6318 break;
6319 }
6320 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6321 LdResVTs = DAG.getVTList(ListVTs);
6322 break;
6323 }
6324 }
6325
6326 SmallVector<SDValue, 8> OtherOps;
6327
6328 // Copy regular operands
6329
6330 OtherOps.push_back(Chain); // Chain
6331 // Skip operand 1 (intrinsic ID)
6332 // Others
6333 OtherOps.append(N->op_begin() + 2, N->op_end());
6334
6335 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6336
6337 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6338 MemSD->getMemoryVT(),
6339 MemSD->getMemOperand());
6340
6341 SmallVector<SDValue, 4> ScalarRes;
6342
6343 for (unsigned i = 0; i < NumElts; ++i) {
6344 SDValue Res = NewLD.getValue(i);
6345 if (NeedTrunc)
6346 Res =
6347 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6348 ScalarRes.push_back(Res);
6349 }
6350
6351 SDValue LoadChain = NewLD.getValue(NumElts);
6352
6353 SDValue BuildVec =
6354 DAG.getBuildVector(ResVT, DL, ScalarRes);
6355
6356 Results.push_back(BuildVec);
6357 Results.push_back(LoadChain);
6358 } else {
6359 // i8 LDG/LDU
6360 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6361 "Custom handling of non-i8 ldu/ldg?");
6362
6363 // Just copy all operands as-is
6364 SmallVector<SDValue, 4> Ops(N->ops());
6365
6366 // Force output to i16
6367 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6368
6369 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6370
6371 // We make sure the memory type is i8, which will be used during isel
6372 // to select the proper instruction.
6373 SDValue NewLD =
6374 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6375 MVT::i8, MemSD->getMemOperand());
6376
6377 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6378 NewLD.getValue(0)));
6379 Results.push_back(NewLD.getValue(1));
6380 }
6381 }
6382 }
6383}
6384
6385static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6386 SmallVectorImpl<SDValue> &Results) {
6387 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6388 // result so that it can pass the legalization
6389 SDLoc DL(N);
6390 SDValue Chain = N->getOperand(0);
6391 SDValue Reg = N->getOperand(1);
6392 SDValue Glue = N->getOperand(2);
6393
6394 assert(Reg.getValueType() == MVT::i128 &&
6395 "Custom lowering for CopyFromReg with 128-bit reg only");
6396 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6397 N->getValueType(2)};
6398 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6399
6400 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6401 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6402 {NewValue.getValue(0), NewValue.getValue(1)});
6403
6404 Results.push_back(Pair);
6405 Results.push_back(NewValue.getValue(2));
6406 Results.push_back(NewValue.getValue(3));
6407}
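// Illustrative example (editorial, not part of the original source): a
// CopyFromReg of an i128 virtual register is re-emitted as a CopyFromReg
// producing two i64 values that are then recombined with ISD::BUILD_PAIR,
// since PTX has no native 128-bit register class.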
6408
6409void NVPTXTargetLowering::ReplaceNodeResults(
6410 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6411 switch (N->getOpcode()) {
6412 default:
6413 report_fatal_error("Unhandled custom legalization");
6414 case ISD::LOAD:
6415 ReplaceLoadVector(N, DAG, Results);
6416 return;
6417 case ISD::INTRINSIC_W_CHAIN:
6418 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6419 return;
6420 case ISD::CopyFromReg:
6421 ReplaceCopyFromReg_128(N, DAG, Results);
6422 return;
6423 }
6424}
6425
6426NVPTXTargetLowering::AtomicExpansionKind
6427NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6428 Type *Ty = AI->getValOperand()->getType();
6429
6430 if (AI->isFloatingPointOperation()) {
6431 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6432 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6433 STI.getPTXVersion() >= 63)
6434 return AtomicExpansionKind::None;
6435 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6436 STI.getPTXVersion() >= 78)
6437 return AtomicExpansionKind::None;
6438 if (Ty->isFloatTy())
6439 return AtomicExpansionKind::None;
6440 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6441 return AtomicExpansionKind::None;
6442 }
6443 return AtomicExpansionKind::CmpXChg;
6444 }
6445
6446 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6447 auto ITy = cast<llvm::IntegerType>(Ty);
6448
6449 switch (AI->getOperation()) {
6450 default:
6451 return AtomicExpansionKind::CmpXChg;
6452 case AtomicRMWInst::BinOp::And:
6453 case AtomicRMWInst::BinOp::Or:
6454 case AtomicRMWInst::BinOp::Xor:
6455 case AtomicRMWInst::BinOp::Xchg:
6456 switch (ITy->getBitWidth()) {
6457 case 8:
6458 case 16:
6459 return AtomicExpansionKind::CmpXChg;
6460 case 32:
6461 return AtomicExpansionKind::None;
6462 case 64:
6463 if (STI.hasAtomBitwise64())
6464 return AtomicExpansionKind::None;
6465 return AtomicExpansionKind::CmpXChg;
6466 default:
6467 llvm_unreachable("unsupported width encountered");
6468 }
6469 case AtomicRMWInst::BinOp::Add:
6470 case AtomicRMWInst::BinOp::Sub:
6471 case AtomicRMWInst::BinOp::Max:
6472 case AtomicRMWInst::BinOp::Min:
6473 case AtomicRMWInst::BinOp::UMax:
6474 case AtomicRMWInst::BinOp::UMin:
6475 switch (ITy->getBitWidth()) {
6476 case 8:
6477 case 16:
6478 return AtomicExpansionKind::CmpXChg;
6479 case 32:
6480 return AtomicExpansionKind::None;
6481 case 64:
6482 if (STI.hasAtomMinMax64())
6483 return AtomicExpansionKind::None;
6484 return AtomicExpansionKind::CmpXChg;
6485 default:
6486 llvm_unreachable("unsupported width encountered");
6487 }
6488 }
6489
6490 return AtomicExpansionKind::CmpXChg;
6491}
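// Illustrative example (editorial, not part of the original source): an
// i64 atomicrmw xor on a target without 64-bit bitwise atomics is expanded
// by the AtomicExpand pass into a compare-and-swap loop (CmpXChg), whereas
// a 32-bit atomicrmw add is left alone (None) and selects to atom.add.u32.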
6492
6493// Pin NVPTXTargetObjectFile's vtables to this file.
6494NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
6495
6496MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6497 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6498 return getDataSection();
6499}
#define MAKE_CASE(V)
static const LLT F32
amdgpu AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file contains the declarations of entities that describe floating point environment and related ...
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
Module.h This file contains the declarations for the Module class.
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
static SDValue PerformStoreParamCombine(SDNode *N)
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static unsigned getOpcForSurfaceInstr(unsigned Intrinsic)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static bool Is16bitsType(MVT VT)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static bool IsTypePassedAsArray(const Type *Ty)
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static unsigned getOpcForTextureInstr(unsigned Intrinsic)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformStoreRetvalCombine(SDNode *N)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, std::size_t Back)
static bool adjustElementType(EVT &ElementType)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue matchMADConstOnePattern(SDValue Add)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
ParamVectorizationFlags
@ PVF_FIRST
@ PVF_SCALAR
@ PVF_INNER
@ PVF_LAST
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
OperandSignedness
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
static std::atomic< unsigned > GlobalUniqueCallSite
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
unsigned SmVersion
Definition: NVVMReflect.cpp:81
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:413
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1108
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:410
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ And
*p = old & v
Definition: Instructions.h:716
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
bool isFloatingPointOperation() const
Definition: Instructions.h:864
BinOp getOperation() const
Definition: Instructions.h:787
Value * getValOperand()
Definition: Instructions.h:856
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Definition: Attributes.h:805
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
This class represents a function call, abstracting a target machine's calling convention.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:461
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:842
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:653
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:219
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition: MCSection.h:36
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
Machine Value Type.
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
static auto fixedlen_vector_valuetypes()
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getPTXVersion() const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool hasBF16Math() const
bool allowFP16Math() const
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
bool useF32FTZ(const MachineFunction &MF) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool allowUnsafeFPMath(MachineFunction &MF) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
iterator_range< use_iterator > uses()
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the type of the node type undefined.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition: SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:226
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:736
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:567
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:493
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:487
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:482
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVMContext * getContext() const
Definition: SelectionDAG.h:500
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:576
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
SmallVector - This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
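A minimal sketch of the SmallVector members listed above, using 4 elements of inline storage:
SmallVector<EVT, 4> A;
A.push_back(MVT::i32);        // stays in the inline storage
SmallVector<EVT, 4> B;
B.append(A.begin(), A.end()); // append a whole range
B.resize(8);                  // may spill to the heap past 4 elements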
StoreSDNode - This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
StructType - Class to represent struct types.
Definition: DerivedTypes.h:216
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
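A minimal sketch of setOperationAction calls as they would appear in the constructor of a TargetLowering subclass (where the LegalizeAction values are in scope unqualified):
setOperationAction(ISD::FREM, MVT::f64, Expand);       // expand into a sequence/libcall
setOperationAction(ISD::FADD, MVT::f32, Legal);        // natively supported
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);  // lowered by this target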
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
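A minimal sketch of register-class setup in a TargetLowering subclass constructor; the register class names below are the ones the NVPTX backend generates, and STI is assumed to be the subtarget:
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
computeRegisterProperties(STI.getRegisterInfo()); // derive the remaining properties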
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
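A minimal sketch of setTargetDAGCombine in a TargetLowering subclass constructor; the DAG combiner will then call the target's PerformDAGCombine hook for these opcodes:
setTargetDAGCombine({ISD::ADD, ISD::FADD, ISD::MUL, ISD::SHL});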
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
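A minimal sketch of the two actions above in a TargetLowering subclass constructor:
setTruncStoreAction(MVT::f64, MVT::f32, Expand);            // f64 -> f32 truncating store
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); // f32 -> f64 extending load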
std::vector< ArgListEntry > ArgListTy
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
TargetLowering - This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
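A minimal sketch of how a custom store-lowering hook might fall back to the generic expansion when the access is not allowed at its alignment (assumed context: a TargetLowering member with the usual Op and DAG arguments):
StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
unsigned Fast = 0;
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                    ST->getMemoryVT(), ST->getAddressSpace(),
                                    ST->getAlign(),
                                    ST->getMemOperand()->getFlags(), &Fast))
  return expandUnalignedStore(ST, DAG); // split into smaller legal stores
return SDValue();                       // otherwise keep the store as-is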
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
TargetMachine - Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
@ VoidTyID
type with no size
Definition: Type.h:63
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:291
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
StringRef save(const char *S)
Definition: StringSaver.h:52
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
raw_string_ostream - A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:779
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1223
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1099
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:840
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:953
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1256
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:980
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1145
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1120
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1124
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1219
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:673
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1041
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1279
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:771
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1028
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition: ISDOpcodes.h:366
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1109
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:848
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:938
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:765
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:972
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1047
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:886
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1250
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1276
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition: ISDOpcodes.h:223
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:919
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1214
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:793
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:529
bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
@ Bitcast
Perform the operation on a different, but equivalently sized type.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ Offset
Definition: DWP.cpp:480
static bool isIndirectCall(const MachineInstr &MI)
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool Isv2x16VT(EVT VT)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2431
@ ADDRESS_SPACE_LOCAL
Definition: NVPTXBaseInfo.h:26
@ ADDRESS_SPACE_PARAM
Definition: NVPTXBaseInfo.h:29
MaybeAlign getAlign(const Function &F, unsigned Index)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1935
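A minimal sketch of the range helpers above (all_of, transform, enumerate) applied to a small list of value types:
SmallVector<EVT, 4> VTs = {MVT::i32, MVT::f32};
bool AllSimple = all_of(VTs, [](EVT VT) { return VT.isSimple(); });
SmallVector<uint64_t, 4> Bits;
transform(VTs, std::back_inserter(Bits),
          [](EVT VT) { return VT.getSizeInBits().getFixedValue(); });
for (auto E : enumerate(Bits))
  (void)E.index(); // each element is paired with its index
(void)AllSimple;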
unsigned promoteScalarArgumentSize(unsigned size)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
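A minimal sketch of ComputeValueVTs as used by argument/return lowering (TLI, DL and the IR type RetTy are assumed to be in scope):
SmallVector<EVT, 8> ValueVTs;
SmallVector<TypeSize, 8> Offsets;
ComputeValueVTs(TLI, DL, RetTy, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
// ValueVTs[i] is the EVT of the i-th leaf value, Offsets[i] its byte offset.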
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
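A minimal sketch of the alignment helpers referenced above:
uint64_t P = PowerOf2Ceil(12);                          // -> 16
uint64_t Padded = alignTo(/*Size=*/10, Align(8));       // -> 16
Align Both = commonAlignment(Align(16), /*Offset=*/8);  // -> Align(8)
(void)P; (void)Padded; (void)Both;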
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:281
Align - This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
EVT - Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:381
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:371
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:307
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:367
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:314
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:204
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:319
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:327
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
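A minimal sketch of the EVT queries above, using a 4 x f32 vector type:
LLVMContext Ctx;
EVT VecVT = EVT::getVectorVT(Ctx, MVT::f32, 4);
unsigned NumElts = VecVT.getVectorNumElements();        // 4
EVT EltVT = VecVT.getVectorElementType();               // f32
bool IsFP = VecVT.isFloatingPoint();                    // true
uint64_t Bits = VecVT.getSizeInBits().getFixedValue();  // 128
(void)NumElts; (void)EltVT; (void)IsFP; (void)Bits;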
MachinePointerInfo - This class contains a discriminated union of information about pointers in memory operands,...
MaybeAlign - This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
SDVTList - This represents a list of ValueType's that has been intern'd by a SelectionDAG.
TargetLoweringBase::AddrMode - This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
TargetLowering::CallLoweringInfo - This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)