1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
18#include "NVPTXSubtarget.h"
19#include "NVPTXTargetMachine.h"
21#include "NVPTXUtilities.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/StringRef.h"
38#include "llvm/IR/Argument.h"
39#include "llvm/IR/Attributes.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DataLayout.h"
44#include "llvm/IR/FPEnv.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/IRBuilder.h"
48#include "llvm/IR/Instruction.h"
50#include "llvm/IR/IntrinsicsNVPTX.h"
51#include "llvm/IR/Module.h"
52#include "llvm/IR/Type.h"
53#include "llvm/IR/Value.h"
65#include <algorithm>
66#include <cassert>
67#include <cmath>
68#include <cstdint>
69#include <iterator>
70#include <optional>
71#include <string>
72#include <tuple>
73#include <utility>
74#include <vector>
75
76#define DEBUG_TYPE "nvptx-lower"
77
78using namespace llvm;
79
81 "nvptx-sched4reg",
82 cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));
83
85 "nvptx-fma-level", cl::Hidden,
86 cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
87 " 1: do it, 2: do it aggressively)"),
88 cl::init(2));
89
91 "nvptx-prec-divf32", cl::Hidden,
93 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
95 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
96 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
98 "Use IEEE Compliant F32 div.rnd if available (default)"),
100 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
102
104 "nvptx-prec-sqrtf32", cl::Hidden,
105 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
106 cl::init(true));
107
108/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
109/// does NOT use lg2.approx for log2, so this is disabled by default.
111 "nvptx-approx-log2f32",
112 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
113 cl::init(false));
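// For illustration only (assumed llc invocation): the option above can be
// enabled when an approximate log2 is acceptable, e.g.
//   llc -march=nvptx64 -mcpu=sm_90 -nvptx-approx-log2f32 input.ll
// which allows f32 ISD::FLOG2 to be selected as lg2.approx.f32 (see the FLOG2
// handling in the NVPTXTargetLowering constructor below).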
114
116 "nvptx-force-min-byval-param-align", cl::Hidden,
117 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
118 " params of device functions."),
119 cl::init(false));
120
123 const SDNode &N) const {
124 // If nvptx-prec-divf32=N is used on the command-line, always honor it
125 if (UsePrecDivF32.getNumOccurrences() > 0)
126 return UsePrecDivF32;
127
128 const SDNodeFlags Flags = N.getFlags();
129 if (Flags.hasApproximateFuncs())
131
133}
134
136 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
137 if (UsePrecSqrtF32.getNumOccurrences() > 0)
138 return UsePrecSqrtF32;
139
140 if (N) {
141 const SDNodeFlags Flags = N->getFlags();
142 if (Flags.hasApproximateFuncs())
143 return false;
144 }
145
146 return true;
147}
148
153
154static bool IsPTXVectorType(MVT VT) {
155 switch (VT.SimpleTy) {
156 default:
157 return false;
158 case MVT::v2i1:
159 case MVT::v4i1:
160 case MVT::v2i8:
161 case MVT::v4i8:
162 case MVT::v8i8: // <2 x i8x4>
163 case MVT::v16i8: // <4 x i8x4>
164 case MVT::v2i16:
165 case MVT::v4i16:
166 case MVT::v8i16: // <4 x i16x2>
167 case MVT::v2i32:
168 case MVT::v4i32:
169 case MVT::v2i64:
170 case MVT::v2f16:
171 case MVT::v4f16:
172 case MVT::v8f16: // <4 x f16x2>
173 case MVT::v2bf16:
174 case MVT::v4bf16:
175 case MVT::v8bf16: // <4 x bf16x2>
176 case MVT::v2f32:
177 case MVT::v4f32:
178 case MVT::v2f64:
179 case MVT::v4i64:
180 case MVT::v4f64:
181 case MVT::v8i32:
182 case MVT::v8f32:
183 case MVT::v16f16: // <8 x f16x2>
184 case MVT::v16bf16: // <8 x bf16x2>
185 case MVT::v16i16: // <8 x i16x2>
186 case MVT::v32i8: // <8 x i8x4>
187 return true;
188 }
189}
190
191// When legalizing vector loads/stores, this function is called, which does two
192// things:
193// 1. Determines whether the vector is something we want to custom lower;
194//    std::nullopt is returned if we do not want to custom lower it.
195// 2. If we do want to handle it, returns two parameters:
196//    - unsigned int NumElts - The number of elements in the final vector
197//    - MVT EltVT - The type of the elements in the final vector
198static std::optional<std::pair<unsigned int, MVT>>
200 unsigned AddressSpace) {
201 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
202
203 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
204 VectorEVT.getSizeInBits() == 256)
205 return {{4, MVT::i64}};
206
207 if (!VectorEVT.isSimple())
208 return std::nullopt;
209 const MVT VectorVT = VectorEVT.getSimpleVT();
210
211 if (!VectorVT.isVector()) {
212 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
213 return {{2, MVT::i64}};
214 return std::nullopt;
215 }
216
217 const MVT EltVT = VectorVT.getVectorElementType();
218 const unsigned NumElts = VectorVT.getVectorNumElements();
219
220 // The size of the PTX virtual register that holds a packed type.
221 unsigned PackRegSize;
222
223 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
224 // legal. We can (and should) split that into 2 stores of <2 x double> here
225 // but I'm leaving that as a TODO for now.
226 switch (VectorVT.SimpleTy) {
227 default:
228 return std::nullopt;
229
230 case MVT::v4i64:
231 case MVT::v4f64:
232 // This is a "native" vector type iff the address space is global and the
233 // target supports 256-bit loads/stores
234 if (!CanLowerTo256Bit)
235 return std::nullopt;
237 case MVT::v2i8:
238 case MVT::v2i64:
239 case MVT::v2f64:
240 // This is a "native" vector type
241 return std::pair(NumElts, EltVT);
242
243 case MVT::v16f16: // <8 x f16x2>
244 case MVT::v16bf16: // <8 x bf16x2>
245 case MVT::v16i16: // <8 x i16x2>
246 case MVT::v32i8: // <8 x i8x4>
247 // This can be upsized into a "native" vector type iff the address space is
248 // global and the target supports 256-bit loads/stores.
249 if (!CanLowerTo256Bit)
250 return std::nullopt;
252 case MVT::v2i16: // <1 x i16x2>
253 case MVT::v2f16: // <1 x f16x2>
254 case MVT::v2bf16: // <1 x bf16x2>
255 case MVT::v4i8: // <1 x i8x4>
256 case MVT::v4i16: // <2 x i16x2>
257 case MVT::v4f16: // <2 x f16x2>
258 case MVT::v4bf16: // <2 x bf16x2>
259 case MVT::v8i8: // <2 x i8x4>
260 case MVT::v8f16: // <4 x f16x2>
261 case MVT::v8bf16: // <4 x bf16x2>
262 case MVT::v8i16: // <4 x i16x2>
263 case MVT::v16i8: // <4 x i8x4>
264 PackRegSize = 32;
265 break;
266
267 case MVT::v8f32: // <4 x f32x2>
268 case MVT::v8i32: // <4 x i32x2>
269 // This is a "native" vector type iff the address space is global and the
270 // target supports 256-bit loads/stores
271 if (!CanLowerTo256Bit)
272 return std::nullopt;
274 case MVT::v2f32: // <1 x f32x2>
275 case MVT::v4f32: // <2 x f32x2>
276 case MVT::v2i32: // <1 x i32x2>
277 case MVT::v4i32: // <2 x i32x2>
278 if (!STI.hasF32x2Instructions())
279 return std::pair(NumElts, EltVT);
280 PackRegSize = 64;
281 break;
282 }
283
284 // If we reach here, then we can pack 2 or more elements into a single 32-bit
285 // or 64-bit PTX register and treat the vector as a new vector containing
286 // packed elements.
287
288 // Number of elements to pack in one word.
289 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
290
291 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
292}
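// Illustrative examples of the mapping above, assuming a subtarget where
// 256-bit vector loads/stores are available for the address space and f32x2
// instructions are supported (other subtargets differ):
//   v8f16  -> {4, v2f16}    four elements, each a 32-bit f16x2 register
//   v8f32  -> {4, v2f32}    four elements, each a 64-bit f32x2 register
//   v4f64  -> {4, f64}      one 256-bit vector access of plain f64 elements
//   v3i32  -> std::nullopt  not a shape we custom-lower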
293
294/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
295/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
296/// the types as required by the calling convention (with special handling for
297/// i8s).
298/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
299/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
300/// LowerCall, and LowerReturn.
301static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
302 LLVMContext &Ctx, CallingConv::ID CallConv,
303 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
305 uint64_t StartingOffset = 0) {
306 SmallVector<EVT, 16> TempVTs;
307 SmallVector<uint64_t, 16> TempOffsets;
308 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
309
310 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
311 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
312 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
313
314 // Since we actually can load/store b8, we need to ensure that we'll use
315 // the original sized type for any i8s or i8 vectors.
316 if (VT.getScalarType() == MVT::i8) {
317 if (RegisterVT == MVT::i16)
318 RegisterVT = MVT::i8;
319 else if (RegisterVT == MVT::v2i16)
320 RegisterVT = MVT::v2i8;
321 else
322 assert(RegisterVT == MVT::v4i8 &&
323 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
324 }
325
326 // TODO: This is horribly incorrect for cases where the vector elements are
327 // not a multiple of bytes (ex i1) and legal or i8. However, this problem
328 // has existed for as long as NVPTX has and no one has complained, so we'll
329 // leave it for now.
330 for (unsigned I : seq(NumRegs)) {
331 ValueVTs.push_back(RegisterVT);
332 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
333 }
334 }
335}
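// A small worked example (hypothetical IR type, for illustration): for a
// struct type { float, <2 x half>, i8 } this produces
//   ValueVTs = { f32, v2f16, i8 }   Offsets = { 0, 4, 8 }
// Note how the trailing i8 is kept as i8 rather than widened to its i16
// register type, so it can still be loaded/stored as a single byte.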
336
337// We return an EVT that can hold N values of type VT.
338// If the VT is a vector, the resulting EVT is a flat vector with the same
339// element type as VT's element type.
340static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
341 if (N == 1)
342 return VT;
343
344 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
345 VT.getVectorNumElements() * N)
346 : EVT::getVectorVT(C, VT, N);
347}
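// For example (illustrative): getVectorizedVT(f32, 4, Ctx) yields v4f32, and
// getVectorizedVT(v2f16, 2, Ctx) yields the flattened vector v4f16.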
348
350 const SDLoc &dl, SelectionDAG &DAG) {
351 if (V.getValueType() == VT) {
352 assert(I == 0 && "Index must be 0 for scalar value");
353 return V;
354 }
355
356 if (!VT.isVector())
357 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
358 DAG.getVectorIdxConstant(I, dl));
359
360 return DAG.getNode(
361 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
363}
364
365template <typename T>
366static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
367 SelectionDAG &DAG, T GetElement) {
368 if (N == 1)
369 return GetElement(0);
370
372 for (const unsigned I : llvm::seq(N)) {
373 SDValue Val = GetElement(I);
374 if (Val.getValueType().isVector())
375 DAG.ExtractVectorElements(Val, Values);
376 else
377 Values.push_back(Val);
378 }
379
380 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
381 Values.size());
382 return DAG.getBuildVector(VT, dl, Values);
383}
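// For example (illustrative): with N == 2 and GetElement producing v2f16
// values, the two v2f16s are flattened and rebuilt as one v4f16 BUILD_VECTOR;
// with N == 1 the single element is returned unchanged.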
384
385/// PromoteScalarIntegerPTX
386/// Used to make sure the arguments/returns are suitable for passing
387/// and promote them to a larger size if they're not.
388///
389/// Returns the promoted type, or \p VT unchanged if it is already suitable.
391 if (VT.isScalarInteger()) {
392 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
393 default:
395 "Promotion is not suitable for scalars of size larger than 64-bits");
396 case 1:
397 return MVT::i1;
398 case 2:
399 case 4:
400 case 8:
401 return MVT::i8;
402 case 16:
403 return MVT::i16;
404 case 32:
405 return MVT::i32;
406 case 64:
407 return MVT::i64;
408 }
409 }
410 return VT;
411}
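// Illustrative promotions: i1 -> i1, i3 -> i8, i24 -> i32, i48 -> i64;
// non-integer or already-suitable types (e.g. f32, i16) are returned
// unchanged.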
412
413// Check whether we can merge loads/stores of some of the pieces of a
414// flattened function parameter or return value into a single vector
415// load/store.
416//
417// The flattened parameter is represented as a list of EVTs and
418// offsets, and the whole structure is aligned to ParamAlignment. This
419// function determines whether we can load/store pieces of the
420// parameter starting at index Idx using a single vectorized op of
421// size AccessSize. If so, it returns the number of param pieces
422// covered by the vector op. Otherwise, it returns 1.
423template <typename T>
425 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
426 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
427
428 // Can't vectorize if param alignment is not sufficient.
429 if (ParamAlignment < AccessSize)
430 return 1;
431 // Can't vectorize if offset is not aligned.
432 if (Offsets[Idx] & (AccessSize - 1))
433 return 1;
434
435 EVT EltVT = ValueVTs[Idx];
436 unsigned EltSize = EltVT.getStoreSize();
437
438 // Element is too large to vectorize.
439 if (EltSize >= AccessSize)
440 return 1;
441
442 unsigned NumElts = AccessSize / EltSize;
443 // Can't vectorize if AccessSize is not a multiple of EltSize.
444 if (AccessSize != EltSize * NumElts)
445 return 1;
446
447 // We don't have enough elements to vectorize.
448 if (Idx + NumElts > ValueVTs.size())
449 return 1;
450
451 // PTX ISA can only deal with 2- and 4-element vector ops.
452 if (NumElts != 4 && NumElts != 2)
453 return 1;
454
455 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
456 // Types do not match.
457 if (ValueVTs[j] != EltVT)
458 return 1;
459
460 // Elements are not contiguous.
461 if (Offsets[j] - Offsets[j - 1] != EltSize)
462 return 1;
463 }
464 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
465 return NumElts;
466}
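// Worked example (illustrative values): with ValueVTs = {f32, f32, f32, f32},
// Offsets = {0, 4, 8, 12}, and ParamAlignment = 16, a query at Idx = 0 with
// AccessSize = 16 returns 4, i.e. one 128-bit access covers all four pieces.
// With ParamAlignment = 8 the same query would return 1 (no vectorization).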
467
468// Computes whether and how we can vectorize the loads/stores of a
469// flattened function parameter or return value.
470//
471// The flattened parameter is represented as the list of ValueVTs and
472// Offsets, and is aligned to ParamAlignment bytes. We return a vector with
473// one entry per emitted access: each entry is the number of consecutive
474// pieces covered by that access (1 for a scalar access), and the entries
475// sum to ValueVTs.size().
476template <typename T>
479 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
480 bool IsVAArg = false) {
481 // Set vector size to match ValueVTs and mark all elements as
482 // scalars by default.
483
484 if (IsVAArg)
485 return SmallVector<unsigned>(ValueVTs.size(), 1);
486
487 SmallVector<unsigned, 16> VectorInfo;
488
489 const auto GetNumElts = [&](unsigned I) -> unsigned {
490 for (const unsigned AccessSize : {16, 8, 4, 2}) {
491 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
492 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
493 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
494 "Unexpected vectorization size");
495 if (NumElts != 1)
496 return NumElts;
497 }
498 return 1;
499 };
500
501 // Check what we can vectorize using 128/64/32/16-bit accesses.
502 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
503 const unsigned NumElts = GetNumElts(I);
504 VectorInfo.push_back(NumElts);
505 I += NumElts;
506 }
507 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
508 ValueVTs.size());
509 return VectorInfo;
510}
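// Continuing the example above (illustrative values): for
// ValueVTs = {f32, f32, f32, f32, i8}, Offsets = {0, 4, 8, 12, 16}, and a
// 16-byte-aligned parameter, this returns {4, 1}: one 128-bit access for the
// four floats followed by a scalar access for the trailing i8.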
511
512// NVPTXTargetLowering Constructor.
514 const NVPTXSubtarget &STI)
515 : TargetLowering(TM), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
516 // always lower memset, memcpy, and memmove intrinsics to load/store
517 // instructions, rather
518 // then generating calls to memset, mempcy or memmove.
522
525
526 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
527 // condition branches.
528 setJumpIsExpensive(true);
529
530 // Wide divides are _very_ slow. Try to reduce the width of the divide if
531 // possible.
532 addBypassSlowDiv(64, 32);
533
534 // By default, use the Source scheduling
535 if (sched4reg)
537 else
539
540 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
541 LegalizeAction NoF16Action) {
542 bool IsOpSupported = STI.allowFP16Math();
543 switch (Op) {
544 // Several FP16 instructions are available on sm_80 only.
545 case ISD::FMINNUM:
546 case ISD::FMAXNUM:
547 case ISD::FMAXNUM_IEEE:
548 case ISD::FMINNUM_IEEE:
549 case ISD::FMAXIMUM:
550 case ISD::FMINIMUM:
551 case ISD::FMAXIMUMNUM:
552 case ISD::FMINIMUMNUM:
553 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
554 break;
555 case ISD::FEXP2:
556 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
557 break;
558 }
559 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
560 };
561
562 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
563 LegalizeAction NoBF16Action) {
564 bool IsOpSupported = STI.hasNativeBF16Support(Op);
566 Op, VT, IsOpSupported ? Action : NoBF16Action);
567 };
568
569 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
570 LegalizeAction NoI16x2Action) {
571 bool IsOpSupported = false;
572 // i16x2 instructions are available on sm_90 only.
573 switch (Op) {
574 case ISD::ADD:
575 case ISD::SMAX:
576 case ISD::SMIN:
577 case ISD::UMIN:
578 case ISD::UMAX:
579 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
580 break;
581 }
582 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
583 };
584
585 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
586 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
587 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
588 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
589 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
590 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
591 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
592 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
593 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
594 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
595 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
596 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
597
598 if (STI.hasF32x2Instructions()) {
599 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
600 addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
601 }
602
603 // Conversion to/from FP16/FP16x2 is always legal.
608
609 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
610 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
611 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
612
613 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
614 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
615
616 // Conversion to/from BF16/BF16x2 is always legal.
621
622 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
623 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
624 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
625 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
626
627 // Conversion to/from i16/i16x2 is always legal.
632
637
638 // No support for these operations with v2f32/v2i32
639 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
640 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
641 // Need custom lowering in case the index is dynamic.
642 if (STI.hasF32x2Instructions())
643 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
644 Custom);
645
646 // Custom conversions to/from v2i8.
647 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
648
649 // Only logical ops can be done on v4i8 directly, others must be done
650 // elementwise.
667 MVT::v4i8, Expand);
668
669 // Operations not directly supported by NVPTX.
670 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
671 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
672 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
674 setOperationAction(ISD::BR_CC, VT, Expand);
675 }
676
677 // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
678 setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
679
680 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
681 // For others we will expand to a SHL/SRA pair.
688
695
698
700 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
701 Expand);
702
703 if (STI.hasHWROT32()) {
706 Custom);
707 }
708
710
711 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
712 setOperationAction(ISD::BRIND, MVT::Other, Expand);
713
714 // We want to legalize constant-related memmove and memcpy
715 // intrinsics.
717
718 // FP extload/truncstore is not legal in PTX. We need to expand all these.
719 for (auto FloatVTs :
721 for (MVT ValVT : FloatVTs) {
722 for (MVT MemVT : FloatVTs) {
723 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
724 setTruncStoreAction(ValVT, MemVT, Expand);
725 }
726 }
727 }
728
729 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
730 // how they'll be lowered in ISel anyway, and by doing this a little earlier
731 // we allow for more DAG combine opportunities.
732 for (auto IntVTs :
734 for (MVT ValVT : IntVTs)
735 for (MVT MemVT : IntVTs)
736 if (isTypeLegal(ValVT))
737 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
738
739 // PTX does not support load / store predicate registers
740 setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
741 for (MVT VT : MVT::integer_valuetypes()) {
743 Promote);
744 setTruncStoreAction(VT, MVT::i1, Expand);
745 }
746
747 // Disable generation of extload/truncstore for v2i16/v2i8. The generic
748 // expansion for these nodes when they are unaligned is incorrect if the
749 // type is a vector.
750 //
751 // TODO: Fix the generic expansion for these nodes found in
752 // TargetLowering::expandUnalignedLoad/Store.
754 MVT::v2i8, Expand);
755 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
756
757 // Register custom handling for illegal type loads/stores. We'll try to custom
758 // lower almost all illegal types and logic in the lowering will discard cases
759 // we can't handle.
760 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
762 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
763 setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
764
765 // Custom legalization for LDU intrinsics.
766 // TODO: The logic to lower these is not very robust and we should rewrite it.
767 // Perhaps LDU should not be represented as an intrinsic at all.
770 if (IsPTXVectorType(VT))
772
776 MVT::i1, Expand);
777
778 // This is legal in NVPTX
783
784 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
785 setOperationAction({ISD::STACKRESTORE, ISD::STACKSAVE}, MVT::Other, Custom);
786
787 // TRAP can be lowered to PTX trap
788 setOperationAction(ISD::TRAP, MVT::Other, Legal);
789 // DEBUGTRAP can be lowered to PTX brkpt
790 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
791
792 // Support varargs.
793 setOperationAction(ISD::VASTART, MVT::Other, Custom);
794 setOperationAction(ISD::VAARG, MVT::Other, Custom);
795 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
796 setOperationAction(ISD::VAEND, MVT::Other, Expand);
797
799 {MVT::i16, MVT::i32, MVT::i64}, Legal);
800
802 Promote);
805
806 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
807 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
808 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
809 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
810 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
811 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
812 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
813
814 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
815 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
816 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
817 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
818 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
819 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
820
821 // Other arithmetic and logic ops are unsupported.
825 {MVT::v2i16, MVT::v2i32}, Expand);
826
827 // v2i32 is not supported for any arithmetic operations
832 MVT::v2i32, Expand);
833
838 if (STI.getPTXVersion() >= 43) {
843 }
844
846 setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
849
850 // PTX does not directly support SELP of i1, so promote to i32 first
852
853 // PTX cannot multiply two i64s in a single instruction.
856
857 // We have some custom DAG combine patterns for these nodes
860 ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM,
861 ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM,
862 ISD::FMINIMUMNUM, ISD::MUL, ISD::SHL,
864 ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
865 ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
866
867 // setcc for f16x2 and bf16x2 needs special handling to prevent
868 // legalizer's attempt to scalarize it due to v2i1 not being legal.
869 if (STI.allowFP16Math() || STI.hasBF16Math())
871
872 // Vector reduction operations. These may be turned into shuffle or tree
873 // reductions depending on what instructions are available for each type.
875 MVT EltVT = VT.getVectorElementType();
876 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
877 setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
878 ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
879 VT, Custom);
880 }
881 }
882
883 // Promote fp16 arithmetic if fp16 hardware isn't available or the
884 // user passed --nvptx-no-fp16-math. The flag is useful because,
885 // although sm_53+ GPUs have some sort of FP16 support in
886 // hardware, only sm_53 and sm_60 have full implementation. Others
887 // only have a token amount of hardware and are likely to run faster
888 // by using fp32 units instead.
889 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
890 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
891 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
892 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
893 // bf16 must be promoted to f32.
894 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
895 if (getOperationAction(Op, MVT::bf16) == Promote)
896 AddPromotedToType(Op, MVT::bf16, MVT::f32);
897 setOperationAction(Op, MVT::v2f32,
898 STI.hasF32x2Instructions() ? Legal : Expand);
899 }
900
901 // On SM80, we select add/mul/sub as fma to avoid promotion to float
902 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
903 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
904 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
906 }
907 }
908 }
909
910 // f16/f16x2 neg was introduced in PTX 60, SM_53.
911 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
912 STI.getPTXVersion() >= 60 &&
913 STI.allowFP16Math();
914 for (const auto &VT : {MVT::f16, MVT::v2f16})
915 setOperationAction(ISD::FNEG, VT,
916 IsFP16FP16x2NegAvailable ? Legal : Expand);
917
918 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
919 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
920 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
921 // (would be) Library functions.
922
923 // These map to conversion instructions for scalar FP types.
924 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
925 ISD::FROUNDEVEN, ISD::FTRUNC}) {
926 setOperationAction(Op, MVT::f16, Legal);
927 setOperationAction(Op, MVT::f32, Legal);
928 setOperationAction(Op, MVT::f64, Legal);
929 setOperationAction(Op, MVT::v2f16, Expand);
930 setOperationAction(Op, MVT::v2bf16, Expand);
931 setOperationAction(Op, MVT::v2f32, Expand);
932 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
933 if (getOperationAction(Op, MVT::bf16) == Promote)
934 AddPromotedToType(Op, MVT::bf16, MVT::f32);
935 }
936
937 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
938 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
939 }
940 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
941 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
942 setOperationAction(ISD::FP_EXTEND, VT, Custom);
944 }
945 }
946
947 // Expand v2f32 = fp_extend
948 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
949 // Expand v2[b]f16 = fp_round v2f32
950 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
951
952 // sm_80 only has conversions between f32 and bf16. Custom lower all other
953 // bf16 conversions.
954 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
955 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
958 VT, Custom);
959 }
962 MVT::bf16, Custom);
963 }
964
965 setOperationAction(ISD::FROUND, MVT::f16, Promote);
966 setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
967 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
968 setOperationAction(ISD::FROUND, MVT::f32, Custom);
969 setOperationAction(ISD::FROUND, MVT::f64, Custom);
970 setOperationAction(ISD::FROUND, MVT::bf16, Promote);
971 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
972
973 // 'Expand' implements FCOPYSIGN without calling an external library.
980
981 // These map to corresponding instructions for f32/f64. f16 must be
982 // promoted to f32. v2f16 is expanded to f16, which is then promoted
983 // to f32.
984 for (const auto &Op :
985 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
986 setOperationAction(Op, MVT::f16, Promote);
987 setOperationAction(Op, MVT::f32, Legal);
988 // only div/rem/sqrt are legal for f64
989 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
990 setOperationAction(Op, MVT::f64, Legal);
991 }
992 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
993 setOperationAction(Op, MVT::bf16, Promote);
994 AddPromotedToType(Op, MVT::bf16, MVT::f32);
995 }
996 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
997
998 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
999 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
1000 if (STI.getPTXVersion() >= 65) {
1001 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
1002 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
1003 } else {
1004 setOperationAction(ISD::FABS, MVT::f16, Promote);
1005 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
1006 }
1007 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
1008 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
1009 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
1010 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
1011
1012 for (const auto &Op :
1013 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
1014 setOperationAction(Op, MVT::f32, Legal);
1015 setOperationAction(Op, MVT::f64, Legal);
1016 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1017 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1018 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1019 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1020 if (getOperationAction(Op, MVT::bf16) == Promote)
1021 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1022 setOperationAction(Op, MVT::v2f32, Expand);
1023 }
1024 bool SupportsF32MinMaxNaN =
1025 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1026 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1027 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1028 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1029 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1030 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1031 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1032 setOperationAction(Op, MVT::v2f32, Expand);
1033 }
1034
1035 // Custom lowering for inline asm with 128-bit operands
1038
1039 // FEXP2 support:
1040 // - f32
1041 // - f16/f16x2 (sm_70+, PTX 7.0+)
1042 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1043 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1044 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
1045 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1046 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1047 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1048 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1049 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1050
1051 // FLOG2 supports f32 only
1052 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1053 if (UseApproxLog2F32) {
1054 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
1055 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1056 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1057 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1058 Expand);
1059 }
1060
1061 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1062
1063 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1064
1065 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1066 // type, we need to custom lower it.
1067 setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
1068 Custom);
1069
1070 // Now deduce the information based on the above mentioned
1071 // actions
1072 computeRegisterProperties(STI.getRegisterInfo());
1073
1074 // PTX support for 16-bit CAS is emulated. Only use 32+
1075 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1076 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1078
1079 // Custom lowering for tcgen05.ld vector operands
1081 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1082 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1083 Custom);
1084
1085 // Custom lowering for tcgen05.st vector operands
1087 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1088 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},
1089 Custom);
1090
1091 // Enable custom lowering for the following:
1092 // * MVT::i128 - clusterlaunchcontrol
1093 // * MVT::i32 - prmt
1094 // * MVT::Other - internal.addrspace.wrap
1095 setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other},
1096 Custom);
1097}
1098
1099const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
1100
1101#define MAKE_CASE(V) \
1102 case V: \
1103 return #V;
1104
1105 switch ((NVPTXISD::NodeType)Opcode) {
1107 break;
1108
1161 MAKE_CASE(
1163 MAKE_CASE(
1175 MAKE_CASE(
1177 MAKE_CASE(
1179 }
1180 return nullptr;
1181
1182#undef MAKE_CASE
1183}
1184
1187 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1188 VT.getScalarType() == MVT::i1)
1189 return TypeSplitVector;
1191}
1192
1194 int Enabled, int &ExtraSteps,
1195 bool &UseOneConst,
1196 bool Reciprocal) const {
1199 return SDValue();
1200
1201 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1202 ExtraSteps = 0;
1203
1204 SDLoc DL(Operand);
1205 EVT VT = Operand.getValueType();
1206 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1207
1208 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1209 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1210 DAG.getConstant(IID, DL, MVT::i32), Operand);
1211 };
1212
1213 // The sqrt and rsqrt refinement processes assume we always start out with an
1214 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1215 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1216 // any refinement, we must return a regular sqrt.
1217 if (Reciprocal || ExtraSteps > 0) {
1218 if (VT == MVT::f32)
1219 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1220 : Intrinsic::nvvm_rsqrt_approx_f);
1221 else if (VT == MVT::f64)
1222 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1223 else
1224 return SDValue();
1225 } else {
1226 if (VT == MVT::f32)
1227 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1228 : Intrinsic::nvvm_sqrt_approx_f);
1229 else {
1230 // There's no sqrt.approx.f64 instruction, so we emit
1231 // reciprocal(rsqrt(x)). This is faster than
1232 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1233 // x * rsqrt(x).)
1234 return DAG.getNode(
1236 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1237 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1238 }
1239 }
1240}
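// In summary (an illustrative reading of the logic above): when an rsqrt is
// wanted (Reciprocal or ExtraSteps > 0), f32 maps to rsqrt.approx.f32 (or its
// .ftz variant) and f64 to rsqrt.approx.f64; a plain f32 sqrt maps to
// sqrt.approx.f32, and a plain f64 sqrt is emitted as
// rcp.approx.ftz.f64(rsqrt.approx.f64) since there is no sqrt.approx.f64.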
1241
1243 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1245 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1246 unsigned UniqueCallSite) const {
1247 auto PtrVT = getPointerTy(DL);
1248
1249 std::string Prototype;
1250 raw_string_ostream O(Prototype);
1251 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1252
1253 if (RetTy->isVoidTy()) {
1254 O << "()";
1255 } else {
1256 O << "(";
1257 if (shouldPassAsArray(RetTy)) {
1258 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1259 O << ".param .align " << RetAlign.value() << " .b8 _["
1260 << DL.getTypeAllocSize(RetTy) << "]";
1261 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1262 unsigned size = 0;
1263 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1264 size = ITy->getBitWidth();
1265 } else {
1266 assert(RetTy->isFloatingPointTy() &&
1267 "Floating point type expected here");
1268 size = RetTy->getPrimitiveSizeInBits();
1269 }
1270 // PTX ABI requires all scalar return values to be at least 32
1271 // bits in size. fp16 normally uses .b16 as its storage type in
1272 // PTX, so its size must be adjusted here, too.
1274
1275 O << ".param .b" << size << " _";
1276 } else if (isa<PointerType>(RetTy)) {
1277 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1278 } else {
1279 llvm_unreachable("Unknown return type");
1280 }
1281 O << ") ";
1282 }
1283 O << "_ (";
1284
1285 bool first = true;
1286
1287 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1288 auto AllOuts = ArrayRef(Outs);
1289 for (const unsigned I : llvm::seq(NumArgs)) {
1290 const auto ArgOuts =
1291 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1292 AllOuts = AllOuts.drop_front(ArgOuts.size());
1293
1294 Type *Ty = Args[I].Ty;
1295 if (!first) {
1296 O << ", ";
1297 }
1298 first = false;
1299
1300 if (ArgOuts[0].Flags.isByVal()) {
1301 // Indirect calls need strict ABI alignment so we disable optimizations by
1302 // not providing a function to optimize.
1303 Type *ETy = Args[I].IndirectType;
1304 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1305 Align ParamByValAlign =
1306 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1307
1308 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1309 << ArgOuts[0].Flags.getByValSize() << "]";
1310 } else {
1311 if (shouldPassAsArray(Ty)) {
1312 Align ParamAlign =
1313 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1314 O << ".param .align " << ParamAlign.value() << " .b8 _["
1315 << DL.getTypeAllocSize(Ty) << "]";
1316 continue;
1317 }
1318 // i8 types in IR will be i16 types in SDAG
1319 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1320 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1321 "type mismatch between callee prototype and arguments");
1322 // scalar type
1323 unsigned sz = 0;
1324 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1325 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1326 } else if (isa<PointerType>(Ty)) {
1327 sz = PtrVT.getSizeInBits();
1328 } else {
1329 sz = Ty->getPrimitiveSizeInBits();
1330 }
1331 O << ".param .b" << sz << " _";
1332 }
1333 }
1334
1335 if (FirstVAArg)
1336 O << (first ? "" : ",") << " .param .align "
1337 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1338 O << ")";
1339 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1340 O << " .noreturn";
1341 O << ";";
1342
1343 return Prototype;
1344}
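// For illustration, a hypothetical indirect call to a callee of type
// "float (i32, <4 x float>)" would get a prototype string roughly like
//   prototype_1 : .callprototype (.param .b32 _) _
//       (.param .b32 _, .param .align 16 .b8 _[16]);
// with ".noreturn" and the trailing vararg byte array only appended when
// applicable.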
1345
1347 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1348 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1349}
1350
1351Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1352 unsigned Idx,
1353 const DataLayout &DL) const {
1354 if (!CB) {
1355 // CallSite is zero, fall back to the ABI type alignment.
1356 return DL.getABITypeAlign(Ty);
1357 }
1358
1359 const Function *DirectCallee = CB->getCalledFunction();
1360
1361 if (!DirectCallee) {
1362 // We don't have a direct function symbol, but that may be because of
1363 // constant cast instructions in the call.
1364
1365 // With bitcast'd call targets, the instruction will be the call
1366 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1367 // Check if we have call alignment metadata
1368 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1369 return StackAlign.value();
1370 }
1371 DirectCallee = getMaybeBitcastedCallee(CB);
1372 }
1373
1374 // Check for function alignment information if we found that the
1375 // ultimate target is a Function
1376 if (DirectCallee)
1377 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1378
1379 // Call is indirect, fall back to the ABI type alignment
1380 return DL.getABITypeAlign(Ty);
1381}
1382
1384 const GlobalAddressSDNode *Func) {
1385 if (!Func)
1386 return false;
1387 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1388 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1389 return false;
1390}
1391
1393 const DataLayout &DL,
1394 const TargetLowering &TL) {
1395 if (Ptr->getOpcode() == ISD::FrameIndex) {
1396 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1399
1401 }
1402
1403 // Peel off an addrspacecast to generic and load directly from the specific
1404 // address space.
1405 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1406 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1407 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1408 Ptr = ASC->getOperand(0);
1409 return MachinePointerInfo(ASC->getSrcAddressSpace());
1410 }
1411 }
1412
1413 return MachinePointerInfo();
1414}
1415
1417 if (Flags.isSExt())
1418 return ISD::SIGN_EXTEND;
1419 if (Flags.isZExt())
1420 return ISD::ZERO_EXTEND;
1421 return ISD::ANY_EXTEND;
1422}
1423
1425 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1426 SDLoc dl) {
1427 const EVT ActualVT = V.getValueType();
1428 assert((ActualVT == ExpectedVT ||
1429 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1430 "Non-integer argument type size mismatch");
1431 if (ExpectedVT.bitsGT(ActualVT))
1432 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1433 if (ExpectedVT.bitsLT(ActualVT))
1434 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1435
1436 return V;
1437}
1438
1440 SmallVectorImpl<SDValue> &InVals) const {
1441
1442 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1444 "Support for variadic functions (unsized array parameter) introduced "
1445 "in PTX ISA version 6.0 and requires target sm_30.");
1446
1447 SelectionDAG &DAG = CLI.DAG;
1448 SDLoc dl = CLI.DL;
1449 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1450 SDValue Callee = CLI.Callee;
1451 ArgListTy &Args = CLI.getArgs();
1452 Type *RetTy = CLI.RetTy;
1453 const CallBase *CB = CLI.CB;
1454 const DataLayout &DL = DAG.getDataLayout();
1455 LLVMContext &Ctx = *DAG.getContext();
1456
1457 const auto GetI32 = [&](const unsigned I) {
1458 return DAG.getConstant(I, dl, MVT::i32);
1459 };
1460
1461 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1462 const SDValue CallChain = CLI.Chain;
1463 const SDValue StartChain =
1464 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1465 SDValue DeclareGlue = StartChain.getValue(1);
1466
1467 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1468
1469 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1470 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1471 // loaded/stored using i16, so it's handled here as well.
1472 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1473 SDValue Declare =
1474 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1475 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1476 CallPrereqs.push_back(Declare);
1477 DeclareGlue = Declare.getValue(1);
1478 return Declare;
1479 };
1480
1481 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1482 unsigned Size) {
1483 SDValue Declare = DAG.getNode(
1484 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1485 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1486 CallPrereqs.push_back(Declare);
1487 DeclareGlue = Declare.getValue(1);
1488 return Declare;
1489 };
1490
1491 // Variadic arguments.
1492 //
1493 // Normally, for each argument, we declare a param scalar or a param
1494 // byte array in the .param space, and store the argument value to that
1495 // param scalar or array starting at offset 0.
1496 //
1497 // In the case of the first variadic argument, we declare a vararg byte array
1498 // with size 0. The exact size of this array isn't known at this point, so
1499 // it'll be patched later. All the variadic arguments will be stored to this
1500 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1501 // initially set to 0, so it can be used for non-variadic arguments (which use
1502 // 0 offset) to simplify the code.
1503 //
1504 // After all varargs are processed, 'VAOffset' holds the size of the
1505 // vararg byte array.
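// For example (illustrative): calling a callee of type (i32, ...) with
// (i32 7, i32 1, double 2.0) gives argument 0 its own .param, while the two
// variadic values are stored into the vararg array at offsets 0 and 8 (the
// double is realigned to 8 bytes), leaving VAOffset == 16 at the end.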
1506 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1507 "Non-VarArg function with extra arguments");
1508
1509 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1510 unsigned VAOffset = 0; // current offset in the param array
1511
1512 const SDValue VADeclareParam =
1513 CLI.Args.size() > FirstVAArg
1514 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1515 Align(STI.getMaxRequiredAlignment()), 0)
1516 : SDValue();
1517
1518 // Args.size() and Outs.size() need not match.
1519 // Outs.size() will be larger
1520 // * if there is an aggregate argument with multiple fields (each field
1521 // showing up separately in Outs)
1522 // * if there is a vector argument with more than typical vector-length
1523 // elements (generally if more than 4) where each vector element is
1524 // individually present in Outs.
1525 // So a different index should be used for indexing into Outs/OutVals.
1526 // See similar issue in LowerFormalArguments.
1527 auto AllOuts = ArrayRef(CLI.Outs);
1528 auto AllOutVals = ArrayRef(CLI.OutVals);
1529 assert(AllOuts.size() == AllOutVals.size() &&
1530 "Outs and OutVals must be the same size");
1531 // Declare the .param or .reg variables needed to pass values
1532 // to the function.
1533 for (const auto E : llvm::enumerate(Args)) {
1534 const auto ArgI = E.index();
1535 const auto Arg = E.value();
1536 const auto ArgOuts =
1537 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1538 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1539 AllOuts = AllOuts.drop_front(ArgOuts.size());
1540 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1541
1542 const bool IsVAArg = (ArgI >= FirstVAArg);
1543 const bool IsByVal = Arg.IsByVal;
1544
1545 const SDValue ParamSymbol =
1546 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1547
1548 assert((!IsByVal || Arg.IndirectType) &&
1549 "byval arg must have indirect type");
1550 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1551
1552 const Align ArgAlign = [&]() {
1553 if (IsByVal) {
1554 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1555 // so we don't need to worry whether it's naturally aligned or not.
1556 // See TargetLowering::LowerCallTo().
1557 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1559 InitialAlign, DL);
1560 }
1561 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1562 }();
1563
1564 const unsigned TySize = DL.getTypeAllocSize(ETy);
1565 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1566 "type size mismatch");
1567
1568 const SDValue ArgDeclare = [&]() {
1569 if (IsVAArg)
1570 return VADeclareParam;
1571
1572 if (IsByVal || shouldPassAsArray(Arg.Ty))
1573 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1574
1575 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1576 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1577 "Only int and float types are supported as non-array arguments");
1578
1579 return MakeDeclareScalarParam(ParamSymbol, TySize);
1580 }();
1581
1582 if (IsByVal) {
1583 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1584 SDValue SrcPtr = ArgOutVals[0];
1585 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1586 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1587
1588 if (IsVAArg)
1589 VAOffset = alignTo(VAOffset, ArgAlign);
1590
1591 SmallVector<EVT, 4> ValueVTs, MemVTs;
1593 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1594
1595 unsigned J = 0;
1596 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1597 for (const unsigned NumElts : VI) {
1598 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1599 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1600 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1601 SDValue SrcLoad =
1602 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1603
1604 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1605 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1606 SDValue ParamAddr =
1607 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1608 SDValue StoreParam =
1609 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1611 CallPrereqs.push_back(StoreParam);
1612
1613 J += NumElts;
1614 }
1615 if (IsVAArg)
1616 VAOffset += TySize;
1617 } else {
1620 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1621 VAOffset);
1622 assert(VTs.size() == Offsets.size() && "Size mismatch");
1623 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1624
1625 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1626 // than 32-bits are sign extended or zero extended, depending on
1627 // whether they are signed or unsigned types. This case applies
1628 // only to scalar parameters and not to aggregate values.
1629 const bool ExtendIntegerParam =
1630 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1631
1632 const auto GetStoredValue = [&](const unsigned I) {
1633 SDValue StVal = ArgOutVals[I];
1635 StVal.getValueType() &&
1636 "OutVal type should always be legal");
1637
1638 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1639 const EVT StoreVT =
1640 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1641
1642 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1643 };
1644
1645 unsigned J = 0;
1646 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1647 for (const unsigned NumElts : VI) {
1648 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1649
1650 unsigned Offset;
1651 if (IsVAArg) {
1652 // TODO: We may need to support vector types that can be passed
1653 // as scalars in variadic arguments.
1654 assert(NumElts == 1 &&
1655 "Vectorization should be disabled for vaargs.");
1656
1657 // Align each part of the variadic argument to its type.
1658 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1659 Offset = VAOffset;
1660
1661 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1662 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1663 } else {
1664 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1665 Offset = Offsets[J];
1666 }
1667
1668 SDValue Ptr =
1669 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1670
1671 const MaybeAlign CurrentAlign = ExtendIntegerParam
1672 ? MaybeAlign(std::nullopt)
1673 : commonAlignment(ArgAlign, Offset);
1674
1675 SDValue Val =
1676 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1677 return GetStoredValue(J + K);
1678 });
1679
1680 SDValue StoreParam =
1681 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1683 CallPrereqs.push_back(StoreParam);
1684
1685 J += NumElts;
1686 }
1687 }
1688 }
1689
1690 // Handle Result
1691 if (!Ins.empty()) {
1692 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1693 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1694 if (shouldPassAsArray(RetTy)) {
1695 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1696 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1697 } else {
1698 MakeDeclareScalarParam(RetSymbol, ResultSize);
1699 }
1700 }
1701
1702 // Set the size of the vararg param byte array if the callee is a variadic
1703 // function and the variadic part is not empty.
1704 if (VADeclareParam) {
1705 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1706 VADeclareParam.getOperand(1),
1707 VADeclareParam.getOperand(2), GetI32(VAOffset),
1708 VADeclareParam.getOperand(4)};
1709 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1710 VADeclareParam->getVTList(), DeclareParamOps);
1711 }
1712
1713 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1714 // If the type of the callsite does not match that of the function, convert
1715 // the callsite to an indirect call.
1716 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1717
1718 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1719 // between them we must rely on the call site value which is valid for
1720 // indirect calls but is always null for libcalls.
1721 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1722
1723 if (isa<ExternalSymbolSDNode>(Callee)) {
1724 Function* CalleeFunc = nullptr;
1725
1726 // Try to find the callee in the current module.
1727 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1728 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1729
1730 // Set the "libcall callee" attribute to indicate that the function
1731 // must always have a declaration.
1732 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1733 }
1734
1735 if (IsIndirectCall) {
1736 // This is the indirect function call case: PTX requires a prototype of the
1737 // form
1738 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1739 // to be emitted, and the label has to be used as the last arg of the call
1740 // instruction.
1741 // The prototype is embedded in a string and put as the operand for a
1742 // CallPrototype SDNode which will print out to the value of the string.
1743 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1744 std::string Proto =
1745 getPrototype(DL, RetTy, Args, CLI.Outs,
1746 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1747 UniqueCallSite);
1748 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1749 const SDValue PrototypeDeclare = DAG.getNode(
1750 NVPTXISD::CallPrototype, dl, MVT::Other,
1751 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1752 CallPrereqs.push_back(PrototypeDeclare);
1753 }
1754
1755 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1756 const unsigned NumArgs =
1757 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1758 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1759 /// NumParams, Callee, Proto)
1760 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1761 const SDValue Call = DAG.getNode(
1762 NVPTXISD::CALL, dl, MVT::Other,
1763 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1764 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1765
1766 SmallVector<SDValue, 16> LoadChains{Call};
1767 SmallVector<SDValue, 16> ProxyRegOps;
1768 if (!Ins.empty()) {
1771 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1772 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1773
1774 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1775 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1776
1777 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1778 // 32-bits are sign extended or zero extended, depending on whether
1779 // they are signed or unsigned types.
1780 const bool ExtendIntegerRetVal =
1781 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1782
1783 unsigned I = 0;
1784 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1785 for (const unsigned NumElts : VI) {
1786 const MaybeAlign CurrentAlign =
1787 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1788 : commonAlignment(RetAlign, Offsets[I]);
1789
1790 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1791 const EVT LoadVT =
1792 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1793 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1794 SDValue Ptr =
1795 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1796
1797 SDValue R =
1798 DAG.getLoad(VecVT, dl, Call, Ptr,
1800
1801 LoadChains.push_back(R.getValue(1));
1802 for (const unsigned J : llvm::seq(NumElts))
1803 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1804 I += NumElts;
1805 }
1806 }
1807
1808 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1809 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1810 UniqueCallSite + 1, SDValue(), dl);
1811
1812 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1813 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1814 // dangling.
1815 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1816 SDValue Proxy =
1817 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1818 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1819 InVals.push_back(Ret);
1820 }
1821
1822 // Set IsTailCall to false for now, until we figure out how to express
1823 // tail call optimization in PTX.
1824 CLI.IsTailCall = false;
1825 return CallEnd;
1826}
1827
1828 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1829 SelectionDAG &DAG) const {
1830
1831 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1832 const Function &Fn = DAG.getMachineFunction().getFunction();
1833
1834 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1835 Fn,
1836 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1837 "requires target sm_52.",
1838 SDLoc(Op).getDebugLoc()));
1839 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1840 Op.getOperand(0)};
1841 return DAG.getMergeValues(Ops, SDLoc());
1842 }
1843
1844 SDLoc DL(Op.getNode());
1845 SDValue Chain = Op.getOperand(0);
1846 SDValue Size = Op.getOperand(1);
1847 uint64_t Align = Op.getConstantOperandVal(2);
1848
1849 // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1850 // the default stack alignment should be used.
1851 if (Align == 0)
1853
1854 // The size for the PTX alloca instruction is 64-bit for m64 and 32-bit for m32.
1855 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1856
1857 SDValue Alloc =
1858 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1859 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1860 DAG.getTargetConstant(Align, DL, MVT::i32)});
1861
1862 SDValue ASC = DAG.getAddrSpaceCast(
1863 DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1864
1865 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1866}
1867
1868 SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
1869 SelectionDAG &DAG) const {
1870 SDLoc DL(Op.getNode());
1871 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1872 const Function &Fn = DAG.getMachineFunction().getFunction();
1873
1874 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1875 Fn,
1876 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1877 ">= sm_52.",
1878 DL.getDebugLoc()));
1879 return Op.getOperand(0);
1880 }
1881
1882 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1883 SDValue Chain = Op.getOperand(0);
1884 SDValue Ptr = Op.getOperand(1);
1887 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1888}
1889
1890 SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op,
1891 SelectionDAG &DAG) const {
1892 SDLoc DL(Op.getNode());
1893 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1894 const Function &Fn = DAG.getMachineFunction().getFunction();
1895
1896 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1897 Fn,
1898 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1899 "sm_52.",
1900 DL.getDebugLoc()));
1901 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1902 return DAG.getMergeValues(Ops, DL);
1903 }
1904
1905 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1906 SDValue Chain = Op.getOperand(0);
1907 SDValue SS =
1908 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1909 SDValue ASC = DAG.getAddrSpaceCast(
1910 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1911 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1912}
1913
1914// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1915// (see LegalizeDAG.cpp). This is slow and uses local memory.
1916// We use extract/insert/build vector instead, much as LegalizeOp() did in LLVM 2.5.
1917SDValue
1918NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1919 SDNode *Node = Op.getNode();
1920 SDLoc dl(Node);
1922 unsigned NumOperands = Node->getNumOperands();
1923 for (unsigned i = 0; i < NumOperands; ++i) {
1924 SDValue SubOp = Node->getOperand(i);
1925 EVT VVT = SubOp.getNode()->getValueType(0);
1926 EVT EltVT = VVT.getVectorElementType();
1927 unsigned NumSubElem = VVT.getVectorNumElements();
1928 for (unsigned j = 0; j < NumSubElem; ++j) {
1929 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1930 DAG.getIntPtrConstant(j, dl)));
1931 }
1932 }
1933 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1934}
1935
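// Helpers for building NVPTXISD::PRMT nodes. PRMT selects four arbitrary
// bytes from the byte pair {B, A}: the low three bits of each selector nibble
// pick one of the eight source bytes (0-3 from A, 4-7 from B), and setting a
// nibble's top bit instead replicates that byte's sign bit across the result
// byte.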
1937 SelectionDAG &DAG,
1938 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1939 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1940 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1941 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1942 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1943}
1944
1946 SelectionDAG &DAG,
1947 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1948 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1949}
1950
1951/// Reduces the elements using the scalar operations provided. The operations
1952/// are sorted descending in number of inputs they take. The flags on the
1953/// original reduction operation will be propagated to each scalar operation.
1954/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
1955/// used in ExpandReductions and SelectionDAG.
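/// For example, reducing 7 elements with Ops = {{fmax3, 3}, {fmax, 2}} yields
/// fmax3(fmax3(e0, e1, e2), fmax3(e3, e4, e5), e6): full groups are reduced
/// first and the leftover element is carried over into the next level.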
1957 const SmallVector<SDValue> &Elements, EVT EltTy,
1958 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1959 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1960 // Build the reduction tree at each level, starting with all the elements.
1961 SmallVector<SDValue> Level = Elements;
1962
1963 unsigned OpIdx = 0;
1964 while (Level.size() > 1) {
1965 // Try to reduce this level using the current operator.
1966 const auto [Op, NumInputs] = Ops[OpIdx];
1967
1968 // Build the next level by partially reducing all elements.
1969 SmallVector<SDValue> ReducedLevel;
1970 unsigned I = 0, E = Level.size();
1971 for (; I + NumInputs <= E; I += NumInputs) {
1972 // Reduce elements in groups of [NumInputs], as much as possible.
1973 ReducedLevel.push_back(DAG.getNode(
1974 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1975 }
1976
1977 if (I < E) {
1978 // Handle leftover elements.
1979
1980 if (ReducedLevel.empty()) {
1981 // We didn't reduce anything at this level. We need to pick a smaller
1982 // operator.
1983 ++OpIdx;
1984 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1985 continue;
1986 }
1987
1988 // We reduced some things but there's still more left, meaning the
1989 // operator's number of inputs doesn't evenly divide this level size. Move
1990 // these elements to the next level.
1991 for (; I < E; ++I)
1992 ReducedLevel.push_back(Level[I]);
1993 }
1994
1995 // Process the next level.
1996 Level = ReducedLevel;
1997 }
1998
1999 return *Level.begin();
2000}
2001
2002// Get scalar reduction opcode
2003static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
2004 switch (ReductionOpcode) {
2005 case ISD::VECREDUCE_FMAX:
2006 return ISD::FMAXNUM;
2007 case ISD::VECREDUCE_FMIN:
2008 return ISD::FMINNUM;
2009 case ISD::VECREDUCE_FMAXIMUM:
2010 return ISD::FMAXIMUM;
2011 case ISD::VECREDUCE_FMINIMUM:
2012 return ISD::FMINIMUM;
2013 default:
2014 llvm_unreachable("unhandled reduction opcode");
2015 }
2016}
2017
2018/// Get 3-input scalar reduction opcode
2019static std::optional<NVPTXISD::NodeType>
2020getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
2021 switch (ReductionOpcode) {
2022 case ISD::VECREDUCE_FMAX:
2023 return NVPTXISD::FMAXNUM3;
2024 case ISD::VECREDUCE_FMIN:
2025 return NVPTXISD::FMINNUM3;
2026 case ISD::VECREDUCE_FMAXIMUM:
2027 return NVPTXISD::FMAXIMUM3;
2028 case ISD::VECREDUCE_FMINIMUM:
2029 return NVPTXISD::FMINIMUM3;
2030 default:
2031 return std::nullopt;
2032 }
2033}
2034
2035/// Lower reductions to either a sequence of operations or a tree if
2036/// reassociations are allowed. This method will use larger operations like
2037/// max3/min3 when the target supports them.
2038SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
2039 SelectionDAG &DAG) const {
2040 SDLoc DL(Op);
2041 const SDNodeFlags Flags = Op->getFlags();
2042 SDValue Vector = Op.getOperand(0);
2043
2044 const unsigned Opcode = Op->getOpcode();
2045 const EVT EltTy = Vector.getValueType().getVectorElementType();
2046
2047 // Whether we can use 3-input min/max when expanding the reduction.
2048 const bool CanUseMinMax3 =
2049 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2050 STI.getPTXVersion() >= 88 &&
2051 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2052 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2053
2054 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2055 // number of inputs they take.
2056 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2057
2058 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2059 CanUseMinMax3 && Opcode3Elem)
2060 ScalarOps.push_back({*Opcode3Elem, 3});
2061 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2062
2064 DAG.ExtractVectorElements(Vector, Elements);
2065
2066 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2067}
2068
2069SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2070 // Handle bitcasting from v2i8 without hitting the default promotion
2071 // strategy which goes through stack memory.
2072 EVT FromVT = Op->getOperand(0)->getValueType(0);
2073 if (FromVT != MVT::v2i8) {
2074 return Op;
2075 }
2076
2077 // Pack vector elements into i16 and bitcast to final type
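 // For example, <2 x i8> {0xAA, 0xBB} becomes the i16 0xBBAA: element 0 lands
 // in the low byte and element 1 in the high byte.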
2078 SDLoc DL(Op);
2079 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2080 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2081 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2082 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2083 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2084 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2085 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2086 SDValue AsInt = DAG.getNode(
2087 ISD::OR, DL, MVT::i16,
2088 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2089 EVT ToVT = Op->getValueType(0);
2090 return DAG.getBitcast(ToVT, AsInt);
2091}
2092
2093// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2094// would get lowered as two constant loads and vector-packing move.
2095// Instead we want just a constant move:
2096// mov.b32 %r2, 0x40003C00
2097SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2098 SelectionDAG &DAG) const {
2099 EVT VT = Op->getValueType(0);
2100 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2101 return Op;
2102 SDLoc DL(Op);
2103
2104 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2105 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2106 isa<ConstantFPSDNode>(Operand);
2107 })) {
2108 if (VT != MVT::v4i8)
2109 return Op;
2110 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2111 // to optimize calculation of constant parts.
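 // As a sketch of the byte flow: with selector 0x3340 each inner PRMT places
 // the first operand's byte 0 in result byte 0 and the second operand's byte 0
 // in result byte 1 (the upper result bytes are don't-care). Selector 0x5410
 // then merges the low halves of the two intermediates, yielding
 // {op3, op2, op1, op0} from the most to the least significant byte.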
2112 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2113 uint64_t SelectionValue) -> SDValue {
2114 SDValue L = Left;
2115 SDValue R = Right;
2116 if (Cast) {
2117 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2118 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2119 }
2120 return getPRMT(L, R, SelectionValue, DL, DAG);
2121 };
2122 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2123 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2124 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2125 return DAG.getBitcast(VT, PRMT3210);
2126 }
2127
2128 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2129 auto GetOperand = [](SDValue Op, int N) -> APInt {
2130 const SDValue &Operand = Op->getOperand(N);
2131 EVT VT = Op->getValueType(0);
2132 if (Operand->isUndef())
2133 return APInt(32, 0);
2134 APInt Value;
2135 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2136 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2137 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2138 Value = Operand->getAsAPIntVal();
2139 else
2140 llvm_unreachable("Unsupported type");
2141 // i8 values are carried around as i16, so we need to zero out upper bits,
2142 // so they do not get in the way of combining individual byte values
2143 if (VT == MVT::v4i8)
2144 Value = Value.trunc(8);
2145 return Value.zext(32);
2146 };
2147
2148 // Construct a 32-bit constant by shifting into place smaller values
2149 // (elements of the vector type VT).
2150 // For example, if VT has 2 elements, then N == 2:
2151 // ShiftAmount = 32 / N = 16
2152 // Value |= Op0 (b16) << 0
2153 // Value |= Op1 (b16) << 16
2154 // If N == 4:
2155 // ShiftAmount = 32 / N = 8
2156 // Value |= Op0 (b8) << 0
2157 // Value |= Op1 (b8) << 8
2158 // Value |= Op2 (b8) << 16
2159 // Value |= Op3 (b8) << 24
2160 // ...etc
2161 APInt Value(32, 0);
2162 const unsigned NumElements = VT.getVectorNumElements();
2163 assert(32 % NumElements == 0 && "must evenly divide bit length");
2164 const unsigned ShiftAmount = 32 / NumElements;
2165 for (unsigned ElementNo : seq(NumElements))
2166 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2167 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2168 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2169}
2170
2171SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2172 SelectionDAG &DAG) const {
2173 SDValue Index = Op->getOperand(1);
2174 SDValue Vector = Op->getOperand(0);
2175 SDLoc DL(Op);
2176 EVT VectorVT = Vector.getValueType();
2177
2178 if (VectorVT == MVT::v4i8) {
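 // ORing the index into 0x7770 builds a selector that moves byte `Index` of
 // the vector into the low result byte, while the remaining result bytes take
 // byte 7 of the second source (the constant zero). The extracted element thus
 // comes back already zero-extended within the i32.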
2179 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2180 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2181 DAG.getConstant(0x7770, DL, MVT::i32));
2182 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2183 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2184 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2185 SDNodeFlags Flags;
2186 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2187 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2188 Ext->setFlags(Flags);
2189 return Ext;
2190 }
2191
2192 // Constant index will be matched by tablegen.
2193 if (isa<ConstantSDNode>(Index.getNode()))
2194 return Op;
2195
2196 // Extract individual elements and select one of them.
2197 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2198 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2199 EVT EltVT = VectorVT.getVectorElementType();
2200
2201 SDLoc dl(Op.getNode());
2202 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2203 DAG.getIntPtrConstant(0, dl));
2204 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2205 DAG.getIntPtrConstant(1, dl));
2206 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2208}
2209
2210SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2211 SelectionDAG &DAG) const {
2212 SDValue Vector = Op->getOperand(0);
2213 EVT VectorVT = Vector.getValueType();
2214
2215 if (VectorVT != MVT::v4i8)
2216 return Op;
2217 SDLoc DL(Op);
2218 SDValue Value = Op->getOperand(1);
2219 if (Value->isUndef())
2220 return Vector;
2221
2222 SDValue Index = Op->getOperand(2);
2223
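 // BFI (bit-field insert) writes the low `len` bits of its first operand into
 // the second operand starting at bit `pos`. Here the new byte is inserted at
 // bit offset Index * 8 with a width of 8 bits, leaving the other three bytes
 // of the v4i8 value untouched.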
2224 SDValue BFI =
2225 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2226 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2227 DAG.getNode(ISD::MUL, DL, MVT::i32,
2228 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2229 DAG.getConstant(8, DL, MVT::i32)),
2230 DAG.getConstant(8, DL, MVT::i32)});
2231 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2232}
2233
2234SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2235 SelectionDAG &DAG) const {
2236 SDValue V1 = Op.getOperand(0);
2237 EVT VectorVT = V1.getValueType();
2238 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2239 return Op;
2240
2241 // Lower shuffle to PRMT instruction.
2242 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2243 SDValue V2 = Op.getOperand(1);
2244 uint32_t Selector = 0;
2245 for (auto I : llvm::enumerate(SVN->getMask())) {
2246 if (I.value() != -1) // -1 is a placeholder for undef.
2247 Selector |= (I.value() << (I.index() * 4));
2248 }
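 // For example, a mask of <3, 0, 7, 4> produces the selector 0x4703, selecting
 // bytes {V1[3], V1[0], V2[3], V2[0]} from least to most significant.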
2249
2250 SDLoc DL(Op);
2251 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2252 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2253 return DAG.getBitcast(Op.getValueType(), PRMT);
2254}
2255/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2256/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2257/// amount, or
2258/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2259/// amount.
2260SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2261 SelectionDAG &DAG) const {
2262 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2263 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2264
2265 EVT VT = Op.getValueType();
2266 unsigned VTBits = VT.getSizeInBits();
2267 SDLoc dl(Op);
2268 SDValue ShOpLo = Op.getOperand(0);
2269 SDValue ShOpHi = Op.getOperand(1);
2270 SDValue ShAmt = Op.getOperand(2);
2271 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2272
2273 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2274 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2275 // {dHi, dLo} = {aHi, aLo} >> Amt
2276 // dHi = aHi >> Amt
2277 // dLo = shf.r.clamp aLo, aHi, Amt
2278
2279 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2280 SDValue Lo =
2281 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2282
2283 SDValue Ops[2] = { Lo, Hi };
2284 return DAG.getMergeValues(Ops, dl);
2285 }
2286 else {
2287 // {dHi, dLo} = {aHi, aLo} >> Amt
2288 // - if (Amt>=size) then
2289 // dLo = aHi >> (Amt-size)
2290 // dHi = aHi >> Amt (this is either all 0 or all 1)
2291 // else
2292 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2293 // dHi = aHi >> Amt
2294
2295 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2296 DAG.getConstant(VTBits, dl, MVT::i32),
2297 ShAmt);
2298 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2299 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2300 DAG.getConstant(VTBits, dl, MVT::i32));
2301 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2302 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2303 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2304
2305 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2306 DAG.getConstant(VTBits, dl, MVT::i32),
2307 ISD::SETGE);
2308 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2309 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2310
2311 SDValue Ops[2] = { Lo, Hi };
2312 return DAG.getMergeValues(Ops, dl);
2313 }
2314}
2315
2316/// LowerShiftLeftParts - Lower SHL_PARTS, which
2317/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2318/// amount, or
2319/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2320/// amount.
2321SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2322 SelectionDAG &DAG) const {
2323 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2324 assert(Op.getOpcode() == ISD::SHL_PARTS);
2325
2326 EVT VT = Op.getValueType();
2327 unsigned VTBits = VT.getSizeInBits();
2328 SDLoc dl(Op);
2329 SDValue ShOpLo = Op.getOperand(0);
2330 SDValue ShOpHi = Op.getOperand(1);
2331 SDValue ShAmt = Op.getOperand(2);
2332
2333 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2334 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2335 // {dHi, dLo} = {aHi, aLo} << Amt
2336 // dHi = shf.l.clamp aLo, aHi, Amt
2337 // dLo = aLo << Amt
2338
2339 SDValue Hi =
2340 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2341 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2342
2343 SDValue Ops[2] = { Lo, Hi };
2344 return DAG.getMergeValues(Ops, dl);
2345 }
2346 else {
2347 // {dHi, dLo} = {aHi, aLo} << Amt
2348 // - if (Amt>=size) then
2349 // dLo = aLo << Amt (all 0)
2350 // dLo = aLo << (Amt-size)
2351 // else
2352 // dLo = aLo << Amt
2353 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2354
2355 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2356 DAG.getConstant(VTBits, dl, MVT::i32),
2357 ShAmt);
2358 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2359 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2360 DAG.getConstant(VTBits, dl, MVT::i32));
2361 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2362 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2363 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2364
2365 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2366 DAG.getConstant(VTBits, dl, MVT::i32),
2367 ISD::SETGE);
2368 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2369 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2370
2371 SDValue Ops[2] = { Lo, Hi };
2372 return DAG.getMergeValues(Ops, dl);
2373 }
2374}
2375
2376/// If the types match, convert the generic copysign to the NVPTXISD version,
2377/// otherwise bail, ensuring that mismatched cases are properly expanded.
2378SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2379 SelectionDAG &DAG) const {
2380 EVT VT = Op.getValueType();
2381 SDLoc DL(Op);
2382
2383 SDValue In1 = Op.getOperand(0);
2384 SDValue In2 = Op.getOperand(1);
2385 EVT SrcVT = In2.getValueType();
2386
2387 if (!SrcVT.bitsEq(VT))
2388 return SDValue();
2389
2390 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2391}
2392
2393SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2394 EVT VT = Op.getValueType();
2395
2396 if (VT == MVT::f32)
2397 return LowerFROUND32(Op, DAG);
2398
2399 if (VT == MVT::f64)
2400 return LowerFROUND64(Op, DAG);
2401
2402 llvm_unreachable("unhandled type");
2403}
2404
2405// This is the rounding method used in CUDA libdevice, written as C-like code:
2406// float roundf(float A)
2407// {
2408// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2409// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2410// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2411// }
2412SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2413 SelectionDAG &DAG) const {
2414 SDLoc SL(Op);
2415 SDValue A = Op.getOperand(0);
2416 EVT VT = Op.getValueType();
2417
2418 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2419
2420 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2421 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2422 const unsigned SignBitMask = 0x80000000;
2423 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2424 DAG.getConstant(SignBitMask, SL, MVT::i32));
2425 const unsigned PointFiveInBits = 0x3F000000;
2426 SDValue PointFiveWithSignRaw =
2427 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2428 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2429 SDValue PointFiveWithSign =
2430 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2431 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2432 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2433
2434 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2435 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2436 SDValue IsLarge =
2437 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2438 ISD::SETOGT);
2439 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2440
2441 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2442 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2443 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2444 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2445 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2446}
2447
2448// The implementation of round(double) is similar to that of round(float) in
2449// that they both separate the value range into three regions and use a method
2450// specific to the region to round the values. However, round(double) first
2451// calculates the round of the absolute value and then adds the sign back while
2452// round(float) directly rounds the value with sign.
2453SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2454 SelectionDAG &DAG) const {
2455 SDLoc SL(Op);
2456 SDValue A = Op.getOperand(0);
2457 EVT VT = Op.getValueType();
2458
2459 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2460
2461 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2462 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2463 DAG.getConstantFP(0.5, SL, VT));
2464 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2465
2466 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2467 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2468 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2469 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2470 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2471 DAG.getConstantFP(0, SL, VT),
2472 RoundedA);
2473
2474 // Add sign to rounded_A
2475 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2476 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2477
2478 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2479 SDValue IsLarge =
2480 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2481 ISD::SETOGT);
2482 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2483}
2484
2486 EVT VT = N->getValueType(0);
2487 EVT NVT = MVT::f32;
2488 if (VT.isVector()) {
2489 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2490 }
2491 SDLoc DL(N);
2492 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2493 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2494 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2495 return DAG.getFPExtendOrRound(Res, DL, VT);
2496}
2497
2498SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2499 SelectionDAG &DAG) const {
2500 if (useF32FTZ(DAG.getMachineFunction())) {
2501 return PromoteBinOpToF32(Op.getNode(), DAG);
2502 }
2503 return Op;
2504}
2505
2506SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2507 SelectionDAG &DAG) const {
2508 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2509
2510 if (Op.getValueType() == MVT::bf16) {
2511 SDLoc Loc(Op);
2512 return DAG.getNode(
2513 ISD::FP_ROUND, Loc, MVT::bf16,
2514 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2515 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2516 }
2517
2518 // Everything else is considered legal.
2519 return Op;
2520}
2521
2522SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2523 SelectionDAG &DAG) const {
2524 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2525
2526 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2527 SDLoc Loc(Op);
2528 return DAG.getNode(
2529 Op.getOpcode(), Loc, Op.getValueType(),
2530 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2531 }
2532
2533 // Everything else is considered legal.
2534 return Op;
2535}
2536
2537SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2538 SelectionDAG &DAG) const {
2539 EVT NarrowVT = Op.getValueType();
2540 SDValue Wide = Op.getOperand(0);
2541 EVT WideVT = Wide.getValueType();
2542 if (NarrowVT.getScalarType() == MVT::bf16) {
2543 const TargetLowering *TLI = STI.getTargetLowering();
2544 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2545 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2546 }
2547 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2548 // This combination was the first to support f32 -> bf16.
2549 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2550 if (WideVT.getScalarType() == MVT::f32) {
2551 return Op;
2552 }
2553 if (WideVT.getScalarType() == MVT::f64) {
2554 SDLoc Loc(Op);
2555 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2556 // the hardware f32 -> bf16 instruction.
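 // Rounding the f64 -> f32 step to odd avoids the classic double-rounding
 // problem: the intermediate f32 never lands on a value that would tip the
 // final f32 -> bf16 rounding the wrong way.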
2558 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2559 : MVT::f32,
2560 Wide, Loc, DAG);
2561 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2562 }
2563 }
2564 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2565 }
2566 }
2567
2568 // Everything else is considered legal.
2569 return Op;
2570}
2571
2572SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2573 SelectionDAG &DAG) const {
2574 SDValue Narrow = Op.getOperand(0);
2575 EVT NarrowVT = Narrow.getValueType();
2576 EVT WideVT = Op.getValueType();
2577 if (NarrowVT.getScalarType() == MVT::bf16) {
2578 if (WideVT.getScalarType() == MVT::f32 &&
2579 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2580 SDLoc Loc(Op);
2581 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2582 }
2583 if (WideVT.getScalarType() == MVT::f64 &&
2584 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2585 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2586 : MVT::f32;
2587 SDLoc Loc(Op);
2588 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2589 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2590 } else {
2591 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2592 }
2593 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2594 }
2595 }
2596
2597 // Everything else is considered legal.
2598 return Op;
2599}
2600
2602 SDLoc DL(Op);
2603 if (Op.getValueType() != MVT::v2i16)
2604 return Op;
2605 EVT EltVT = Op.getValueType().getVectorElementType();
2606 SmallVector<SDValue> VecElements;
2607 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2608 SmallVector<SDValue> ScalarArgs;
2609 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2610 [&](const SDUse &O) {
2611 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2612 O.get(), DAG.getIntPtrConstant(I, DL));
2613 });
2614 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2615 }
2616 SDValue V =
2617 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2618 return V;
2619}
2620
2622 SDNode *N = Op.getNode();
2623 SDLoc DL(N);
2625
2626 // split the vector argument
2627 for (size_t I = 0; I < N->getNumOperands(); I++) {
2628 SDValue Val = N->getOperand(I);
2629 EVT ValVT = Val.getValueType();
2630 if (ValVT.isVector()) {
2631 EVT EltVT = ValVT.getVectorElementType();
2632 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2633 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2634 DAG.getIntPtrConstant(J, DL)));
2635 } else
2636 Ops.push_back(Val);
2637 }
2638
2640 SDValue Tcgen05StNode =
2641 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2642 MemSD->getMemoryVT(), MemSD->getMemOperand());
2643
2644 return Tcgen05StNode;
2645}
2646
2647static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2648 switch (IID) {
2649 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2651 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2653 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2655 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2657 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2659 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2661 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2663 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2665 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2667 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2669 case Intrinsic::
2670 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2672 case Intrinsic::
2673 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2675 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2677 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2679 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2681 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2683 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2685 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2687 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2689 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2691 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2693 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2695 case Intrinsic::
2696 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2697 return NVPTXISD::
2698 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2699 case Intrinsic::
2700 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2701 return NVPTXISD::
2702 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2703 };
2704 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2705}
2706
2708 SDNode *N = Op.getNode();
2709 SDLoc DL(N);
2710 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2711
2713 // split the vector argument
2714 for (size_t I = 0; I < N->getNumOperands(); I++) {
2715 if (I == 1)
2716 continue; // skip IID
2717 SDValue Val = N->getOperand(I);
2718 EVT ValVT = Val.getValueType();
2719 if (ValVT.isVector()) {
2720 EVT EltVT = ValVT.getVectorElementType();
2721 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2722 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2723 DAG.getIntPtrConstant(J, DL)));
2724 } else
2725 Ops.push_back(Val);
2726 }
2727
2729 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2730 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2731 MemSD->getMemoryVT(), MemSD->getMemOperand());
2732
2733 return Tcgen05MMANode;
2734}
2735
2736// Lower vector return type of tcgen05.ld intrinsics
2737static std::optional<std::pair<SDValue, SDValue>>
2738lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
2739 SDLoc DL(N);
2740 EVT ResVT = N->getValueType(0);
2741 if (!ResVT.isVector())
2742 return {}; // already legalized.
2743
2744 const unsigned NumElts = ResVT.getVectorNumElements();
2745
2746 // Create the return type of the instructions
2747 SmallVector<EVT, 5> ListVTs;
2748 for (unsigned i = 0; i < NumElts; ++i)
2749 ListVTs.push_back(MVT::i32);
2750
2751 ListVTs.push_back(N->getValueType(1)); // Chain
2752
2753 SDVTList ResVTs = DAG.getVTList(ListVTs);
2754
2755 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
2756 N->getOperand(2)};
2757
2758 if (HasOffset) {
2759 Ops.push_back(N->getOperand(3)); // offset
2760 Ops.push_back(N->getOperand(4)); // Pack flag
2761 } else
2762 Ops.push_back(N->getOperand(3)); // Pack flag
2763
2765 SDValue NewNode =
2767 MemSD->getMemoryVT(), MemSD->getMemOperand());
2768
2769 // split the vector result
2770 SmallVector<SDValue, 4> ScalarRes;
2771 for (unsigned i = 0; i < NumElts; ++i) {
2772 SDValue Res = NewNode.getValue(i);
2773 ScalarRes.push_back(Res);
2774 }
2775
2776 SDValue Chain = NewNode.getValue(NumElts);
2777 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
2778 return {{BuildVector, Chain}};
2779}
2780
2782 SDNode *N = Op.getNode();
2783 SDValue Intrin = N->getOperand(1);
2784
2785 // Get the intrinsic ID
2786 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2787 switch (IntrinNo) {
2788 default:
2789 break;
2790 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2791 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2792 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2793 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2794 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2795 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2796 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2797 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2798 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2799 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2800 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2801 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2802 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2803 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2804 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2805 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2806 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2807 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2808 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2809 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2810 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2811 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2812 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2813 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2814 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2815 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2816 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2817 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2818 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2819 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2820 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2821 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2822 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2823 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2824 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2825 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2826 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2827 return lowerTcgen05St(Op, DAG);
2828 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2829 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2830 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2831 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2832 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2833 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2834 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2835 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2836 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2837 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2838 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2839 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2840 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2841 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2842 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2843 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2844 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2845 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2846 case Intrinsic::
2847 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2848 case Intrinsic::
2849 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2850 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2851 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2852 case Intrinsic::
2853 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2854 case Intrinsic::
2855 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2857 }
2858 return Op;
2859}
2860
2862 SelectionDAG &DAG) {
2863
2864 SDNode *N = Op.getNode();
2865 if (N->getOperand(1).getValueType() != MVT::i128) {
2866 // Return if the operand is already lowered.
2867 return SDValue();
2868 }
2869
2870 unsigned IID =
2871 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2872 auto Opcode = [&]() {
2873 switch (IID) {
2874 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2876 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2878 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2880 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2882 default:
2883 llvm_unreachable("unsupported/unhandled intrinsic");
2884 }
2885 }();
2886
2887 SDLoc DL(N);
2888 SDValue TryCancelResponse = N->getOperand(1);
2889 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2890 SDValue TryCancelResponse0 =
2891 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2892 DAG.getIntPtrConstant(0, DL));
2893 SDValue TryCancelResponse1 =
2894 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2895 DAG.getIntPtrConstant(1, DL));
2896
2897 return DAG.getNode(Opcode, DL, N->getVTList(),
2898 {TryCancelResponse0, TryCancelResponse1});
2899}
2900
2902 const unsigned Mode = [&]() {
2903 switch (Op->getConstantOperandVal(0)) {
2904 case Intrinsic::nvvm_prmt:
2906 case Intrinsic::nvvm_prmt_b4e:
2908 case Intrinsic::nvvm_prmt_ecl:
2910 case Intrinsic::nvvm_prmt_ecr:
2912 case Intrinsic::nvvm_prmt_f4e:
2914 case Intrinsic::nvvm_prmt_rc16:
2916 case Intrinsic::nvvm_prmt_rc8:
2918 default:
2919 llvm_unreachable("unsupported/unhandled intrinsic");
2920 }
2921 }();
2922 SDLoc DL(Op);
2923 SDValue A = Op->getOperand(1);
2924 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
2925 : DAG.getConstant(0, DL, MVT::i32);
2926 SDValue Selector = (Op->op_end() - 1)->get();
2927 return getPRMT(A, B, Selector, DL, DAG, Mode);
2928}
2929
2931 switch (Op->getConstantOperandVal(1)) {
2932 default:
2933 return Op;
2934
2935 // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
2936 // lower them through LowerOperation() instead of ReplaceNodeResults().
2937 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
2938 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
2939 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
2940 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
2941 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
2942 return SDValue();
2943
2944 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
2945 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
2946 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
2947 return SDValue();
2948 }
2949}
2950
2952 switch (Op->getConstantOperandVal(0)) {
2953 default:
2954 return Op;
2955 case Intrinsic::nvvm_prmt:
2956 case Intrinsic::nvvm_prmt_b4e:
2957 case Intrinsic::nvvm_prmt_ecl:
2958 case Intrinsic::nvvm_prmt_ecr:
2959 case Intrinsic::nvvm_prmt_f4e:
2960 case Intrinsic::nvvm_prmt_rc16:
2961 case Intrinsic::nvvm_prmt_rc8:
2962 return lowerPrmtIntrinsic(Op, DAG);
2963 case Intrinsic::nvvm_internal_addrspace_wrap:
2964 return Op.getOperand(1);
2965 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2966 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2967 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2968 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2970 }
2971}
2972
2973// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
2974// Lower these into a node returning the correct type which is zero-extended
2975// back to the correct size.
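// For example, a 64-bit CTLZ is emitted with an i32 result and then
// zero-extended back to i64; the NonNeg flag is safe since the count is at
// most 64.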
2977 SDValue V = Op->getOperand(0);
2978 assert(V.getValueType() == MVT::i64 &&
2979 "Unexpected CTLZ/CTPOP type to legalize");
2980
2981 SDLoc DL(Op);
2982 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
2983 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
2984}
2985
2987 unsigned Opcode, SelectionDAG &DAG) {
2988 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
2989
2990 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
2991 if (!AmtConst)
2992 return SDValue();
2993 const auto Amt = AmtConst->getZExtValue() & 63;
2994
2995 SDValue UnpackA =
2996 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
2997 SDValue UnpackB =
2998 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
2999
3000 // The arch is little-endian: 0 = low bits, 1 = high bits
3001 SDValue ALo = UnpackA.getValue(0);
3002 SDValue AHi = UnpackA.getValue(1);
3003 SDValue BLo = UnpackB.getValue(0);
3004 SDValue BHi = UnpackB.getValue(1);
3005
3006 // The bitfield consists of { AHi : ALo : BHi : BLo }
3007 //
3008 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
3009 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
3010 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
3011 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
3012 //
3013 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
3014 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
3015 // on the direction. Amt = 32 can be implemented by a packing and unpacking
3016 // move to select and arrange the 32bit values. For simplicity, these cases
3017 // are not handled here explicitly and instead we rely on DAGCombiner to
3018 // remove the no-op funnel shifts we insert.
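 // Worked example: FSHL with Amt = 40 selects {ALo : BHi : BLo}, and with
 // NewAmt = 40 & 31 = 8 we get RHi = fshl(ALo, BHi, 8) and
 // RLo = fshl(BHi, BLo, 8), matching (A << 40) | (B >> 24) split into 32-bit
 // halves.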
3019 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
3020 ? std::make_tuple(AHi, ALo, BHi)
3021 : std::make_tuple(ALo, BHi, BLo);
3022
3023 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
3024 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
3025 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
3026
3027 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
3028}
3029
3031 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
3032 SDLoc(Op), Op->getOpcode(), DAG);
3033}
3034
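// A rotate is a funnel shift with both data operands equal, i.e.
// rotl/rotr(x, amt) == fshl/fshr(x, x, amt), so reuse the 64-bit FSH
// expansion above.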
3036 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
3037 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
3038 SDLoc(Op), Opcode, DAG);
3039}
3040
3042 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
3043 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
3044 // the semantics of LLVM's frem.
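 // For example, frem(5.5, 2.0): div = 2.75, ftrunc = 2.0, and
 // 5.5 - 2.0 * 2.0 = 1.5, the expected remainder.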
3045 SDLoc DL(Op);
3046 SDValue X = Op->getOperand(0);
3047 SDValue Y = Op->getOperand(1);
3048 EVT Ty = Op.getValueType();
3049 SDNodeFlags Flags = Op->getFlags();
3050
3051 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
3052 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
3053 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
3055 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
3057
3058 if (Flags.hasNoInfs())
3059 return Sub;
3060
3061 // If Y is infinite, return X
3062 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
3063 SDValue Inf =
3064 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
3065 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
3066 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
3067}
3068
3070 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
3071
3072 SDValue Cond = Op->getOperand(0);
3073 SDValue TrueVal = Op->getOperand(1);
3074 SDValue FalseVal = Op->getOperand(2);
3075 SDLoc DL(Op);
3076
3077 // If both operands are truncated, we push the select through the truncates.
3078 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
3079 FalseVal.getOpcode() == ISD::TRUNCATE) {
3080 TrueVal = TrueVal.getOperand(0);
3081 FalseVal = FalseVal.getOperand(0);
3082
3083 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3084 ? TrueVal.getValueType()
3085 : FalseVal.getValueType();
3086 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3087 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3088 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3089 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3090 }
3091
3092 // Otherwise, expand the select into a series of logical operations. These
3093 // often can be folded into other operations either by us or ptxas.
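 // i.e. select(c, t, f) on i1 becomes (c & t) | (~c & f). The freezes below
 // keep a poison value in the unselected operand from leaking through the
 // AND/OR network, which a real select would not have propagated.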
3094 TrueVal = DAG.getFreeze(TrueVal);
3095 FalseVal = DAG.getFreeze(FalseVal);
3096 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3097 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3098 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3099 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3100 return Or;
3101}
3102
3103SDValue
3104NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3105 switch (Op.getOpcode()) {
3106 case ISD::RETURNADDR:
3107 return SDValue();
3108 case ISD::FRAMEADDR:
3109 return SDValue();
3110 case ISD::ADDRSPACECAST:
3111 return LowerADDRSPACECAST(Op, DAG);
3113 return lowerIntrinsicWChain(Op, DAG);
3115 return lowerIntrinsicWOChain(Op, DAG);
3117 return lowerIntrinsicVoid(Op, DAG);
3118 case ISD::BUILD_VECTOR:
3119 return LowerBUILD_VECTOR(Op, DAG);
3120 case ISD::BITCAST:
3121 return LowerBITCAST(Op, DAG);
3123 return Op;
3125 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3127 return LowerINSERT_VECTOR_ELT(Op, DAG);
3129 return LowerVECTOR_SHUFFLE(Op, DAG);
3131 return LowerCONCAT_VECTORS(Op, DAG);
3132 case ISD::VECREDUCE_FMAX:
3133 case ISD::VECREDUCE_FMIN:
3134 case ISD::VECREDUCE_FMAXIMUM:
3135 case ISD::VECREDUCE_FMINIMUM:
3136 return LowerVECREDUCE(Op, DAG);
3137 case ISD::STORE:
3138 return LowerSTORE(Op, DAG);
3139 case ISD::LOAD:
3140 return LowerLOAD(Op, DAG);
3141 case ISD::SHL_PARTS:
3142 return LowerShiftLeftParts(Op, DAG);
3143 case ISD::SRA_PARTS:
3144 case ISD::SRL_PARTS:
3145 return LowerShiftRightParts(Op, DAG);
3146 case ISD::SELECT:
3147 return lowerSELECT(Op, DAG);
3148 case ISD::FROUND:
3149 return LowerFROUND(Op, DAG);
3150 case ISD::FCOPYSIGN:
3151 return LowerFCOPYSIGN(Op, DAG);
3152 case ISD::SINT_TO_FP:
3153 case ISD::UINT_TO_FP:
3154 return LowerINT_TO_FP(Op, DAG);
3155 case ISD::FP_TO_SINT:
3156 case ISD::FP_TO_UINT:
3157 return LowerFP_TO_INT(Op, DAG);
3158 case ISD::FP_ROUND:
3159 return LowerFP_ROUND(Op, DAG);
3160 case ISD::FP_EXTEND:
3161 return LowerFP_EXTEND(Op, DAG);
3162 case ISD::BR_JT:
3163 return LowerBR_JT(Op, DAG);
3164 case ISD::VAARG:
3165 return LowerVAARG(Op, DAG);
3166 case ISD::VASTART:
3167 return LowerVASTART(Op, DAG);
3168 case ISD::FSHL:
3169 case ISD::FSHR:
3170 return lowerFSH(Op, DAG);
3171 case ISD::ROTL:
3172 case ISD::ROTR:
3173 return lowerROT(Op, DAG);
3174 case ISD::ABS:
3175 case ISD::SMIN:
3176 case ISD::SMAX:
3177 case ISD::UMIN:
3178 case ISD::UMAX:
3179 case ISD::ADD:
3180 case ISD::SUB:
3181 case ISD::MUL:
3182 case ISD::SHL:
3183 case ISD::SREM:
3184 case ISD::UREM:
3185 return LowerVectorArith(Op, DAG);
3186 case ISD::DYNAMIC_STACKALLOC:
3187 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3188 case ISD::STACKRESTORE:
3189 return LowerSTACKRESTORE(Op, DAG);
3190 case ISD::STACKSAVE:
3191 return LowerSTACKSAVE(Op, DAG);
3192 case ISD::CopyToReg:
3193 return LowerCopyToReg_128(Op, DAG);
3194 case ISD::FADD:
3195 case ISD::FSUB:
3196 case ISD::FMUL:
3197 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3198 return PromoteBinOpIfF32FTZ(Op, DAG);
3199 case ISD::CTPOP:
3200 case ISD::CTLZ:
3201 return lowerCTLZCTPOP(Op, DAG);
3202 case ISD::FREM:
3203 return lowerFREM(Op, DAG);
3204
3205 default:
3206 llvm_unreachable("Custom lowering not defined for operation");
3207 }
3208}
3209
3210SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3211 SDLoc DL(Op);
3212 SDValue Chain = Op.getOperand(0);
3213 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
3214 SDValue Index = Op.getOperand(2);
3215
3216 unsigned JId = JT->getIndex();
3218 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
3219
3220 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
3221
3222 // Generate BrxStart node
3223 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
3224 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
3225
3226 // Generate BrxItem nodes
3227 assert(!MBBs.empty());
3228 for (MachineBasicBlock *MBB : MBBs.drop_back())
3229 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
3230 DAG.getBasicBlock(MBB), Chain.getValue(1));
3231
3232 // Generate BrxEnd nodes
3233 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
3234 IdV, Chain.getValue(1)};
3235 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
3236
3237 return BrxEnd;
3238}
3239
3240// This will prevent AsmPrinter from trying to print the jump tables itself.
3244
3245SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3246 SelectionDAG &DAG) const {
3248 unsigned SrcAS = N->getSrcAddressSpace();
3249 unsigned DestAS = N->getDestAddressSpace();
3250 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3251 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3252 // Shared and SharedCluster can be converted to each other through generic
3253 // space
3254 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3257 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3258 SDLoc DL(Op.getNode());
3259 const MVT GenerictVT =
3261 SDValue GenericConversion = DAG.getAddrSpaceCast(
3262 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3263 SDValue SharedClusterConversion =
3264 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3265 ADDRESS_SPACE_GENERIC, DestAS);
3266 return SharedClusterConversion;
3267 }
3268
3269 return DAG.getUNDEF(Op.getValueType());
3270 }
3271
3272 return Op;
3273}
3274
3275// This function is almost a copy of SelectionDAG::expandVAArg().
3276// The only difference is that this one produces loads from the local address space.
3277SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3278 const TargetLowering *TLI = STI.getTargetLowering();
3279 SDLoc DL(Op);
3280
3281 SDNode *Node = Op.getNode();
3282 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3283 EVT VT = Node->getValueType(0);
3284 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3285 SDValue Tmp1 = Node->getOperand(0);
3286 SDValue Tmp2 = Node->getOperand(1);
3287 const MaybeAlign MA(Node->getConstantOperandVal(3));
3288
3289 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3290 Tmp1, Tmp2, MachinePointerInfo(V));
3291 SDValue VAList = VAListLoad;
3292
3293 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3294 VAList = DAG.getNode(
3295 ISD::ADD, DL, VAList.getValueType(), VAList,
3296 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3297
3298 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3299 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3300 VAList.getValueType()));
3301 }
3302
3303 // Increment the pointer, VAList, to the next vaarg
3304 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3306 DL, VAList.getValueType()));
3307
3308 // Store the incremented VAList to the legalized pointer
3309 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3310 MachinePointerInfo(V));
3311
3312 const Value *SrcV = Constant::getNullValue(
3314
3315 // Load the actual argument out of the pointer VAList
3316 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3317}
3318
3319SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3320 const TargetLowering *TLI = STI.getTargetLowering();
3321 SDLoc DL(Op);
3322 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3323
3324 // Store the address of unsized array <function>_vararg[] in the ap object.
3325 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3326
3327 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3328 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3329 MachinePointerInfo(SV));
3330}
3331
3332/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
3333static std::optional<std::pair<SDValue, SDValue>>
3336 const EVT ResVT = LD->getValueType(0);
3337 const EVT MemVT = LD->getMemoryVT();
3338
3339 // If we're doing sign/zero extension as part of the load, avoid lowering to
3340 // a LoadV node. TODO: consider relaxing this restriction.
3341 if (ResVT != MemVT)
3342 return std::nullopt;
3343
3344 const auto NumEltsAndEltVT =
3345 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3346 if (!NumEltsAndEltVT)
3347 return std::nullopt;
3348 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3349
3350 Align Alignment = LD->getAlign();
3351 const auto &TD = DAG.getDataLayout();
3352 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3353 if (Alignment < PrefAlign) {
3354 // This load is not sufficiently aligned, so bail out and let this vector
3355 // load be scalarized. Note that we may still be able to emit smaller
3356 // vector loads. For example, if we are loading a <4 x float> with an
3357 // alignment of 8, this check will fail but the legalizer will try again
3358 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3359 return std::nullopt;
3360 }
3361
3362 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3363 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3364 // loaded type to i16 and propagate the "real" type as the memory type.
3365 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3366
3367 unsigned Opcode;
3368 switch (NumElts) {
3369 default:
3370 return std::nullopt;
3371 case 2:
3372 Opcode = NVPTXISD::LoadV2;
3373 break;
3374 case 4:
3375 Opcode = NVPTXISD::LoadV4;
3376 break;
3377 case 8:
3378 Opcode = NVPTXISD::LoadV8;
3379 break;
3380 }
3381 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3382 ListVTs.push_back(MVT::Other);
3383 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3384
3385 SDLoc DL(LD);
3386
3387 // Copy regular operands
3388 SmallVector<SDValue, 8> OtherOps(LD->ops());
3389
3390 // The select routine does not have access to the LoadSDNode instance, so
3391 // pass along the extension information
3392 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
3393
3394 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3395 LD->getMemOperand());
3396
3397 SmallVector<SDValue> ScalarRes;
3398 if (EltVT.isVector()) {
3400 assert(NumElts * EltVT.getVectorNumElements() ==
3401 ResVT.getVectorNumElements());
3402 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3403 // into individual elements.
3404 for (const unsigned I : llvm::seq(NumElts)) {
3405 SDValue SubVector = NewLD.getValue(I);
3406 DAG.ExtractVectorElements(SubVector, ScalarRes);
3407 }
3408 } else {
3409 for (const unsigned I : llvm::seq(NumElts)) {
3410 SDValue Res = NewLD.getValue(I);
3411 if (LoadEltVT != EltVT)
3412 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3413 ScalarRes.push_back(Res);
3414 }
3415 }
3416
3417 SDValue LoadChain = NewLD.getValue(NumElts);
3418
3419 const MVT BuildVecVT =
3420 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3421 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3422 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3423
3424 return {{LoadValue, LoadChain}};
3425}
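// Illustrative example: a sufficiently aligned load of <4 x float> becomes an
// NVPTXISD::LoadV4 with result types {f32, f32, f32, f32, ch}; the scalar
// results are then recombined with BUILD_VECTOR and bitcast back to the
// original <4 x float>. When getVectorLoweringShape picks a packed element
// type (v2f16, v4i8, ...), each result is a 32-bit subvector and the
// ExtractVectorElements loop above splits it back into individual lanes.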
3426
3429 const NVPTXSubtarget &STI) {
3430 if (auto Res = replaceLoadVector(N, DAG, STI))
3431 Results.append({Res->first, Res->second});
3432}
3433
3435 const NVPTXSubtarget &STI) {
3436 if (auto Res = replaceLoadVector(N, DAG, STI))
3437 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3438 return SDValue();
3439}
3440
3441// v = ld i1* addr
3442// =>
3443// v1 = ld i8* addr (-> i16)
3444// v = trunc i16 to i1
3445 static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
3446 SDLoc dl(LD);
3447 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3448 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3449 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3450 LD->getBasePtr(), LD->getPointerInfo(),
3451 MVT::i8, LD->getAlign(),
3452 LD->getMemOperand()->getFlags());
3453 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3454 // The legalizer (the caller) is expecting two values from the legalized
3455 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3456 // in LegalizeDAG.cpp which also uses MergeValues.
3457 return DAG.getMergeValues({result, LD->getChain()}, dl);
3458}
3459
3460SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3461 LoadSDNode *LD = cast<LoadSDNode>(Op);
3462
3463 if (Op.getValueType() == MVT::i1)
3464 return lowerLOADi1(LD, DAG);
3465
3466 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3467 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3468 // we allow for more DAG combine opportunities.
3469 if (LD->getExtensionType() == ISD::EXTLOAD) {
3470 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3471 "Unexpected fpext-load");
3472 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3473 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3474 LD->getMemOperand());
3475 }
3476
3477 llvm_unreachable("Unexpected custom lowering for load");
3478}
3479
3480 static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG,
3481 const NVPTXSubtarget &STI) {
3482 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3483 SDValue Val = N->getOperand(1);
3484 SDLoc DL(N);
3485 const EVT ValVT = Val.getValueType();
3486 const EVT MemVT = N->getMemoryVT();
3487
3488 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3489 // TODO: consider relaxing this restriction.
3490 if (ValVT != MemVT)
3491 return SDValue();
3492
3493 const auto NumEltsAndEltVT =
3494 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3495 if (!NumEltsAndEltVT)
3496 return SDValue();
3497 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3498
3499 const DataLayout &TD = DAG.getDataLayout();
3500
3501 Align Alignment = N->getAlign();
3502 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3503 if (Alignment < PrefAlign) {
3504 // This store is not sufficiently aligned, so bail out and let this vector
3505 // store be scalarized. Note that we may still be able to emit smaller
3506 // vector stores. For example, if we are storing a <4 x float> with an
3507 // alignment of 8, this check will fail but the legalizer will try again
3508 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3509 return SDValue();
3510 }
3511
3512 unsigned Opcode;
3513 switch (NumElts) {
3514 default:
3515 return SDValue();
3516 case 2:
3517 Opcode = NVPTXISD::StoreV2;
3518 break;
3519 case 4:
3520 Opcode = NVPTXISD::StoreV4;
3521 break;
3522 case 8:
3523 Opcode = NVPTXISD::StoreV8;
3524 break;
3525 }
3526
3527 SmallVector<SDValue, 8> Ops;
3528
3529 // First is the chain
3530 Ops.push_back(N->getOperand(0));
3531
3532 // Then the split values
3533 if (EltVT.isVector()) {
3535 assert(NumElts * EltVT.getVectorNumElements() ==
3536 ValVT.getVectorNumElements());
3537 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3538 // stored as b32s
3539 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3540 for (const unsigned I : llvm::seq(NumElts)) {
3541 SmallVector<SDValue, 4> SubVectorElts;
3542 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3543 NumEltsPerSubVector);
3544 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3545 }
3546 } else {
3547 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3548 for (const unsigned I : llvm::seq(NumElts)) {
3549 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3550 DAG.getIntPtrConstant(I, DL));
3551
3552 // Since StoreV2 is a target node, we cannot rely on DAG type
3553 // legalization. Therefore, we must ensure the type is legal. For i1 and
3554 // i8, we set the stored type to i16 and propagate the "real" type as the
3555 // memory type.
3556 if (EltVT.getSizeInBits() < 16)
3557 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3558 Ops.push_back(ExtVal);
3559 }
3560 }
3561
3562 // Then any remaining arguments
3563 Ops.append(N->op_begin() + 2, N->op_end());
3564
3565 SDValue NewSt =
3566 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3567 N->getMemoryVT(), N->getMemOperand());
3568
3569 // return DCI.CombineTo(N, NewSt, true);
3570 return NewSt;
3571}
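// Illustrative example: this is the mirror image of replaceLoadVector above.
// A sufficiently aligned store of <4 x float> becomes an NVPTXISD::StoreV4
// whose operands are the chain, the four f32 elements (i1/i8 elements widened
// to i16), and the remaining operands of the original store; packed element
// types are first rebuilt into v2x16/v4i8 subvectors so they can be stored as
// b32 values.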
3572
3573SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3574 StoreSDNode *Store = cast<StoreSDNode>(Op);
3575 EVT VT = Store->getMemoryVT();
3576
3577 if (VT == MVT::i1)
3578 return LowerSTOREi1(Op, DAG);
3579
3580 // Lower store of any other vector type, including v2f32 as we want to break
3581 // it apart since this is not a widely-supported type.
3582 return lowerSTOREVector(Op, DAG, STI);
3583}
3584
3585// st i1 v, addr
3586// =>
3587// v1 = zxt v to i16
3588// st.u8 i16, addr
3589SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3590 SDNode *Node = Op.getNode();
3591 SDLoc dl(Node);
3592 StoreSDNode *ST = cast<StoreSDNode>(Node);
3593 SDValue Tmp1 = ST->getChain();
3594 SDValue Tmp2 = ST->getBasePtr();
3595 SDValue Tmp3 = ST->getValue();
3596 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3597 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3598 SDValue Result =
3599 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3600 ST->getAlign(), ST->getMemOperand()->getFlags());
3601 return Result;
3602}
3603
3604SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3605 SelectionDAG &DAG) const {
3606 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3607 // operand so that it can pass the legalization.
3608
3609 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3610 "Custom lowering for 128-bit CopyToReg only");
3611
3612 SDNode *Node = Op.getNode();
3613 SDLoc DL(Node);
3614
3615 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3616 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3617 DAG.getIntPtrConstant(0, DL));
3618 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3619 DAG.getIntPtrConstant(1, DL));
3620
3621 SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
3622 SmallVector<EVT, 3> ResultsType(Node->values());
3623
3624 NewOps[0] = Op->getOperand(0); // Chain
3625 NewOps[1] = Op->getOperand(1); // Dst Reg
3626 NewOps[2] = Lo; // Lower 64-bit
3627 NewOps[3] = Hi; // Higher 64-bit
3628 if (Op.getNumOperands() == 4)
3629 NewOps[4] = Op->getOperand(3); // Glue if exists
3630
3631 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3632}
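// Illustrative example: an input of the form
//   CopyToReg chain, %reg, i128 %val (, glue)
// is rewritten above into
//   CopyToReg chain, %reg, i64 lo(%val), i64 hi(%val) (, glue)
// where lo/hi are the two lanes of the v2i64 bitcast, so the node survives
// legalization even though i128 is not a legal register type.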
3633
3634unsigned NVPTXTargetLowering::getNumRegisters(
3635 LLVMContext &Context, EVT VT,
3636 std::optional<MVT> RegisterVT = std::nullopt) const {
3637 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3638 return 1;
3639 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3640}
3641
3642bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3643 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3644 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3645 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3646 Parts[0] = Val;
3647 return true;
3648 }
3649 return false;
3650}
3651
3652// This creates target external symbol for a function parameter.
3653// Name of the symbol is composed from its index and the function name.
3654// Negative index corresponds to special parameter (unsized array) used for
3655// passing variable arguments.
3656SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
3657 EVT T) const {
3658 StringRef SavedStr = nvTM->getStrPool().save(
3660 return DAG.getExternalSymbol(SavedStr.data(), T);
3661}
3662
3663SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
3664 EVT T) const {
3665 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
3666 return DAG.getExternalSymbol(SavedStr.data(), T);
3667}
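// Illustrative example: getCallParamSymbol(DAG, 2, PtrVT) yields the external
// symbol "param2", matching the .param declarations emitted for outgoing call
// arguments. getParamSymbol composes the incoming-parameter symbol from the
// function name and index as described in its comment, with index -1 reserved
// for the unsized vararg buffer used by LowerVASTART above.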
3668
3669 SDValue NVPTXTargetLowering::LowerFormalArguments(
3670 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3671 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3672 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3673 const DataLayout &DL = DAG.getDataLayout();
3674 LLVMContext &Ctx = *DAG.getContext();
3675 auto PtrVT = getPointerTy(DAG.getDataLayout());
3676
3677 const Function &F = DAG.getMachineFunction().getFunction();
3678
3679 SDValue Root = DAG.getRoot();
3680 SmallVector<SDValue, 16> OutChains;
3681
3682 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3683 // Ins.size() will be larger
3684 // * if there is an aggregate argument with multiple fields (each field
3685 // showing up separately in Ins)
3686 // * if there is a vector argument with more than typical vector-length
3687 // elements (generally if more than 4) where each vector element is
3688 // individually present in Ins.
3689 // So a different index should be used for indexing into Ins.
3690 // See similar issue in LowerCall.
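// Illustrative example: an IR argument of type {i32, float} contributes two
// ISD::InputArg entries to Ins, both carrying the same OrigArgIndex, which is
// why the loop below groups Ins by OrigArgIndex (take_while) instead of
// indexing Ins by argument number.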
3691
3692 auto AllIns = ArrayRef(Ins);
3693 for (const auto &Arg : F.args()) {
3694 const auto ArgIns = AllIns.take_while(
3695 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
3696 AllIns = AllIns.drop_front(ArgIns.size());
3697
3698 Type *Ty = Arg.getType();
3699
3700 if (ArgIns.empty())
3701 report_fatal_error("Empty parameter types are not supported");
3702
3703 if (Arg.use_empty()) {
3704 // argument is dead
3705 for (const auto &In : ArgIns) {
3706 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
3707 InVals.push_back(DAG.getUNDEF(In.VT));
3708 }
3709 continue;
3710 }
3711
3712 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
3713
3714 // In the following cases, assign a node order of "i+1"
3715 // to newly created nodes. The SDNodes for params have to
3716 // appear in the same order as their order of appearance
3717 // in the original function. "i+1" holds that order.
3718 if (Arg.hasByValAttr()) {
3719 // Param has ByVal attribute
3720 // Return MoveParam(param symbol).
3721 // Ideally, the param symbol can be returned directly,
3722 // but when SDNode builder decides to use it in a CopyToReg(),
3723 // machine instruction fails because TargetExternalSymbol
3724 // (not lowered) is target dependent, and CopyToReg assumes
3725 // the source is lowered.
3726 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
3727 const auto &ByvalIn = ArgIns[0];
3728 assert(getValueType(DL, Ty) == ByvalIn.VT &&
3729 "Ins type did not match function type");
3730 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
3731
3732 SDValue P;
3733 if (isKernelFunction(F)) {
3734 P = ArgSymbol;
3735 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3736 } else {
3737 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
3738 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3739 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
3740 ADDRESS_SPACE_GENERIC);
3741 }
3742 InVals.push_back(P);
3743 } else {
3744 SmallVector<EVT, 16> VTs;
3745 SmallVector<uint64_t, 16> Offsets;
3746 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
3747 assert(VTs.size() == ArgIns.size() && "Size mismatch");
3748 assert(VTs.size() == Offsets.size() && "Size mismatch");
3749
3750 const Align ArgAlign = getFunctionArgumentAlignment(
3751 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
3752
3753 unsigned I = 0;
3754 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3755 for (const unsigned NumElts : VI) {
3756 // i1 is loaded/stored as i8
3757 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
3758 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
3759
3760 SDValue VecAddr = DAG.getObjectPtrOffset(
3761 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
3762
3763 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
3764 SDValue P =
3765 DAG.getLoad(VecVT, dl, Root, VecAddr,
3766 MachinePointerInfo(ADDRESS_SPACE_PARAM), PartAlign,
3767 MachineMemOperand::MODereferenceable |
3768 MachineMemOperand::MOInvariant);
3769 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3770 for (const unsigned J : llvm::seq(NumElts)) {
3771 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
3772
3773 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
3774 DAG, dl);
3775 InVals.push_back(Elt);
3776 }
3777 I += NumElts;
3778 }
3779 }
3780 }
3781
3782 if (!OutChains.empty())
3783 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
3784
3785 return Chain;
3786}
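// Illustrative sketch: a kernel argument of type [4 x i32] with 16-byte
// alignment is decomposed by ComputePTXValueVTs into four i32 pieces at
// offsets 0..12, which VectorizePTXValueVTs then groups into a single
// vectorized load from the argument's param symbol; i1 pieces are instead
// loaded as i8 and converted back via correctParamType.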
3787
3788SDValue
3789 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3790 bool isVarArg,
3791 const SmallVectorImpl<ISD::OutputArg> &Outs,
3792 const SmallVectorImpl<SDValue> &OutVals,
3793 const SDLoc &dl, SelectionDAG &DAG) const {
3794 const Function &F = DAG.getMachineFunction().getFunction();
3795 Type *RetTy = F.getReturnType();
3796
3797 if (RetTy->isVoidTy()) {
3798 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
3799 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3800 }
3801
3802 const DataLayout &DL = DAG.getDataLayout();
3803 LLVMContext &Ctx = *DAG.getContext();
3804
3805 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
3806 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
3807
3808 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3809 // 32-bits are sign extended or zero extended, depending on whether
3810 // they are signed or unsigned types.
3811 const bool ExtendIntegerRetVal =
3812 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
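// Illustrative example: for a function returning i8, ExtendIntegerRetVal is
// true, so the value is widened and written to func_retval0 as a 32-bit
// store, matching the at-least-32-bit return slot PTX expects for small
// integer types.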
3813
3814 SmallVector<EVT, 16> VTs;
3815 SmallVector<uint64_t, 16> Offsets;
3816 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
3817 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3818
3819 const auto GetRetVal = [&](unsigned I) -> SDValue {
3820 SDValue RetVal = OutVals[I];
3821 assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
3822 RetVal.getValueType() &&
3823 "OutVal type should always be legal");
3824
3825 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
3826 const EVT StoreVT =
3827 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
3828 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
3829 };
3830
3831 unsigned I = 0;
3832 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
3833 for (const unsigned NumElts : VI) {
3834 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
3835 ? MaybeAlign(std::nullopt)
3836 : commonAlignment(RetAlign, Offsets[I]);
3837
3838 SDValue Val = getBuildVectorizedValue(
3839 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
3840
3841 SDValue Ptr =
3842 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
3843
3844 Chain = DAG.getStore(Chain, dl, Val, Ptr,
3845 MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign);
3846
3847 I += NumElts;
3848 }
3849
3850 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3851}
3852
3853 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3854 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3855 SelectionDAG &DAG) const {
3856 if (Constraint.size() > 1)
3857 return;
3858 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3859}
3860
3861// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3862// TgtMemIntrinsic
3863// because we need the information that is only available in the "Value" type
3864// of destination
3865// pointer. In particular, the address space information.
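// Note (summarizing, not from the upstream comments): the Info record filled
// in below (opc/memVT/ptrVal/offset/flags/align) is used by SelectionDAG to
// attach a MachineMemOperand to the resulting INTRINSIC_W_CHAIN or
// INTRINSIC_VOID node, so getting memVT and the load/store flags right is
// what lets alias analysis and scheduling treat these intrinsics as real
// memory accesses.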
3866 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3867 IntrinsicInfo &Info, const CallInst &I,
3868 MachineFunction &MF, unsigned Intrinsic) const {
3869 switch (Intrinsic) {
3870 default:
3871 return false;
3872 case Intrinsic::nvvm_match_all_sync_i32p:
3873 case Intrinsic::nvvm_match_all_sync_i64p:
3874 Info.opc = ISD::INTRINSIC_W_CHAIN;
3875 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3876 // in order to model data exchange with other threads, but perform no real
3877 // memory accesses.
3878 Info.memVT = MVT::i1;
3879
3880 // Our result depends on both our and other thread's arguments.
3881 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3882 return true;
3883 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3884 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3885 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3886 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3887 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3888 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3889 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3890 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3891 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3892 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3893 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3894 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3895 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3896 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3897 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3898 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3899 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3900 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3901 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3902 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3903 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3904 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3905 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3906 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3907 Info.opc = ISD::INTRINSIC_W_CHAIN;
3908 Info.memVT = MVT::v8f16;
3909 Info.ptrVal = I.getArgOperand(0);
3910 Info.offset = 0;
3911 Info.flags = MachineMemOperand::MOLoad;
3912 Info.align = Align(16);
3913 return true;
3914 }
3915 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3916 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3917 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3918 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3919 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3920 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3921 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3922 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3923 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3924 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3925 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3926 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3927 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3928 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3929 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3930 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3931 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3932 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3933 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3934 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3935 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3936 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3937 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3938 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3939 Info.opc = ISD::INTRINSIC_W_CHAIN;
3940 Info.memVT = MVT::v2i32;
3941 Info.ptrVal = I.getArgOperand(0);
3942 Info.offset = 0;
3943 Info.flags = MachineMemOperand::MOLoad;
3944 Info.align = Align(8);
3945 return true;
3946 }
3947
3948 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3949 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3950 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3951 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3952 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3953 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3954 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3955 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3956 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3957 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3958 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3959 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3960 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3961 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3962 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3963 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3964
3965 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3966 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3967 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3968 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3969 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3970 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3971 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3972 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3973 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3974 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3975 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3976 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3977 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3978 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3979 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3980 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3981 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3982 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
3983 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
3984 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
3985 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
3986 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
3987 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
3988 Info.opc = ISD::INTRINSIC_W_CHAIN;
3989 Info.memVT = MVT::v4i32;
3990 Info.ptrVal = I.getArgOperand(0);
3991 Info.offset = 0;
3992 Info.flags = MachineMemOperand::MOLoad;
3993 Info.align = Align(16);
3994 return true;
3995 }
3996
3997 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3998 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3999 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4000 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4001 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4002 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4003 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4004 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4005
4006 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4007 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4008 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4009 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4010 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4011 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4012 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4013 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4014 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4015 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4016 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4017 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4018 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4019 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4020 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4021 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4022 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4023 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4024 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4025 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4026 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4027 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
4028 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
4029 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
4030 Info.opc = ISD::INTRINSIC_W_CHAIN;
4031 Info.memVT = MVT::i32;
4032 Info.ptrVal = I.getArgOperand(0);
4033 Info.offset = 0;
4034 Info.flags = MachineMemOperand::MOLoad;
4035 Info.align = Align(4);
4036 return true;
4037 }
4038
4039 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4040 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4041 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4042 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4043 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4044 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4045 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4046 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4047 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4048 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4049 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4050 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4051 Info.opc = ISD::INTRINSIC_W_CHAIN;
4052 Info.memVT = MVT::v4f16;
4053 Info.ptrVal = I.getArgOperand(0);
4054 Info.offset = 0;
4055 Info.flags = MachineMemOperand::MOLoad;
4056 Info.align = Align(16);
4057 return true;
4058 }
4059
4060 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4061 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4062 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4063 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4064 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4065 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4066 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4067 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4068 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4069 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4070 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4071 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4072 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4073 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4074 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4075 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4076 Info.opc = ISD::INTRINSIC_W_CHAIN;
4077 Info.memVT = MVT::v8f32;
4078 Info.ptrVal = I.getArgOperand(0);
4079 Info.offset = 0;
4080 Info.flags = MachineMemOperand::MOLoad;
4081 Info.align = Align(16);
4082 return true;
4083 }
4084
4085 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4086 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4087 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4088 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4089
4090 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4091 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4092 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4093 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4094
4095 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4096 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4097 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4098 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4099 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4100 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4101 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4102 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4103 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4104 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4105 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4106 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4107 Info.opc = ISD::INTRINSIC_W_CHAIN;
4108 Info.memVT = MVT::v8i32;
4109 Info.ptrVal = I.getArgOperand(0);
4110 Info.offset = 0;
4111 Info.flags = MachineMemOperand::MOLoad;
4112 Info.align = Align(16);
4113 return true;
4114 }
4115
4116 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4117 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4118 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4119 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4120 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4121 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4122 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4123 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4124 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4125 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4126 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4127 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4128 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4129 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4130 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4131 Info.opc = ISD::INTRINSIC_W_CHAIN;
4132 Info.memVT = MVT::v2i32;
4133 Info.ptrVal = I.getArgOperand(0);
4134 Info.offset = 0;
4135 Info.flags = MachineMemOperand::MOLoad;
4136 Info.align = Align(8);
4137 return true;
4138 }
4139
4140 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4141 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4142 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4143 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4144
4145 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4146 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4147 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4148 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4149 Info.opc = ISD::INTRINSIC_W_CHAIN;
4150 Info.memVT = MVT::f64;
4151 Info.ptrVal = I.getArgOperand(0);
4152 Info.offset = 0;
4153 Info.flags = MachineMemOperand::MOLoad;
4154 Info.align = Align(8);
4155 return true;
4156 }
4157
4158 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4159 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4160 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4161 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4162 Info.opc = ISD::INTRINSIC_W_CHAIN;
4163 Info.memVT = MVT::v2f64;
4164 Info.ptrVal = I.getArgOperand(0);
4165 Info.offset = 0;
4166 Info.flags = MachineMemOperand::MOLoad;
4167 Info.align = Align(16);
4168 return true;
4169 }
4170
4171 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4172 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4173 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4174 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4175 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4176 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4177 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4178 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4179 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4180 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4181 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4182 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4183 Info.opc = ISD::INTRINSIC_VOID;
4184 Info.memVT = MVT::v4f16;
4185 Info.ptrVal = I.getArgOperand(0);
4186 Info.offset = 0;
4187 Info.flags = MachineMemOperand::MOStore;
4188 Info.align = Align(16);
4189 return true;
4190 }
4191
4192 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4193 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4194 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4195 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4196 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4197 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4198 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4199 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4200 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4201 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4202 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4203 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4204 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4205 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4206 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4207 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4208 Info.opc = ISD::INTRINSIC_VOID;
4209 Info.memVT = MVT::v8f32;
4210 Info.ptrVal = I.getArgOperand(0);
4211 Info.offset = 0;
4212 Info.flags = MachineMemOperand::MOStore;
4213 Info.align = Align(16);
4214 return true;
4215 }
4216
4217 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4218 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4219 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4220 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4221 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4222 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4223 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4224 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4225 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4226 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4227 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4228 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4229 Info.opc = ISD::INTRINSIC_VOID;
4230 Info.memVT = MVT::v8i32;
4231 Info.ptrVal = I.getArgOperand(0);
4232 Info.offset = 0;
4233 Info.flags = MachineMemOperand::MOStore;
4234 Info.align = Align(16);
4235 return true;
4236 }
4237
4238 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4239 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4240 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4241 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4242 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4243 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4244 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4245 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4246 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4247 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4248 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4249 Info.opc = ISD::INTRINSIC_VOID;
4250 Info.memVT = MVT::v2i32;
4251 Info.ptrVal = I.getArgOperand(0);
4252 Info.offset = 0;
4253 Info.flags = MachineMemOperand::MOStore;
4254 Info.align = Align(8);
4255 return true;
4256 }
4257
4258 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4259 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4260 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4261 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4262 Info.opc = ISD::INTRINSIC_VOID;
4263 Info.memVT = MVT::v2f64;
4264 Info.ptrVal = I.getArgOperand(0);
4265 Info.offset = 0;
4266 Info.flags = MachineMemOperand::MOStore;
4267 Info.align = Align(16);
4268 return true;
4269 }
4270
4271 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4272 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4273 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4274 Info.opc = ISD::INTRINSIC_VOID;
4275 Info.memVT = MVT::i32;
4276 Info.ptrVal = I.getArgOperand(0);
4277 Info.offset = 0;
4278 Info.flags = MachineMemOperand::MOStore;
4279 Info.align = Align(4);
4280 return true;
4281 }
4282
4283 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4284 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4285 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4286 Info.opc = ISD::INTRINSIC_VOID;
4287 Info.memVT = MVT::v4i32;
4288 Info.ptrVal = I.getArgOperand(0);
4289 Info.offset = 0;
4290 Info.flags = MachineMemOperand::MOStore;
4291 Info.align = Align(16);
4292 return true;
4293 }
4294
4295 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4296 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4297 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4298 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4299 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4300 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4301 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4302 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4303 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4304 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4305 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4306 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4307 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4308 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4309 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4310 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4311 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4312 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4313 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4314 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4315 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4316 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4317 auto &DL = I.getDataLayout();
4318 Info.opc = ISD::INTRINSIC_W_CHAIN;
4319 Info.memVT = getValueType(DL, I.getType());
4320 Info.ptrVal = I.getArgOperand(0);
4321 Info.offset = 0;
4322 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4323 Info.align.reset();
4324 return true;
4325 }
4326
4327 case Intrinsic::nvvm_prefetch_tensormap: {
4328 auto &DL = I.getDataLayout();
4329 Info.opc = ISD::INTRINSIC_VOID;
4330 Info.memVT = getPointerTy(DL);
4331 Info.ptrVal = I.getArgOperand(0);
4332 Info.offset = 0;
4333 Info.flags =
4335 Info.align.reset();
4336 return true;
4337 }
4338
4339 case Intrinsic::nvvm_ldu_global_i:
4340 case Intrinsic::nvvm_ldu_global_f:
4341 case Intrinsic::nvvm_ldu_global_p: {
4342 Info.opc = ISD::INTRINSIC_W_CHAIN;
4343 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4344 Info.ptrVal = I.getArgOperand(0);
4345 Info.offset = 0;
4346 Info.flags = MachineMemOperand::MOLoad;
4347 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4348
4349 return true;
4350 }
4351 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4352 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4353 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4354 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4355 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4356 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4357 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4358 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4359 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4360 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4361 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4362 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4363 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4364 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4365 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4366 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4367 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4368 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4369 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4370 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4371 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4372 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4373 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4374 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4375 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4376 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4377 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4378 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4379 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4380 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4381 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4382 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4383 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4384 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4385 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4386 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4387 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4388 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4389 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4390 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4391 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4392 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4393 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4394 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4395 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4396 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4397 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4398 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4399 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4400 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4401 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4402 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4403 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4404 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4405 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4406 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4407 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4408 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4409 Info.opc = ISD::INTRINSIC_W_CHAIN;
4410 Info.memVT = MVT::v4f32;
4411 Info.ptrVal = nullptr;
4412 Info.offset = 0;
4413 Info.flags = MachineMemOperand::MOLoad;
4414 Info.align = Align(16);
4415 return true;
4416
4417 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4418 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4419 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4420 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4421 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4422 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4423 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4424 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4425 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4426 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4427 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4428 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4429 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4430 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4431 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4432 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4433 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4434 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4435 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4436 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4437 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4438 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4439 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4440 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4441 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4442 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4443 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4444 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4445 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4446 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4447 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4448 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4449 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4450 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4451 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4452 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4453 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4454 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4455 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4456 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4457 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4458 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4459 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4460 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4461 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4462 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4463 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4464 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4465 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4466 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4467 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4468 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4469 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4470 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4471 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4472 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4473 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4474 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4475 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4476 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4477 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4478 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4479 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4480 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4481 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4482 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4483 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4484 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4485 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4486 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4487 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4488 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4489 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4490 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4491 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4492 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4493 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4494 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4495 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4496 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4497 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4498 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4499 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4500 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4501 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4502 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4503 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4504 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4505 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4506 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4507 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4508 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4509 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4510 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4511 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4512 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4513 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4514 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4515 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4516 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4517 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4518 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4519 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4520 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4521 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4522 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4523 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4524 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4525 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4526 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4527 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4528 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4529 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4530 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4531 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4532 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4533 Info.opc = ISD::INTRINSIC_W_CHAIN;
4534 Info.memVT = MVT::v4i32;
4535 Info.ptrVal = nullptr;
4536 Info.offset = 0;
4537 Info.flags = MachineMemOperand::MOLoad;
4538 Info.align = Align(16);
4539 return true;
4540
4541 case Intrinsic::nvvm_suld_1d_i8_clamp:
4542 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4543 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4544 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4545 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4546 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4547 case Intrinsic::nvvm_suld_2d_i8_clamp:
4548 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4549 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4550 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4551 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4552 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4553 case Intrinsic::nvvm_suld_3d_i8_clamp:
4554 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4555 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4556 case Intrinsic::nvvm_suld_1d_i8_trap:
4557 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4558 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4559 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4560 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4561 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4562 case Intrinsic::nvvm_suld_2d_i8_trap:
4563 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4564 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4565 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4566 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4567 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4568 case Intrinsic::nvvm_suld_3d_i8_trap:
4569 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4570 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4571 case Intrinsic::nvvm_suld_1d_i8_zero:
4572 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4573 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4574 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4575 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4576 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4577 case Intrinsic::nvvm_suld_2d_i8_zero:
4578 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4579 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4580 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4581 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4582 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4583 case Intrinsic::nvvm_suld_3d_i8_zero:
4584 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4585 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4586 Info.opc = ISD::INTRINSIC_W_CHAIN;
4587 Info.memVT = MVT::i8;
4588 Info.ptrVal = nullptr;
4589 Info.offset = 0;
4590 Info.flags = MachineMemOperand::MOLoad;
4591 Info.align = Align(16);
4592 return true;
4593
4594 case Intrinsic::nvvm_suld_1d_i16_clamp:
4595 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4596 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4597 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4598 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4599 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4600 case Intrinsic::nvvm_suld_2d_i16_clamp:
4601 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4602 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4603 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4604 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4605 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4606 case Intrinsic::nvvm_suld_3d_i16_clamp:
4607 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4608 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4609 case Intrinsic::nvvm_suld_1d_i16_trap:
4610 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4611 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4612 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4613 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4614 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4615 case Intrinsic::nvvm_suld_2d_i16_trap:
4616 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4617 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4618 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4619 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4620 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4621 case Intrinsic::nvvm_suld_3d_i16_trap:
4622 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4623 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4624 case Intrinsic::nvvm_suld_1d_i16_zero:
4625 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4626 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4627 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4628 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4629 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4630 case Intrinsic::nvvm_suld_2d_i16_zero:
4631 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4632 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4633 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4634 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4635 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4636 case Intrinsic::nvvm_suld_3d_i16_zero:
4637 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4638 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4639 Info.opc = ISD::INTRINSIC_W_CHAIN;
4640 Info.memVT = MVT::i16;
4641 Info.ptrVal = nullptr;
4642 Info.offset = 0;
4643 Info.flags = MachineMemOperand::MOLoad;
4644 Info.align = Align(16);
4645 return true;
4646
4647 case Intrinsic::nvvm_suld_1d_i32_clamp:
4648 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4649 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4650 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4651 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4652 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4653 case Intrinsic::nvvm_suld_2d_i32_clamp:
4654 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4655 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4656 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4657 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4658 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4659 case Intrinsic::nvvm_suld_3d_i32_clamp:
4660 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4661 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4662 case Intrinsic::nvvm_suld_1d_i32_trap:
4663 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4664 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4665 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4666 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4667 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4668 case Intrinsic::nvvm_suld_2d_i32_trap:
4669 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4670 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4671 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4672 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4673 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4674 case Intrinsic::nvvm_suld_3d_i32_trap:
4675 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4676 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4677 case Intrinsic::nvvm_suld_1d_i32_zero:
4678 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4679 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4680 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4681 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4682 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4683 case Intrinsic::nvvm_suld_2d_i32_zero:
4684 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4685 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4686 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4687 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4688 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4689 case Intrinsic::nvvm_suld_3d_i32_zero:
4690 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4691 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4692 Info.opc = ISD::INTRINSIC_W_CHAIN;
4693 Info.memVT = MVT::i32;
4694 Info.ptrVal = nullptr;
4695 Info.offset = 0;
4696 Info.flags = MachineMemOperand::MOLoad;
4697 Info.align = Align(16);
4698 return true;
4699
4700 case Intrinsic::nvvm_suld_1d_i64_clamp:
4701 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4702 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4703 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4704 case Intrinsic::nvvm_suld_2d_i64_clamp:
4705 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4706 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4707 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4708 case Intrinsic::nvvm_suld_3d_i64_clamp:
4709 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4710 case Intrinsic::nvvm_suld_1d_i64_trap:
4711 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4712 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4713 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4714 case Intrinsic::nvvm_suld_2d_i64_trap:
4715 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4716 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4717 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4718 case Intrinsic::nvvm_suld_3d_i64_trap:
4719 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4720 case Intrinsic::nvvm_suld_1d_i64_zero:
4721 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4722 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4723 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4724 case Intrinsic::nvvm_suld_2d_i64_zero:
4725 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4726 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4727 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4728 case Intrinsic::nvvm_suld_3d_i64_zero:
4729 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4730 Info.opc = ISD::INTRINSIC_W_CHAIN;
4731 Info.memVT = MVT::i64;
4732 Info.ptrVal = nullptr;
4733 Info.offset = 0;
4734 Info.flags = MachineMemOperand::MOLoad;
4735 Info.align = Align(16);
4736 return true;
4737
4738 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
4739 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
4740 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
4741 Info.opc = ISD::INTRINSIC_W_CHAIN;
4742 Info.memVT = MVT::v1i32;
4743 Info.ptrVal = I.getArgOperand(0);
4744 Info.offset = 0;
4745 Info.flags = MachineMemOperand::MOLoad;
4746 Info.align.reset();
4747 return true;
4748 }
4749
4750 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
4751 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
4752 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
4753 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: {
4754 Info.opc = ISD::INTRINSIC_W_CHAIN;
4755 Info.memVT = MVT::v2i32;
4756 Info.ptrVal = I.getArgOperand(0);
4757 Info.offset = 0;
4758 Info.flags = MachineMemOperand::MOLoad;
4759 Info.align.reset();
4760 return true;
4761 }
4762
4763 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
4764 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
4765 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
4766 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
4767 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: {
4768 Info.opc = ISD::INTRINSIC_W_CHAIN;
4769 Info.memVT = MVT::v4i32;
4770 Info.ptrVal = I.getArgOperand(0);
4771 Info.offset = 0;
4772 Info.flags = MachineMemOperand::MOLoad;
4773 Info.align.reset();
4774 return true;
4775 }
4776
4777 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
4778 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
4779 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
4780 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
4781 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: {
4782 Info.opc = ISD::INTRINSIC_W_CHAIN;
4783 Info.memVT = MVT::v8i32;
4784 Info.ptrVal = I.getArgOperand(0);
4785 Info.offset = 0;
4786 Info.flags = MachineMemOperand::MOLoad;
4787 Info.align.reset();
4788 return true;
4789 }
4790
4791 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
4792 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
4793 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
4794 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
4795 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: {
4796 Info.opc = ISD::INTRINSIC_W_CHAIN;
4797 Info.memVT = MVT::v16i32;
4798 Info.ptrVal = I.getArgOperand(0);
4799 Info.offset = 0;
4800 Info.flags = MachineMemOperand::MOLoad;
4801 Info.align.reset();
4802 return true;
4803 }
4804
4805 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
4806 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
4807 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
4808 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
4809 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: {
4810 Info.opc = ISD::INTRINSIC_W_CHAIN;
4811 Info.memVT = MVT::v32i32;
4812 Info.ptrVal = I.getArgOperand(0);
4813 Info.offset = 0;
4814 Info.flags = MachineMemOperand::MOLoad;
4815 Info.align.reset();
4816 return true;
4817 }
4818
4819 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
4820 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
4821 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
4822 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
4823 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: {
4824 Info.opc = ISD::INTRINSIC_W_CHAIN;
4825 Info.memVT = MVT::v64i32;
4826 Info.ptrVal = I.getArgOperand(0);
4827 Info.offset = 0;
4828 Info.flags = MachineMemOperand::MOLoad;
4829 Info.align.reset();
4830 return true;
4831 }
4832
4833 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
4834 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
4835 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
4836 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
4837 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
4838 Info.opc = ISD::INTRINSIC_W_CHAIN;
4839 Info.memVT = MVT::v128i32;
4840 Info.ptrVal = I.getArgOperand(0);
4841 Info.offset = 0;
4842 Info.flags = MachineMemOperand::MOLoad;
4843 Info.align.reset();
4844 return true;
4845 }
4846
4847 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
4848 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
4849 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
4850 Info.opc = ISD::INTRINSIC_VOID;
4851 Info.memVT = MVT::i32;
4852 Info.ptrVal = I.getArgOperand(0);
4853 Info.offset = 0;
4854 Info.flags = MachineMemOperand::MOStore;
4855 Info.align.reset();
4856 return true;
4857 }
4858
4859 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
4860 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
4861 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
4862 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
4863 Info.opc = ISD::INTRINSIC_VOID;
4864 Info.memVT = MVT::v2i32;
4865 Info.ptrVal = I.getArgOperand(0);
4866 Info.offset = 0;
4867 Info.flags = MachineMemOperand::MOStore;
4868 Info.align.reset();
4869 return true;
4870 }
4871
4872 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
4873 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
4874 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
4875 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
4876 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
4877 Info.opc = ISD::INTRINSIC_VOID;
4878 Info.memVT = MVT::v4i32;
4879 Info.ptrVal = I.getArgOperand(0);
4880 Info.offset = 0;
4881 Info.flags = MachineMemOperand::MOStore;
4882 Info.align.reset();
4883 return true;
4884 }
4885
4886 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
4887 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
4888 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
4889 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
4890 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
4891 Info.opc = ISD::INTRINSIC_VOID;
4892 Info.memVT = MVT::v8i32;
4893 Info.ptrVal = I.getArgOperand(0);
4894 Info.offset = 0;
4895 Info.flags = MachineMemOperand::MOStore;
4896 Info.align.reset();
4897 return true;
4898 }
4899
4900 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
4901 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
4902 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
4903 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
4904 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
4905 Info.opc = ISD::INTRINSIC_VOID;
4906 Info.memVT = MVT::v16i32;
4907 Info.ptrVal = I.getArgOperand(0);
4908 Info.offset = 0;
4909 Info.flags = MachineMemOperand::MOStore;
4910 Info.align.reset();
4911 return true;
4912 }
4913
4914 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
4915 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
4916 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
4917 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
4918 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
4919 Info.opc = ISD::INTRINSIC_VOID;
4920 Info.memVT = MVT::v32i32;
4921 Info.ptrVal = I.getArgOperand(0);
4922 Info.offset = 0;
4923 Info.flags = MachineMemOperand::MOStore;
4924 Info.align.reset();
4925 return true;
4926 }
4927
4928 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
4929 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
4930 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
4931 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
4932 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
4933 Info.opc = ISD::INTRINSIC_VOID;
4934 Info.memVT = MVT::v64i32;
4935 Info.ptrVal = I.getArgOperand(0);
4936 Info.offset = 0;
4937 Info.flags = MachineMemOperand::MOStore;
4938 Info.align.reset();
4939 return true;
4940 }
4941
4942 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
4943 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
4944 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
4945 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
4946 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
4947 Info.opc = ISD::INTRINSIC_VOID;
4948 Info.memVT = MVT::v128i32;
4949 Info.ptrVal = I.getArgOperand(0);
4950 Info.offset = 0;
4951 Info.flags = MachineMemOperand::MOStore;
4952 Info.align.reset();
4953 return true;
4954 }
4955 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
4956 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
4957 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
4958 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
4959 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
4960 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
4961 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
4962 case Intrinsic::
4963 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
4964 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
4965 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
4966 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
4967 case Intrinsic::
4968 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
4969 // We are reading and writing back to TMem
4970 Info.opc = ISD::INTRINSIC_VOID;
4971 Info.memVT = MVT::v4i32;
4972 Info.ptrVal = I.getArgOperand(0);
4973 Info.offset = 0;
4974 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4975 Info.align = Align(16);
4976 return true;
4977 }
4978
4979 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
4980 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
4981 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
4982 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
4983 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
4984 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
4985 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
4986 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
4987 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
4988 case Intrinsic::
4989 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
4990 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
4991 case Intrinsic::
4992 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
4993 // We are reading and writing back to TMem
4994 Info.opc = ISD::INTRINSIC_VOID;
4995 Info.memVT = MVT::v8i32;
4996 Info.ptrVal = I.getArgOperand(0);
4997 Info.offset = 0;
4998 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4999 Info.align = Align(16);
5000 return true;
5001 }
5002 }
5003 return false;
5004}
5005
5006/// getFunctionParamOptimizedAlign - since function arguments are passed via
5007/// .param space, we may want to increase their alignment in a way that
5008/// ensures that we can effectively vectorize their loads & stores. We can
5009 /// increase alignment only if the function has internal or private
5010 /// linkage, since for other linkage types callers may already rely on the default
5011/// alignment. To allow using 128-bit vectorized loads/stores, this function
5012/// ensures that alignment is 16 or greater.
5013 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5014 const Function *F, Type *ArgTy, const DataLayout &DL) const {
5015 // Capping the alignment to 128 bytes as that is the maximum alignment
5016 // supported by PTX.
5017 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5018
5019 // If a function has linkage different from internal or private, we
5020 // must use default ABI alignment as external users rely on it. Same
5021 // for a function that may be called from a function pointer.
5022 if (!F || !F->hasLocalLinkage() ||
5023 F->hasAddressTaken(/*Users=*/nullptr,
5024 /*IgnoreCallbackUses=*/false,
5025 /*IgnoreAssumeLikeCalls=*/true,
5026 /*IgnoreLLVMUsed=*/true))
5027 return ABITypeAlign;
5028
5029 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5030 return std::max(Align(16), ABITypeAlign);
5031}
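// Illustrative sketch (hypothetical type, not part of this file): a plain
//   struct S { float a, b, c, d; };   // ABI alignment is only 4
// passed to an internal-linkage device function has its .param alignment
// raised to 16 here, so its reads can be vectorized into a single 128-bit
// ld.param.v4.f32; an address-taken or externally visible function keeps
// the plain ABI alignment instead.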
5032
5033/// Helper for computing alignment of a device function byval parameter.
5034 Align NVPTXTargetLowering::getFunctionByValParamAlign(
5035 const Function *F, Type *ArgTy, Align InitialAlign,
5036 const DataLayout &DL) const {
5037 Align ArgAlign = InitialAlign;
5038 // Try to increase alignment to enhance vectorization options.
5039 if (F)
5040 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5041
5042 // Old ptx versions have a bug. When PTX code takes address of
5043 // byval parameter with alignment < 4, ptxas generates code to
5044 // spill argument into memory. Alas on sm_50+ ptxas generates
5045 // SASS code that fails with misaligned access. To work around
5046 // the problem, make sure that we align byval parameters by at
5047 // least 4. This bug seems to be fixed at least starting from
5048 // ptxas > 9.0.
5049 // TODO: remove this after verifying the bug is not reproduced
5050 // on non-deprecated ptxas versions.
5051 if (ForceMinByValParamAlign)
5052 ArgAlign = std::max(ArgAlign, Align(4));
5053
5054 return ArgAlign;
5055}
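// Rough example (hypothetical byval struct, default options assumed): a
// byval parameter with ABI alignment 1 keeps InitialAlign for an external
// callee, is raised to 16 for a local-linkage callee via the helper above,
// and with -nvptx-force-min-byval-param-align is additionally clamped to at
// least 4 to sidestep the old ptxas spill issue.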
5056
5057 // Helper for getting a function parameter name. The name is composed from
5058 // its index and the function name. A negative index corresponds to the special
5059 // parameter (unsized array) used for passing variable arguments.
5060 std::string NVPTXTargetLowering::getParamName(const Function *F,
5061 int Idx) const {
5062 std::string ParamName;
5063 raw_string_ostream ParamStr(ParamName);
5064
5065 ParamStr << getTargetMachine().getSymbol(F)->getName();
5066 if (Idx < 0)
5067 ParamStr << "_vararg";
5068 else
5069 ParamStr << "_param_" << Idx;
5070
5071 return ParamName;
5072}
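// For a function whose PTX symbol happens to be "foo" (hypothetical name),
// this yields "foo_param_0", "foo_param_1", ... for ordinary parameters and
// "foo_vararg" for the unsized array that carries variadic arguments.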
5073
5074/// isLegalAddressingMode - Return true if the addressing mode represented
5075/// by AM is legal for this target, for a load/store of the specified type.
5076/// Used to guide target specific optimizations, like loop strength reduction
5077/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5078/// (CodeGenPrepare.cpp)
5079 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5080 const AddrMode &AM, Type *Ty,
5081 unsigned AS, Instruction *I) const {
5082 // AddrMode - This represents an addressing mode of:
5083 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5084 //
5085 // The legal address modes are
5086 // - [avar]
5087 // - [areg]
5088 // - [areg+immoff]
5089 // - [immAddr]
5090
5091 // immoff must fit in a signed 32-bit int
5092 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5093 return false;
5094
5095 if (AM.BaseGV)
5096 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5097
5098 switch (AM.Scale) {
5099 case 0: // "r", "r+i" or "i" is allowed
5100 break;
5101 case 1:
5102 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5103 return false;
5104 // Otherwise we have r+i.
5105 break;
5106 default:
5107 // No scale > 1 is allowed
5108 return false;
5109 }
5110 return true;
5111}
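// A few sample queries and their results (field values are hypothetical):
//   {BaseGV=g, BaseOffs=0,  HasBaseReg=0, Scale=0} -> true  ([avar])
//   {BaseGV=0, BaseOffs=16, HasBaseReg=1, Scale=0} -> true  ([areg+16])
//   {BaseGV=0, BaseOffs=0,  HasBaseReg=1, Scale=1} -> false (r+r)
//   {BaseGV=0, BaseOffs=0,  HasBaseReg=1, Scale=4} -> false (scaled index)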
5112
5113//===----------------------------------------------------------------------===//
5114// NVPTX Inline Assembly Support
5115//===----------------------------------------------------------------------===//
5116
5117/// getConstraintType - Given a constraint letter, return the type of
5118/// constraint it is for this target.
5119 NVPTXTargetLowering::ConstraintType
5120 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5121 if (Constraint.size() == 1) {
5122 switch (Constraint[0]) {
5123 default:
5124 break;
5125 case 'b':
5126 case 'r':
5127 case 'h':
5128 case 'c':
5129 case 'l':
5130 case 'f':
5131 case 'd':
5132 case 'q':
5133 case '0':
5134 case 'N':
5135 return C_RegisterClass;
5136 }
5137 }
5138 return TargetLowering::getConstraintType(Constraint);
5139}
5140
5141std::pair<unsigned, const TargetRegisterClass *>
5142 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5143 StringRef Constraint,
5144 MVT VT) const {
5145 if (Constraint.size() == 1) {
5146 switch (Constraint[0]) {
5147 case 'b':
5148 return std::make_pair(0U, &NVPTX::B1RegClass);
5149 case 'c':
5150 case 'h':
5151 return std::make_pair(0U, &NVPTX::B16RegClass);
5152 case 'r':
5153 case 'f':
5154 return std::make_pair(0U, &NVPTX::B32RegClass);
5155 case 'l':
5156 case 'N':
5157 case 'd':
5158 return std::make_pair(0U, &NVPTX::B64RegClass);
5159 case 'q': {
5160 if (STI.getSmVersion() < 70)
5161 report_fatal_error("Inline asm with 128 bit operands is only "
5162 "supported for sm_70 and higher!");
5163 return std::make_pair(0U, &NVPTX::B128RegClass);
5164 }
5165 }
5166 }
5167 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5168}
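// For reference, a typical way these constraints reach this hook from CUDA
// inline PTX (hypothetical user code, not part of this file):
//   int r;
//   asm("add.s32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
// 'r' selects the 32-bit B32 class above, 'l' the 64-bit B64 class,
// 'h' the 16-bit B16 class, and 'q' the 128-bit class on sm_70+.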
5169
5170//===----------------------------------------------------------------------===//
5171// NVPTX DAG Combining
5172//===----------------------------------------------------------------------===//
5173
5174 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5175 CodeGenOptLevel OptLevel) const {
5176 // Always honor command-line argument
5177 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5178 return FMAContractLevelOpt > 0;
5179
5180 // Do not contract if we're not optimizing the code.
5181 if (OptLevel == CodeGenOptLevel::None)
5182 return false;
5183
5184 // Honor TargetOptions flags that explicitly say fusion is okay.
5185 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5186 return true;
5187
5188 return false;
5189}
5190
5191static bool isConstZero(const SDValue &Operand) {
5192 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5193 return Const && Const->getZExtValue() == 0;
5194}
5195
5196/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5197/// operands N0 and N1. This is a helper for PerformADDCombine that is
5198/// called with the default operands, and if that fails, with commuted
5199/// operands.
5200static SDValue
5201 PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5202 TargetLowering::DAGCombinerInfo &DCI) {
5203 EVT VT = N0.getValueType();
5204
5205 // Since integer multiply-add costs the same as integer multiply
5206 // but is more costly than integer add, do the fusion only when
5207 // the mul is only used in the add.
5208 // TODO: this may not be true for later architectures, consider relaxing this
5209 if (!N0.getNode()->hasOneUse())
5210 return SDValue();
5211
5212 // fold (add (select cond, 0, (mul a, b)), c)
5213 // -> (select cond, c, (add (mul a, b), c))
5214 //
5215 if (N0.getOpcode() == ISD::SELECT) {
5216 unsigned ZeroOpNum;
5217 if (isConstZero(N0->getOperand(1)))
5218 ZeroOpNum = 1;
5219 else if (isConstZero(N0->getOperand(2)))
5220 ZeroOpNum = 2;
5221 else
5222 return SDValue();
5223
5224 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5225 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5226 return SDValue();
5227
5228 SDLoc DL(N);
5229 SDValue Mul =
5230 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5231 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
5232 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5233 ((ZeroOpNum == 1) ? N1 : MAD),
5234 ((ZeroOpNum == 1) ? MAD : N1));
5235 }
5236
5237 return SDValue();
5238}
5239
5240static SDValue
5241 PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5242 TargetLowering::DAGCombinerInfo &DCI,
5243 CodeGenOptLevel OptLevel) {
5244 EVT VT = N0.getValueType();
5245 if (N0.getOpcode() == ISD::FMUL) {
5246 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5247 &DCI.DAG.getTargetLoweringInfo());
5248 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5249 (N->getFlags().hasAllowContract() &&
5250 N0->getFlags().hasAllowContract())))
5251 return SDValue();
5252
5253 // For floating point:
5254 // Do the fusion only when the mul has fewer than 5 uses and all
5255 // of them are adds.
5256 // The heuristic is that if a use is not an add, then that use
5257 // cannot be fused into an fma, so the mul is still needed anyway.
5258 // If there are more than 4 uses, even if they are all adds, fusing
5259 // them will increase register pressure.
5260 //
5261 int numUses = 0;
5262 int nonAddCount = 0;
5263 for (const SDNode *User : N0.getNode()->users()) {
5264 numUses++;
5265 if (User->getOpcode() != ISD::FADD)
5266 ++nonAddCount;
5267 if (numUses >= 5)
5268 return SDValue();
5269 }
5270 if (nonAddCount) {
5271 int orderNo = N->getIROrder();
5272 int orderNo2 = N0.getNode()->getIROrder();
5273 // A simple heuristic for estimating potential register pressure:
5274 // the difference in IR order is used to measure the distance between
5275 // the def and the use; the longer the distance, the more likely the
5276 // fusion is to increase register pressure.
5277 if (orderNo - orderNo2 < 500)
5278 return SDValue();
5279
5280 // Now, check if at least one of the FMUL's operands is live beyond the
5281 // node N, which guarantees that the FMA will not increase register
5282 // pressure at node N.
5283 bool opIsLive = false;
5284 const SDNode *left = N0.getOperand(0).getNode();
5285 const SDNode *right = N0.getOperand(1).getNode();
5286
5287 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5288 opIsLive = true;
5289
5290 if (!opIsLive)
5291 for (const SDNode *User : left->users()) {
5292 int orderNo3 = User->getIROrder();
5293 if (orderNo3 > orderNo) {
5294 opIsLive = true;
5295 break;
5296 }
5297 }
5298
5299 if (!opIsLive)
5300 for (const SDNode *User : right->users()) {
5301 int orderNo3 = User->getIROrder();
5302 if (orderNo3 > orderNo) {
5303 opIsLive = true;
5304 break;
5305 }
5306 }
5307
5308 if (!opIsLive)
5309 return SDValue();
5310 }
5311
5312 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5313 N0.getOperand(1), N1);
5314 }
5315
5316 return SDValue();
5317}
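// In short, when contraction is permitted and the heuristics above pass,
//   (fadd (fmul a, b), c)  ->  (fma a, b, c)
// which later selects to a single fused multiply-add instruction.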
5318
5319/// Fold unpacking movs into a load by increasing the number of return values.
5320///
5321/// ex:
5322/// L: v2f16,ch = load <p>
5323/// a: f16 = extractelt L:0, 0
5324/// b: f16 = extractelt L:0, 1
5325/// use(a, b)
5326///
5327/// ...is turned into...
5328///
5329/// L: f16,f16,ch = LoadV2 <p>
5330/// use(L:0, L:1)
5331static SDValue
5332 combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5333 // Don't run this optimization before the legalizer
5334 if (!DCI.isAfterLegalizeDAG())
5335 return SDValue();
5336
5337 EVT ElementVT = N->getValueType(0);
5338 // Avoid non-packed types and v4i8
5339 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5340 return SDValue();
5341
5342 SmallVector<SDNode *> DeadCopyToRegs;
5343
5344 // Check whether all outputs are either used by an extractelt or are
5345 // glue/chain nodes
5346 if (!all_of(N->uses(), [&](SDUse &U) {
5347 // Skip glue, chain nodes
5348 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5349 return true;
5350 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5351 if (N->getOpcode() != ISD::LOAD)
5352 return true;
5353 // Since this is an ISD::LOAD, check all extractelts are used. If
5354 // any are not used, we don't want to defeat another optimization that
5355 // will narrow the load.
5356 //
5357 // For example:
5358 //
5359 // L: v2f16,ch = load <p>
5360 // e0: f16 = extractelt L:0, 0
5361 // e1: f16 = extractelt L:0, 1 <-- unused
5362 // store e0
5363 //
5364 // Can be optimized by DAGCombiner to:
5365 //
5366 // L: f16,ch = load <p>
5367 // store L:0
5368 return !U.getUser()->use_empty();
5369 }
5370
5371 // Otherwise, this use prevents us from splitting a value.
5372 return false;
5373 }))
5374 return SDValue();
5375
5376 auto *LD = cast<MemSDNode>(N);
5377 SDLoc DL(LD);
5378
5379 // the new opcode after we double the number of operands
5380 NVPTXISD::NodeType Opcode;
5381 SmallVector<SDValue> Operands(LD->ops());
5382 unsigned OldNumOutputs; // non-glue, non-chain outputs
5383 switch (LD->getOpcode()) {
5384 case ISD::LOAD:
5385 OldNumOutputs = 1;
5386 // Any packed type is legal, so the legalizer will not have lowered
5387 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5388 // here.
5389 Opcode = NVPTXISD::LoadV2;
5390 Operands.push_back(DCI.DAG.getIntPtrConstant(
5391 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5392 break;
5393 case NVPTXISD::LoadV2:
5394 OldNumOutputs = 2;
5395 Opcode = NVPTXISD::LoadV4;
5396 break;
5397 case NVPTXISD::LoadV4:
5398 // V8 is only supported for f32. Don't forget, we're not changing the load
5399 // size here. This is already a 256-bit load.
5400 if (ElementVT != MVT::v2f32)
5401 return SDValue();
5402 OldNumOutputs = 4;
5403 Opcode = NVPTXISD::LoadV8;
5404 break;
5405 case NVPTXISD::LoadV8:
5406 // PTX doesn't support the next doubling of outputs
5407 return SDValue();
5408 }
5409
5410 // the non-glue, non-chain outputs in the new load
5411 const unsigned NewNumOutputs = OldNumOutputs * 2;
5412 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5413 // add remaining chain and glue values
5414 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5415
5416 // Create the new load
5417 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5418 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5419 LD->getMemOperand());
5420
5421 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5422 // the outputs the same. These nodes will be optimized away in later
5423 // DAGCombiner iterations.
5424 SmallVector<SDValue> Results;
5425 for (unsigned I : seq(OldNumOutputs))
5426 Results.push_back(DCI.DAG.getBuildVector(
5427 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5428 // Add remaining chain and glue nodes
5429 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5430 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5431
5432 return DCI.DAG.getMergeValues(Results, DL);
5433}
5434
5435/// Fold packing movs into a store.
5436///
5437/// ex:
5438/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5439/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5440/// StoreV2 v1, v2
5441///
5442/// ...is turned into...
5443///
5444/// StoreV4 a, b, c, d
5445 static SDValue combinePackingMovIntoStore(SDNode *N,
5446 TargetLowering::DAGCombinerInfo &DCI,
5447 unsigned Front, unsigned Back) {
5448 // We want to run this as late as possible since other optimizations may
5449 // eliminate the BUILD_VECTORs.
5450 if (!DCI.isAfterLegalizeDAG())
5451 return SDValue();
5452
5453 // Get the type of the operands being stored.
5454 EVT ElementVT = N->getOperand(Front).getValueType();
5455
5456 // Avoid non-packed types and v4i8
5457 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5458 return SDValue();
5459
5460 auto *ST = cast<MemSDNode>(N);
5461
5462 // The new opcode after we double the number of operands.
5463 NVPTXISD::NodeType Opcode;
5464 switch (N->getOpcode()) {
5465 case ISD::STORE:
5466 // Any packed type is legal, so the legalizer will not have lowered
5467 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5468 // it here.
5469 Opcode = NVPTXISD::StoreV2;
5470 break;
5471 case NVPTXISD::StoreV2:
5472 Opcode = NVPTXISD::StoreV4;
5473 break;
5474 case NVPTXISD::StoreV4:
5475 // V8 is only supported for f32. Don't forget, we're not changing the store
5476 // size here. This is already a 256-bit store.
5477 if (ElementVT != MVT::v2f32)
5478 return SDValue();
5479 Opcode = NVPTXISD::StoreV8;
5480 break;
5481 case NVPTXISD::StoreV8:
5482 // PTX doesn't support the next doubling of operands
5483 return SDValue();
5484 default:
5485 llvm_unreachable("Unhandled store opcode");
5486 }
5487
5488 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5489 // their elements.
5490 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5491 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5492 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5493 return SDValue();
5494
5495 // If the operand has multiple uses, this optimization can increase register
5496 // pressure.
5497 if (!BV.hasOneUse())
5498 return SDValue();
5499
5500 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
5501 // any signs they may be folded by some other pattern or rule.
5502 for (SDValue Op : BV->ops()) {
5503 // Peek through bitcasts
5504 if (Op.getOpcode() == ISD::BITCAST)
5505 Op = Op.getOperand(0);
5506
5507 // This may be folded into a PRMT.
5508 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
5509 Op->getOperand(0).getValueType() == MVT::i32)
5510 return SDValue();
5511
5512 // This may be folded into cvt.bf16x2
5513 if (Op.getOpcode() == ISD::FP_ROUND)
5514 return SDValue();
5515 }
5516 Operands.append({BV.getOperand(0), BV.getOperand(1)});
5517 }
5518 Operands.append(N->op_end() - Back, N->op_end());
5519
5520 // Now we replace the store
5521 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
5522 ST->getMemoryVT(), ST->getMemOperand());
5523}
5524
5525 static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5526 const NVPTXSubtarget &STI) {
5527
5528 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
5529 // Here is our chance to custom lower a store with a non-simple type.
5530 // Unfortunately, we can't do this in the legalizer because there is no
5531 // way to setOperationAction for a non-simple type.
5532 StoreSDNode *ST = cast<StoreSDNode>(N);
5533 if (!ST->getValue().getValueType().isSimple())
5534 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
5535 }
5536
5537 return combinePackingMovIntoStore(N, DCI, 1, 2);
5538}
5539
5540 static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5541 const NVPTXSubtarget &STI) {
5542 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
5543 // Here is our chance to custom lower a load with a non-simple type.
5544 // Unfortunately, we can't do this in the legalizer because there is no
5545 // way to setOperationAction for a non-simple type.
5546 if (!N->getValueType(0).isSimple())
5547 return lowerLoadVector(N, DCI.DAG, STI);
5548 }
5549
5550 return combineUnpackingMovIntoLoad(N, DCI);
5551}
5552
5553/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5554///
5555 static SDValue PerformADDCombine(SDNode *N,
5556 TargetLowering::DAGCombinerInfo &DCI,
5557 CodeGenOptLevel OptLevel) {
5558 if (OptLevel == CodeGenOptLevel::None)
5559 return SDValue();
5560
5561 SDValue N0 = N->getOperand(0);
5562 SDValue N1 = N->getOperand(1);
5563
5564 // Skip non-integer, non-scalar case
5565 EVT VT = N0.getValueType();
5566 if (VT.isVector() || VT != MVT::i32)
5567 return SDValue();
5568
5569 // First try with the default operand order.
5570 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5571 return Result;
5572
5573 // If that didn't work, try again with the operands commuted.
5574 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5575}
5576
5577/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5578///
5579 static SDValue PerformFADDCombine(SDNode *N,
5580 TargetLowering::DAGCombinerInfo &DCI,
5581 CodeGenOptLevel OptLevel) {
5582 SDValue N0 = N->getOperand(0);
5583 SDValue N1 = N->getOperand(1);
5584
5585 EVT VT = N0.getValueType();
5586 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5587 return SDValue();
5588
5589 // First try with the default operand order.
5590 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5591 return Result;
5592
5593 // If that didn't work, try again with the operands commuted.
5594 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5595}
5596
5597/// Get 3-input version of a 2-input min/max opcode
5598static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode) {
5599 switch (MinMax2Opcode) {
5600 case ISD::FMAXNUM:
5601 case ISD::FMAXIMUMNUM:
5602 return NVPTXISD::FMAXNUM3;
5603 case ISD::FMINNUM:
5604 case ISD::FMINIMUMNUM:
5605 return NVPTXISD::FMINNUM3;
5606 case ISD::FMAXIMUM:
5607 return NVPTXISD::FMAXIMUM3;
5608 case ISD::FMINIMUM:
5609 return NVPTXISD::FMINIMUM3;
5610 default:
5611 llvm_unreachable("Invalid 2-input min/max opcode");
5612 }
5613}
5614
5615/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
5616/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
5617 static SDValue PerformFMinMaxCombine(SDNode *N,
5618 TargetLowering::DAGCombinerInfo &DCI,
5619 unsigned PTXVersion, unsigned SmVersion) {
5620
5621 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
5622 EVT VT = N->getValueType(0);
5623 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
5624 return SDValue();
5625
5626 SDValue Op0 = N->getOperand(0);
5627 SDValue Op1 = N->getOperand(1);
5628 unsigned MinMaxOp2 = N->getOpcode();
5629 NVPTXISD::NodeType MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
5630
5631 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
5632 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
5633 SDValue A = Op0.getOperand(0);
5634 SDValue B = Op0.getOperand(1);
5635 SDValue C = Op1;
5636 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5637 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
5638 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
5639 SDValue A = Op0;
5640 SDValue B = Op1.getOperand(0);
5641 SDValue C = Op1.getOperand(1);
5642 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5643 }
5644 return SDValue();
5645}
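// Example: with PTX 8.8+ on sm_100+, an f32 reduction tree such as
//   (fmaxnum (fmaxnum a, b), c)
// collapses to a single FMAXNUM3 node, i.e. the three-input form of
// max.f32 rather than two dependent two-input max instructions.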
5646
5647 static SDValue PerformREMCombine(SDNode *N,
5648 TargetLowering::DAGCombinerInfo &DCI,
5649 CodeGenOptLevel OptLevel) {
5650 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5651
5652 // Don't do anything at less than -O2.
5653 if (OptLevel < CodeGenOptLevel::Default)
5654 return SDValue();
5655
5656 SelectionDAG &DAG = DCI.DAG;
5657 SDLoc DL(N);
5658 EVT VT = N->getValueType(0);
5659 bool IsSigned = N->getOpcode() == ISD::SREM;
5660 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5661
5662 const SDValue &Num = N->getOperand(0);
5663 const SDValue &Den = N->getOperand(1);
5664
5665 for (const SDNode *U : Num->users()) {
5666 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5667 U->getOperand(1) == Den) {
5668 // Num % Den -> Num - (Num / Den) * Den
5669 return DAG.getNode(ISD::SUB, DL, VT, Num,
5670 DAG.getNode(ISD::MUL, DL, VT,
5671 DAG.getNode(DivOpc, DL, VT, Num, Den),
5672 Den));
5673 }
5674 }
5675 return SDValue();
5676}
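// Example: if a function computes both x / y and x % y, the remainder is
// rewritten as x - (x / y) * y, so only one division remains and the rest
// is a multiply and a subtract, which are cheaper than a second div/rem.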
5677
5678// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
5679 static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5680 CodeGenOptLevel OptLevel) {
5681 if (OptLevel == CodeGenOptLevel::None)
5682 return SDValue();
5683
5684 SDValue Op = N->getOperand(0);
5685 if (!Op.hasOneUse())
5686 return SDValue();
5687 EVT ToVT = N->getValueType(0);
5688 EVT FromVT = Op.getValueType();
5689 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
5690 (ToVT == MVT::i64 && FromVT == MVT::i32)))
5691 return SDValue();
5692 if (!(Op.getOpcode() == ISD::MUL ||
5693 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
5694 return SDValue();
5695
5696 SDLoc DL(N);
5697 unsigned ExtOpcode = N->getOpcode();
5698 unsigned Opcode = 0;
5699 if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
5700 Opcode = NVPTXISD::MUL_WIDE_SIGNED;
5701 else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
5702 Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
5703 else
5704 return SDValue();
5705 SDValue RHS = Op.getOperand(1);
5706 if (Op.getOpcode() == ISD::SHL) {
5707 const auto ShiftAmt = Op.getConstantOperandVal(1);
5708 const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
5709 RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
5710 }
5711 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
5712}
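// Example (wrap flags assumed present on the narrow operation):
//   (zext nuw (mul i16 %a, %b) to i32) -> (MUL_WIDE_UNSIGNED %a, %b)
//   (sext nsw (shl i16 %a, 3)  to i32) -> (MUL_WIDE_SIGNED  %a, 8)
// both of which select to a single mul.wide instruction.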
5713
5714 enum OperandSignedness {
5715 Signed = 0,
5716 Unsigned,
5717 Unknown
5718 };
5719
5720/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5721/// that can be demoted to \p OptSize bits without loss of information. The
5722/// signedness of the operand, if determinable, is placed in \p S.
5723 static bool IsMulWideOperandDemotable(SDValue Op,
5724 unsigned OptSize,
5725 OperandSignedness &S) {
5726 S = Unknown;
5727
5728 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5729 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5730 EVT OrigVT = Op.getOperand(0).getValueType();
5731 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5732 S = Signed;
5733 return true;
5734 }
5735 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5736 EVT OrigVT = Op.getOperand(0).getValueType();
5737 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5738 S = Unsigned;
5739 return true;
5740 }
5741 }
5742
5743 return false;
5744}
5745
5746/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5747/// be demoted to \p OptSize bits without loss of information. If the operands
5748/// contain a constant, it should appear as the RHS operand. The signedness of
5749/// the operands is placed in \p IsSigned.
5750 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5751 unsigned OptSize,
5752 bool &IsSigned) {
5753 OperandSignedness LHSSign;
5754
5755 // The LHS operand must be a demotable op
5756 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5757 return false;
5758
5759 // We should have been able to determine the signedness from the LHS
5760 if (LHSSign == Unknown)
5761 return false;
5762
5763 IsSigned = (LHSSign == Signed);
5764
5765 // The RHS can be a demotable op or a constant
5766 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5767 const APInt &Val = CI->getAPIntValue();
5768 if (LHSSign == Unsigned) {
5769 return Val.isIntN(OptSize);
5770 } else {
5771 return Val.isSignedIntN(OptSize);
5772 }
5773 } else {
5774 OperandSignedness RHSSign;
5775 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5776 return false;
5777
5778 return LHSSign == RHSSign;
5779 }
5780}
5781
5782/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5783/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5784/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5785/// amount.
5786 static SDValue TryMULWIDECombine(SDNode *N,
5787 TargetLowering::DAGCombinerInfo &DCI) {
5788 EVT MulType = N->getValueType(0);
5789 if (MulType != MVT::i32 && MulType != MVT::i64) {
5790 return SDValue();
5791 }
5792
5793 SDLoc DL(N);
5794 unsigned OptSize = MulType.getSizeInBits() >> 1;
5795 SDValue LHS = N->getOperand(0);
5796 SDValue RHS = N->getOperand(1);
5797
5798 // Canonicalize the multiply so the constant (if any) is on the right
5799 if (N->getOpcode() == ISD::MUL) {
5800 if (isa<ConstantSDNode>(LHS)) {
5801 std::swap(LHS, RHS);
5802 }
5803 }
5804
5805 // If we have a SHL, determine the actual multiply amount
5806 if (N->getOpcode() == ISD::SHL) {
5807 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
5808 if (!ShlRHS) {
5809 return SDValue();
5810 }
5811
5812 APInt ShiftAmt = ShlRHS->getAPIntValue();
5813 unsigned BitWidth = MulType.getSizeInBits();
5814 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5815 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5816 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5817 } else {
5818 return SDValue();
5819 }
5820 }
5821
5822 bool Signed;
5823 // Verify that our operands are demotable
5824 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5825 return SDValue();
5826 }
5827
5828 EVT DemotedVT;
5829 if (MulType == MVT::i32) {
5830 DemotedVT = MVT::i16;
5831 } else {
5832 DemotedVT = MVT::i32;
5833 }
5834
5835 // Truncate the operands to the correct size. Note that these are just for
5836 // type consistency and will (likely) be eliminated in later phases.
5837 SDValue TruncLHS =
5838 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5839 SDValue TruncRHS =
5840 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5841
5842 unsigned Opc;
5843 if (Signed) {
5844 Opc = NVPTXISD::MUL_WIDE_SIGNED;
5845 } else {
5846 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5847 }
5848
5849 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5850}
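// Example: a 32-bit multiply whose operands are both sign-extended from
// i16, (mul i32 (sext i16 %a), (sext i16 %b)), is demoted to a 16x16->32
// MUL_WIDE_SIGNED, i.e. mul.wide.s16; similarly (shl i64 (zext i32 %a), 2)
// becomes a MUL_WIDE_UNSIGNED of %a and 4.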
5851
5852static bool isConstOne(const SDValue &Operand) {
5853 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5854 return Const && Const->getZExtValue() == 1;
5855}
5856
5857 static SDValue matchMADConstOnePattern(SDValue Add) {
5858 if (Add->getOpcode() != ISD::ADD)
5859 return SDValue();
5860
5861 if (isConstOne(Add->getOperand(0)))
5862 return Add->getOperand(1);
5863
5864 if (isConstOne(Add->getOperand(1)))
5865 return Add->getOperand(0);
5866
5867 return SDValue();
5868}
5869
5870 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5871 TargetLowering::DAGCombinerInfo &DCI) {
5872
5873 if (SDValue Y = matchMADConstOnePattern(Add)) {
5874 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5875 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
5876 }
5877
5878 return SDValue();
5879}
5880
5881 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5882 SDLoc DL,
5883 TargetLowering::DAGCombinerInfo &DCI) {
5884 if (Select->getOpcode() != ISD::SELECT)
5885 return SDValue();
5886
5887 SDValue Cond = Select->getOperand(0);
5888
5889 unsigned ConstOpNo;
5890 if (isConstOne(Select->getOperand(1)))
5891 ConstOpNo = 1;
5892 else if (isConstOne(Select->getOperand(2)))
5893 ConstOpNo = 2;
5894 else
5895 return SDValue();
5896
5897 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5898
5899 // Do not combine if the resulting sequence is not obviously profitable.
5900 if (!matchMADConstOnePattern(Y))
5901 return SDValue();
5902
5903 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5904
5905 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5906 (ConstOpNo == 1) ? X : NewMul,
5907 (ConstOpNo == 1) ? NewMul : X);
5908}
5909
5910static SDValue
5911 PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5912 TargetLowering::DAGCombinerInfo &DCI) {
5913
5914 EVT VT = N0.getValueType();
5915 if (VT.isVector())
5916 return SDValue();
5917
5918 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
5919 return SDValue();
5920
5921 SDLoc DL(N);
5922
5923 // (mul x, (add y, 1)) -> (add (mul x, y), x)
5924 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
5925 return Res;
5926 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
5927 return Res;
5928
5929 // (mul x, (select y, 1)) -> (select (mul x, y), x)
5930 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
5931 return Res;
5932 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
5933 return Res;
5934
5935 return SDValue();
5936}
5937
5938/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5939 static SDValue PerformMULCombine(SDNode *N,
5940 TargetLowering::DAGCombinerInfo &DCI,
5941 CodeGenOptLevel OptLevel) {
5942 if (OptLevel == CodeGenOptLevel::None)
5943 return SDValue();
5944
5945 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5946 return Ret;
5947
5948 SDValue N0 = N->getOperand(0);
5949 SDValue N1 = N->getOperand(1);
5950 return PerformMULCombineWithOperands(N, N0, N1, DCI);
5951}
5952
5953/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5954 static SDValue PerformSHLCombine(SDNode *N,
5955 TargetLowering::DAGCombinerInfo &DCI,
5956 CodeGenOptLevel OptLevel) {
5957 if (OptLevel > CodeGenOptLevel::None) {
5958 // Try mul.wide combining at OptLevel > 0
5959 if (SDValue Ret = TryMULWIDECombine(N, DCI))
5960 return Ret;
5961 }
5962
5963 return SDValue();
5964}
5965
5966 static SDValue PerformSETCCCombine(SDNode *N,
5967 TargetLowering::DAGCombinerInfo &DCI,
5968 unsigned int SmVersion) {
5969 EVT CCType = N->getValueType(0);
5970 SDValue A = N->getOperand(0);
5971 SDValue B = N->getOperand(1);
5972
5973 EVT AType = A.getValueType();
5974 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
5975 return SDValue();
5976
5977 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
5978 return SDValue();
5979
5980 SDLoc DL(N);
5981 // setp.f16x2 returns two scalar predicates, which we need to
5982 // convert back to v2i1. The returned result will be scalarized by
5983 // the legalizer, but the comparison will remain a single vector
5984 // instruction.
5985 SDValue CCNode = DCI.DAG.getNode(
5986 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
5987 : NVPTXISD::SETP_BF16X2,
5988 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
5989 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
5990 CCNode.getValue(1));
5991}
5992
5993 static SDValue PerformEXTRACTCombine(SDNode *N,
5994 TargetLowering::DAGCombinerInfo &DCI) {
5995 SDValue Vector = N->getOperand(0);
5996 if (Vector->getOpcode() == ISD::FREEZE)
5997 Vector = Vector->getOperand(0);
5998 SDLoc DL(N);
5999 EVT VectorVT = Vector.getValueType();
6000 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
6001 IsPTXVectorType(VectorVT.getSimpleVT()))
6002 return SDValue(); // Native vector loads already combine nicely w/
6003 // extract_vector_elt.
6004 // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
6005 // we already handle them OK.
6006 if (VectorVT.getVectorNumElements() == 1 ||
6007 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
6008 return SDValue();
6009
6010 // Don't mess with undef values as sra may be simplified to 0, not undef.
6011 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
6012 return SDValue();
6013
6014 uint64_t VectorBits = VectorVT.getSizeInBits();
6015 // We only handle the types we can extract in-register.
6016 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
6017 return SDValue();
6018
6019 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
6020 // Index == 0 is handled by generic DAG combiner.
6021 if (!Index || Index->getZExtValue() == 0)
6022 return SDValue();
6023
6024 MVT IVT = MVT::getIntegerVT(VectorBits);
6025 EVT EltVT = VectorVT.getVectorElementType();
6026 EVT EltIVT = EltVT.changeTypeToInteger();
6027 uint64_t EltBits = EltVT.getScalarSizeInBits();
6028
6029 SDValue Result = DCI.DAG.getNode(
6030 ISD::TRUNCATE, DL, EltIVT,
6031 DCI.DAG.getNode(
6032 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
6033 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
6034
6035 // If element has non-integer type, bitcast it back to the expected type.
6036 if (EltVT != EltIVT)
6037 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
6038 // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
6039 if (EltVT != N->getValueType(0))
6040 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
6041
6042 return Result;
6043}
6044
6045 static SDValue PerformVSELECTCombine(SDNode *N,
6046 TargetLowering::DAGCombinerInfo &DCI) {
6047 SDValue VA = N->getOperand(1);
6048 EVT VectorVT = VA.getValueType();
6049 if (VectorVT != MVT::v4i8)
6050 return SDValue();
6051
6052 // We need to split the vselect into individual per-element operations. Because
6053 // we use BFE/BFI instructions for byte extraction/insertion, we end up with
6054 // 32-bit values anyway, so we may as well do the comparison as i32 to avoid the
6055 // conversions to/from i16 normally used for i8 values.
6056 SmallVector<SDValue, 4> E;
6057 SDLoc DL(N);
6058 SDValue VCond = N->getOperand(0);
6059 SDValue VB = N->getOperand(2);
6060 for (int I = 0; I < 4; ++I) {
6061 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6062 DCI.DAG.getConstant(I, DL, MVT::i32));
6063 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6064 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6065 DCI.DAG.getConstant(I, DL, MVT::i32)),
6066 DL, MVT::i32);
6067 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6068 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6069 DCI.DAG.getConstant(I, DL, MVT::i32)),
6070 DL, MVT::i32);
6071 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6072 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6073 }
6074 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6075}
6076
6077static SDValue
6078 PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
6079 auto VT = N->getValueType(0);
6080 if (!DCI.isAfterLegalizeDAG() ||
6081 // only process v2*16 types
6082 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6083 VT.getVectorNumElements() == 2))
6084 return SDValue();
6085
6086 auto Op0 = N->getOperand(0);
6087 auto Op1 = N->getOperand(1);
6088
6089 // Start out by assuming we want to take the lower 2 bytes of each i32
6090 // operand.
6091 uint64_t Op0Bytes = 0x10;
6092 uint64_t Op1Bytes = 0x54;
6093
6094 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6095 {&Op1, &Op1Bytes}};
6096
6097 // Check that each operand is an i16, truncated from an i32 operand. We'll
6098 // select individual bytes from those original operands. Optionally, fold in a
6099 // shift right of that original operand.
6100 for (auto &[Op, OpBytes] : OpData) {
6101 // Eat up any bitcast
6102 if (Op->getOpcode() == ISD::BITCAST)
6103 *Op = Op->getOperand(0);
6104
6105 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6106 Op->getOperand(0).getValueType() == MVT::i32))
6107 return SDValue();
6108
6109 // If the truncate has multiple uses, this optimization can increase
6110 // register pressure
6111 if (!Op->hasOneUse())
6112 return SDValue();
6113
6114 *Op = Op->getOperand(0);
6115
6116 // Optionally, fold in a shift-right of the original operand and let permute
6117 // pick the two higher bytes of the original value directly.
6118 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6119 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6120 // Shift the PRMT byte selector to pick upper bytes from each respective
6121 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6122 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6123 "PRMT selector values out of range");
6124 *OpBytes += 0x22;
6125 *Op = Op->getOperand(0);
6126 }
6127 }
6128 }
6129
6130 SDLoc DL(N);
6131 auto &DAG = DCI.DAG;
6132
6133 auto PRMT =
6134 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6135 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6136 return DAG.getBitcast(VT, PRMT);
6137}
6138
6139 static SDValue combineADDRSPACECAST(SDNode *N,
6140 TargetLowering::DAGCombinerInfo &DCI) {
6141 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6142
6143 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
6144 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6145
6146 // Fold asc[B -> A](asc[A -> B](x)) -> x
6147 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6148 return ASCN2->getOperand(0);
6149 }
6150
6151 return SDValue();
6152}
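// Example: (addrspacecast (addrspacecast %p global -> generic) generic ->
// global) folds back to %p, since the pointer round-trips to the address
// space it started in.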
6153
6154// Given a constant selector value and a prmt mode, return the selector value
6155// normalized to the generic prmt mode. See the PTX ISA documentation for more
6156// details:
6157// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
6158static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6159 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6160
6162 return Selector;
6163
6164 const unsigned V = Selector.trunc(2).getZExtValue();
6165
6166 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6167 unsigned S3) {
6168 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6169 };
6170
6171 switch (Mode) {
6172 case NVPTX::PTXPrmtMode::F4E:
6173 return GetSelector(V, V + 1, V + 2, V + 3);
6174 case NVPTX::PTXPrmtMode::B4E:
6175 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6176 case NVPTX::PTXPrmtMode::RC8:
6177 return GetSelector(V, V, V, V);
6178 case NVPTX::PTXPrmtMode::ECL:
6179 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6180 case NVPTX::PTXPrmtMode::ECR:
6181 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6182 case NVPTX::PTXPrmtMode::RC16: {
6183 unsigned V1 = (V & 1) << 1;
6184 return GetSelector(V1, V1 + 1, V1, V1 + 1);
6185 }
6186 default:
6187 llvm_unreachable("Invalid PRMT mode");
6188 }
6189}
6190
6191static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6192 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6193 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6194 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6195 APInt BitField = B.concat(A);
6196 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6197 APInt Result(32, 0);
6198 for (unsigned I : llvm::seq(4U)) {
6199 APInt Sel = SelectorVal.extractBits(4, I * 4);
6200 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6201 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6202 APInt Byte = BitField.extractBits(8, Idx * 8);
6203 if (Sign)
6204 Byte = Byte.ashr(8);
6205 Result.insertBits(Byte, I * 8);
6206 }
6207 return Result;
6208}
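// Worked example in the generic mode: with A = 0x33221100, B = 0x77665544
// and selector 0x5410, the byte field {b,a} is 0x7766554433221100; selector
// digits 0, 1, 4, 5 pick bytes 0x00, 0x11, 0x44, 0x55, so the result is
// 0x55441100 (the low halves of A and B packed together, the same selector
// the BUILD_VECTOR combine above builds from 0x10 and 0x54).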
6209
6210 static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
6211 CodeGenOptLevel OptLevel) {
6212 if (OptLevel == CodeGenOptLevel::None)
6213 return SDValue();
6214
6215 // Constant fold PRMT
6216 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6217 isa<ConstantSDNode>(N->getOperand(1)) &&
6218 isa<ConstantSDNode>(N->getOperand(2)))
6219 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6220 N->getConstantOperandAPInt(1),
6221 N->getConstantOperandAPInt(2),
6222 N->getConstantOperandVal(3)),
6223 SDLoc(N), N->getValueType(0));
6224 return SDValue();
6225}
6226
6227// During call lowering we wrap the return values in a ProxyReg node which
6228// depend on the chain value produced by the completed call. This ensures that
6229// the full call is emitted in cases where libcalls are used to legalize
6230// operations. To improve the functioning of other DAG combines we pull all
6231// operations we can through one of these nodes, ensuring that the ProxyReg
6232// directly wraps a load. That is:
6233//
6234// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6235//
6236 static SDValue sinkProxyReg(SDValue R, SDValue Chain,
6237 TargetLowering::DAGCombinerInfo &DCI) {
6238 switch (R.getOpcode()) {
6239 case ISD::TRUNCATE:
6240 case ISD::ANY_EXTEND:
6241 case ISD::SIGN_EXTEND:
6242 case ISD::ZERO_EXTEND:
6243 case ISD::BITCAST: {
6244 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6245 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6246 return SDValue();
6247 }
6248 case ISD::SHL:
6249 case ISD::SRL:
6250 case ISD::SRA:
6251 case ISD::OR: {
6252 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6253 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6254 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6255 return SDValue();
6256 }
6257 case ISD::Constant:
6258 return R;
6259 case ISD::LOAD:
6260 case NVPTXISD::LoadV2:
6261 case NVPTXISD::LoadV4: {
6262 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6263 {Chain, R});
6264 }
6265 case ISD::BUILD_VECTOR: {
6266 if (DCI.isBeforeLegalize())
6267 return SDValue();
6268
6269 SmallVector<SDValue, 4> Ops;
6270 for (auto &Op : R->ops()) {
6271 SDValue V = sinkProxyReg(Op, Chain, DCI);
6272 if (!V)
6273 return SDValue();
6274 Ops.push_back(V);
6275 }
6276 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
6277 }
6278 case ISD::EXTRACT_VECTOR_ELT: {
6279 if (DCI.isBeforeLegalize())
6280 return SDValue();
6281
6282 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6283 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
6284 R.getValueType(), V, R.getOperand(1));
6285 return SDValue();
6286 }
6287 default:
6288 return SDValue();
6289 }
6290}
6291
6292 static SDValue combineProxyReg(SDNode *N,
6293 TargetLowering::DAGCombinerInfo &DCI) {
6294
6295 SDValue Chain = N->getOperand(0);
6296 SDValue Reg = N->getOperand(1);
6297
6298 // If the ProxyReg is not wrapping a load, try to pull the operations through
6299 // the ProxyReg.
6300 if (Reg.getOpcode() != ISD::LOAD) {
6301 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
6302 return V;
6303 }
6304
6305 return SDValue();
6306}
6307
6308SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6309 DAGCombinerInfo &DCI) const {
6310 CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6311 switch (N->getOpcode()) {
6312 default:
6313 break;
6314 case ISD::ADD:
6315 return PerformADDCombine(N, DCI, OptLevel);
6316 case ISD::ADDRSPACECAST:
6317 return combineADDRSPACECAST(N, DCI);
6318 case ISD::SIGN_EXTEND:
6319 case ISD::ZERO_EXTEND:
6320 return combineMulWide(N, DCI, OptLevel);
6321 case ISD::BUILD_VECTOR:
6322 return PerformBUILD_VECTORCombine(N, DCI);
6323 case ISD::EXTRACT_VECTOR_ELT:
6324 return PerformEXTRACTCombine(N, DCI);
6325 case ISD::FADD:
6326 return PerformFADDCombine(N, DCI, OptLevel);
6327 case ISD::FMAXNUM:
6328 case ISD::FMINNUM:
6329 case ISD::FMAXIMUM:
6330 case ISD::FMINIMUM:
6331 case ISD::FMAXIMUMNUM:
6332 case ISD::FMINIMUMNUM:
6333 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
6334 STI.getSmVersion());
6335 case ISD::LOAD:
6336 case NVPTXISD::LoadV2:
6337 case NVPTXISD::LoadV4:
6338 return combineLOAD(N, DCI, STI);
6339 case ISD::MUL:
6340 return PerformMULCombine(N, DCI, OptLevel);
6341 case NVPTXISD::PRMT:
6342 return combinePRMT(N, DCI, OptLevel);
6343 case NVPTXISD::ProxyReg:
6344 return combineProxyReg(N, DCI);
6345 case ISD::SETCC:
6346 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6347 case ISD::SHL:
6348 return PerformSHLCombine(N, DCI, OptLevel);
6349 case ISD::SREM:
6350 case ISD::UREM:
6351 return PerformREMCombine(N, DCI, OptLevel);
6352 case ISD::STORE:
6353 case NVPTXISD::StoreV2:
6354 case NVPTXISD::StoreV4:
6355 return combineSTORE(N, DCI, STI);
6356 case ISD::VSELECT:
6357 return PerformVSELECTCombine(N, DCI);
6358 }
6359 return SDValue();
6360}
6361
6362 static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
6363 SmallVectorImpl<SDValue> &Results) {
6364 // Handle bitcasting to v2i8 without hitting the default promotion
6365 // strategy which goes through stack memory.
6366 SDValue Op(Node, 0);
6367 EVT ToVT = Op->getValueType(0);
6368 if (ToVT != MVT::v2i8) {
6369 return;
6370 }
6371
6372 // Bitcast to i16 and unpack elements into a vector
6373 SDLoc DL(Node);
6374 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
6375 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
6376 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
6377 SDValue Vec1 =
6378 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6379 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
6380 Results.push_back(
6381 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
6382}
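// Example: (bitcast i16 0x1234 to v2i8) becomes the vector <0x34, 0x12>,
// element 0 holding the low byte and element 1 the high byte, with no
// round-trip through stack memory.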
6383
6384 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6385 SmallVectorImpl<SDValue> &Results) {
6386 SDValue Chain = N->getOperand(0);
6387 SDValue Intrin = N->getOperand(1);
6388 SDLoc DL(N);
6389
6390 // Get the intrinsic ID
6391 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6392 switch (IntrinNo) {
6393 default:
6394 return;
6395 case Intrinsic::nvvm_ldu_global_i:
6396 case Intrinsic::nvvm_ldu_global_f:
6397 case Intrinsic::nvvm_ldu_global_p: {
6398 EVT ResVT = N->getValueType(0);
6399
6400 if (ResVT.isVector()) {
6401 // Vector LDG/LDU
6402
6403 unsigned NumElts = ResVT.getVectorNumElements();
6404 EVT EltVT = ResVT.getVectorElementType();
6405
6406 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6407 // legalization.
6408 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6409 // loaded type to i16 and propagate the "real" type as the memory type.
6410 bool NeedTrunc = false;
6411 if (EltVT.getSizeInBits() < 16) {
6412 EltVT = MVT::i16;
6413 NeedTrunc = true;
6414 }
6415
6416 unsigned Opcode = 0;
6417 SDVTList LdResVTs;
6418
6419 switch (NumElts) {
6420 default:
6421 return;
6422 case 2:
6423 Opcode = NVPTXISD::LDUV2;
6424 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6425 break;
6426 case 4: {
6427 Opcode = NVPTXISD::LDUV4;
6428 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6429 LdResVTs = DAG.getVTList(ListVTs);
6430 break;
6431 }
6432 }
6433
6434 SmallVector<SDValue, 8> OtherOps;
6435
6436 // Copy regular operands
6437
6438 OtherOps.push_back(Chain); // Chain
6439 // Skip operand 1 (intrinsic ID)
6440 // Others
6441 OtherOps.append(N->op_begin() + 2, N->op_end());
6442
6443 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6444
6445 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6446 MemSD->getMemoryVT(),
6447 MemSD->getMemOperand());
6448
6449 SmallVector<SDValue, 4> ScalarRes;
6450
6451 for (unsigned i = 0; i < NumElts; ++i) {
6452 SDValue Res = NewLD.getValue(i);
6453 if (NeedTrunc)
6454 Res =
6455 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6456 ScalarRes.push_back(Res);
6457 }
6458
6459 SDValue LoadChain = NewLD.getValue(NumElts);
6460
6461 SDValue BuildVec =
6462 DAG.getBuildVector(ResVT, DL, ScalarRes);
6463
6464 Results.push_back(BuildVec);
6465 Results.push_back(LoadChain);
6466 } else {
6467 // i8 LDG/LDU
6468 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6469 "Custom handling of non-i8 ldu/ldg?");
6470
6471 // Just copy all operands as-is
6472 SmallVector<SDValue, 4> Ops(N->ops());
6473
6474 // Force output to i16
6475 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6476
6477 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6478
6479 // We make sure the memory type is i8, which will be used during isel
6480 // to select the proper instruction.
6481 SDValue NewLD =
6482 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
6483 MVT::i8, MemSD->getMemOperand());
6484
6485 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6486 NewLD.getValue(0)));
6487 Results.push_back(NewLD.getValue(1));
6488 }
6489 return;
6490 }
6491
6492 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
6493 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
6494 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
6495 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
6496 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
6497 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
6498 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
6499 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
6500 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
6501 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
6502 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
6503 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
6504 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
6505 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
6506 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
6507 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
6508 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
6509 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
6510 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
6511 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
6512 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
6513 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
6514 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
6515 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
6516 if (auto Res = lowerTcgen05Ld(N, DAG)) {
6517 Results.push_back(Res->first);
6518 Results.push_back(Res->second);
6519 }
6520 return;
6521
6522 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
6523 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
6524 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
6525 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
6526 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
6527 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
6528 if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
6529 Results.push_back(Res->first);
6530 Results.push_back(Res->second);
6531 }
6532 return;
6533 }
6534}
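// --- Editor's illustrative sketch (not part of NVPTXISelLowering.cpp) ---
// For sub-16-bit elements the LDU/LDG path above loads each lane as i16
// (the "real" element type is only kept as the memory type) and TRUNCATEs
// afterwards. Packing four such widened lanes back into a v4i8 value is,
// at the value level (helper name is the editor's own):
#include <cstdint>
static uint32_t packV4I8(const uint16_t WidenedLanes[4]) {
  uint32_t Vec = 0;
  for (unsigned I = 0; I < 4; ++I) {
    const uint8_t Lane = static_cast<uint8_t>(WidenedLanes[I]); // per-lane TRUNCATE
    Vec |= static_cast<uint32_t>(Lane) << (I * 8);              // BUILD_VECTOR lane I
  }
  return Vec; // packed little-endian v4i8
}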
6535
6536static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6537 SmallVectorImpl<SDValue> &Results) {
6538 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6539 // result so that it can pass the legalization
6540 SDLoc DL(N);
6541 SDValue Chain = N->getOperand(0);
6542 SDValue Reg = N->getOperand(1);
6543 SDValue Glue = N->getOperand(2);
6544
6545 assert(Reg.getValueType() == MVT::i128 &&
6546 "Custom lowering for CopyFromReg with 128-bit reg only");
6547 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6548 N->getValueType(2)};
6549 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6550
6551 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6552 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6553 {NewValue.getValue(0), NewValue.getValue(1)});
6554
6555 Results.push_back(Pair);
6556 Results.push_back(NewValue.getValue(2));
6557 Results.push_back(NewValue.getValue(3));
6558}
6559
6560static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
6561 const TargetLowering &TLI,
6562 SmallVectorImpl<SDValue> &Results) {
6563 SDValue Chain = N->getOperand(0);
6564 SDValue Reg = N->getOperand(1);
6565
6566 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
6567
6568 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
6569 SDValue NewProxy =
6570 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
6571 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
6572
6573 Results.push_back(Res);
6574}
6575
6576static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
6577 const NVPTXSubtarget &STI,
6578 SmallVectorImpl<SDValue> &Results) {
6579 assert(N->getValueType(0) == MVT::i128 &&
6580 "Custom lowering for atomic128 only supports i128");
6581
6582 AtomicSDNode *AN = cast<AtomicSDNode>(N);
6583 SDLoc dl(N);
6584
6585 if (!STI.hasAtomSwap128()) {
6586 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
6587 DAG.getMachineFunction().getFunction(),
6588 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
6589 "requires target sm_90.",
6590 dl.getDebugLoc()));
6591
6592 Results.push_back(DAG.getUNDEF(MVT::i128));
6593 Results.push_back(AN->getOperand(0)); // Chain
6594 return;
6595 }
6596
6598 Ops.push_back(AN->getOperand(0)); // Chain
6599 Ops.push_back(AN->getOperand(1)); // Ptr
6600 for (const auto &Op : AN->ops().drop_front(2)) {
6601 // Low part
6602 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6603 DAG.getIntPtrConstant(0, dl)));
6604 // High part
6605 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6606 DAG.getIntPtrConstant(1, dl)));
6607 }
6608 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
6611 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
6612 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
6613 AN->getMemOperand());
6614 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
6615 {Result.getValue(0), Result.getValue(1)}));
6616 Results.push_back(Result.getValue(2));
6617}
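// --- Editor's illustrative sketch (not part of NVPTXISelLowering.cpp) ---
// replaceAtomicSwap128 (and ReplaceCopyFromReg_128 above) split each i128
// value into lo/hi i64 halves with EXTRACT_ELEMENT and glue the two i64
// results back together with ISD::BUILD_PAIR. The value-level round trip,
// assuming a compiler with __int128 support (Clang/GCC on a 64-bit host):
#include <cstdint>
using u128 = unsigned __int128;
static inline void split128(u128 V, uint64_t &Lo, uint64_t &Hi) {
  Lo = static_cast<uint64_t>(V);       // EXTRACT_ELEMENT index 0
  Hi = static_cast<uint64_t>(V >> 64); // EXTRACT_ELEMENT index 1
}
static inline u128 buildPair128(uint64_t Lo, uint64_t Hi) {
  return (static_cast<u128>(Hi) << 64) | Lo; // ISD::BUILD_PAIR
}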
6618
6619void NVPTXTargetLowering::ReplaceNodeResults(
6620 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6621 switch (N->getOpcode()) {
6622 default:
6623 report_fatal_error("Unhandled custom legalization");
6624 case ISD::BITCAST:
6625 ReplaceBITCAST(N, DAG, Results);
6626 return;
6627 case ISD::LOAD:
6628 replaceLoadVector(N, DAG, Results, STI);
6629 return;
6630 case ISD::INTRINSIC_W_CHAIN:
6631 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6632 return;
6633 case ISD::CopyFromReg:
6634 ReplaceCopyFromReg_128(N, DAG, Results);
6635 return;
6636 case NVPTXISD::ProxyReg:
6637 replaceProxyReg(N, DAG, *this, Results);
6638 return;
6639 case ISD::ATOMIC_CMP_SWAP:
6640 case ISD::ATOMIC_SWAP:
6641 replaceAtomicSwap128(N, DAG, STI, Results);
6642 return;
6643 }
6644}
6645
6646NVPTXTargetLowering::AtomicExpansionKind
6647NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6648 Type *Ty = AI->getValOperand()->getType();
6649
6650 if (AI->isFloatingPointOperation()) {
6652 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6653 STI.getPTXVersion() >= 63)
6655 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6656 STI.getPTXVersion() >= 78)
6658 if (Ty->isFloatTy())
6660 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6662 }
6664 }
6665
6666 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6667 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
6668
6669 switch (AI->getOperation()) {
6670 default:
6673 if (BitWidth == 128)
6679 switch (BitWidth) {
6680 case 8:
6681 case 16:
6683 case 32:
6685 case 64:
6686 if (STI.hasAtomBitwise64())
6689 case 128:
6691 default:
6692 llvm_unreachable("unsupported width encountered");
6693 }
6700 switch (BitWidth) {
6701 case 8:
6702 case 16:
6704 case 32:
6706 case 64:
6707 if (STI.hasAtomMinMax64())
6710 case 128:
6712 default:
6713 llvm_unreachable("unsupported width encountered");
6714 }
6717 switch (BitWidth) {
6718 case 32:
6720 case 8:
6721 case 16:
6722 case 64:
6723 case 128:
6725 default:
6726 llvm_unreachable("unsupported width encountered");
6727 }
6728 }
6729
6731}
6732
6733bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
6734 const Instruction *I) const {
6735 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6736 // When CAS bitwidth is not supported on the hardware, the CAS is emulated
6737 // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
6738 // the memory order using explicit fences around the retry loop.
6739 // The memory order of natively supported CAS operations can be enforced
6740 // by lowering to an atom.cas with the right memory synchronizing effect.
6741 // However, atom.cas only supports relaxed, acquire, release and acq_rel.
6742 // So we also use explicit fences for enforcing memory order for
6743 // seq_cst CAS with natively-supported bitwidths.
6744 return CI &&
6745 (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
6746 STI.getMinCmpXchgSizeInBits() ||
6747 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
6748}
6749
6750AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
6751 const Instruction *I) const {
6752 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6753 bool BitwidthSupportedAndIsSeqCst =
6754 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
6755 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
6756 STI.getMinCmpXchgSizeInBits();
6757 return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
6759}
6760
6761Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
6762 Instruction *Inst,
6763 AtomicOrdering Ord) const {
6764 if (!isa<AtomicCmpXchgInst>(Inst))
6765 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
6766
6767 // Specialize for cmpxchg
6768 // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
6769 SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
6770 if (isReleaseOrStronger(Ord))
6771 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
6772 ? Ord
6773 : AtomicOrdering::Release,
6774 SSID);
6775
6776 return nullptr;
6777}
6778
6779Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
6780 Instruction *Inst,
6781 AtomicOrdering Ord) const {
6782 // Specialize for cmpxchg
6783 if (!isa<AtomicCmpXchgInst>(Inst))
6784 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
6785
6786 auto *CI = cast<AtomicCmpXchgInst>(Inst);
6787 auto CASWidth =
6788 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
6789 SyncScope::ID SSID = CI->getSyncScopeID();
6790 // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
6791 if (isAcquireOrStronger(Ord) &&
6792 (Ord != AtomicOrdering::SequentiallyConsistent ||
6793 CASWidth < STI.getMinCmpXchgSizeInBits()))
6794 return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
6795
6796 return nullptr;
6797}
6798
6799// Rather than default to SINT when both UINT and SINT are custom, we only
6800// change the opcode when UINT is not legal and SINT is. UINT is preferred when
6801// both are custom since unsigned CVT instructions can lead to slightly better
6802// SASS code with fewer instructions.
6803unsigned NVPTXTargetLowering::getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
6804 EVT ToVT) const {
6805 if (isOperationLegal(Op, ToVT))
6806 return Op;
6807 switch (Op) {
6808 case ISD::FP_TO_UINT:
6809 if (isOperationLegal(ISD::FP_TO_SINT, ToVT))
6810 return ISD::FP_TO_SINT;
6811 break;
6812 case ISD::STRICT_FP_TO_UINT:
6813 if (isOperationLegal(ISD::STRICT_FP_TO_SINT, ToVT))
6814 return ISD::STRICT_FP_TO_SINT;
6815 break;
6816 case ISD::VP_FP_TO_UINT:
6817 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
6818 return ISD::VP_FP_TO_SINT;
6819 break;
6820 default:
6821 break;
6822 }
6823 return Op;
6824}
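// --- Editor's illustrative sketch (not part of NVPTXISelLowering.cpp) ---
// A minimal model of the preference implemented by the hook above: keep the
// unsigned conversion whenever it is legal for the result type, and only
// switch to the signed form when the unsigned one is not legal but the
// signed one is. (Names below are hypothetical; the real hook queries
// isOperationLegal on ISD opcodes.)
enum class FpToIntCvt { ToUnsigned, ToSigned };
static FpToIntCvt preferFpToInt(bool UnsignedLegal, bool SignedLegal) {
  if (UnsignedLegal)
    return FpToIntCvt::ToUnsigned; // unsigned CVT tends to give tighter SASS
  return SignedLegal ? FpToIntCvt::ToSigned : FpToIntCvt::ToUnsigned;
}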
6825
6826// Pin NVPTXTargetObjectFile's vtables to this file.
6827NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
6828
6829MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
6830 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
6831 return getDataSection();
6832}
6833
6834static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
6835 const SelectionDAG &DAG, unsigned Depth) {
6836 SDValue A = Op.getOperand(0);
6837 SDValue B = Op.getOperand(1);
6838 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6839 unsigned Mode = Op.getConstantOperandVal(3);
6840
6841 if (!Selector)
6842 return;
6843
6844 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
6845 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
6846
6847 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6848 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
6849 "PRMT must have i32 operands");
6850 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
6851 KnownBits BitField = BKnown.concat(AKnown);
6852
6853 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
6854 for (unsigned I : llvm::seq(4)) {
6855 APInt Sel = SelectorVal.extractBits(4, I * 4);
6856 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6857 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6858 KnownBits Byte = BitField.extractBits(8, Idx * 8);
6859 if (Sign)
6860 Byte = KnownBits::ashr(Byte, 8);
6861 Known.insertBits(Byte, I * 8);
6862 }
6863}
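// --- Editor's illustrative sketch (not part of NVPTXISelLowering.cpp) ---
// The known-bits walk above models PTX prmt's generic mode: the two i32
// operands form an 8-byte field {b, a}; each 4-bit selector nibble picks a
// source byte with its low 3 bits and, if bit 3 is set, replicates that
// byte's sign bit across the output byte. A plain-integer emulation
// (helper name is the editor's own):
#include <cstdint>
static uint32_t prmtGeneric(uint32_t A, uint32_t B, uint16_t Selector) {
  const uint64_t Field = (static_cast<uint64_t>(B) << 32) | A; // {b7..b4, b3..b0}
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    const unsigned Sel = (Selector >> (I * 4)) & 0xF;
    uint8_t Byte = static_cast<uint8_t>(Field >> ((Sel & 0x7) * 8));
    if (Sel & 0x8) // sign-replicate: fill the byte with its own MSB
      Byte = static_cast<uint8_t>(static_cast<int8_t>(Byte) >> 7);
    Result |= static_cast<uint32_t>(Byte) << (I * 8);
  }
  return Result;
}
// E.g. prmtGeneric(A, B, 0x3210) == A and prmtGeneric(A, B, 0x7654) == B,
// which is the identity-selector fold used by simplifyDemandedBitsForPRMT
// further down.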
6864
6865static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
6866 const MemSDNode *LD = cast<MemSDNode>(Op);
6867
6868 // We can't do anything without knowing the sign bit.
6869 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
6870 if (ExtType == ISD::SEXTLOAD)
6871 return;
6872
6873 // ExtLoading to vector types is weird and may not work well with known bits.
6874 auto DestVT = LD->getValueType(0);
6875 if (DestVT.isVector())
6876 return;
6877
6878 assert(Known.getBitWidth() == DestVT.getSizeInBits());
6879 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
6880 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
6881}
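// --- Editor's illustrative sketch (not part of NVPTXISelLowering.cpp) ---
// For a non-sign-extending LoadV, each scalar result only carries as many
// meaningful bits as the in-memory element width, so the helper above marks
// the remaining high bits as known zero. The resulting mask for a 32-bit
// result (helper name is the editor's own):
#include <cstdint>
static constexpr uint32_t knownZeroMask32(unsigned ElementBits) {
  return ElementBits >= 32 ? 0u : ~uint32_t{0} << ElementBits;
}
static_assert(knownZeroMask32(8) == 0xFFFFFF00u, "i8 element widened to i32");
static_assert(knownZeroMask32(16) == 0xFFFF0000u, "i16 element widened to i32");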
6882
6883void NVPTXTargetLowering::computeKnownBitsForTargetNode(
6884 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
6885 const SelectionDAG &DAG, unsigned Depth) const {
6886 Known.resetAll();
6887
6888 switch (Op.getOpcode()) {
6889 case NVPTXISD::PRMT:
6890 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
6891 break;
6892 case NVPTXISD::LoadV2:
6893 case NVPTXISD::LoadV4:
6894 case NVPTXISD::LoadV8:
6895 computeKnownBitsForLoadV(Op, Known);
6896 break;
6897 default:
6898 break;
6899 }
6900}
6901
6902static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
6903 const APInt &DemandedBits) {
6904 APInt DemandedLHS = APInt(32, 0);
6905 APInt DemandedRHS = APInt(32, 0);
6906
6907 for (unsigned I : llvm::seq(4)) {
6908 if (DemandedBits.extractBits(8, I * 8).isZero())
6909 continue;
6910
6911 APInt Sel = SelectorVal.extractBits(4, I * 4);
6912 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6913 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6914
6915 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
6916 unsigned ByteStart = (Idx % 4) * 8;
6917 if (Sign)
6918 Src.setBit(ByteStart + 7);
6919 else
6920 Src.setBits(ByteStart, ByteStart + 8);
6921 }
6922
6923 return {DemandedLHS, DemandedRHS};
6924}
6925
6926// Replace undef with 0 as this is easier for other optimizations such as
6927// known bits.
6928static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
6929 if (!Op)
6930 return SDValue();
6931 if (Op.isUndef())
6932 return DAG.getConstant(0, SDLoc(), MVT::i32);
6933 return Op;
6934}
6935
6936static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
6937 const APInt &DemandedBits,
6938 SelectionDAG &DAG,
6939 const TargetLowering &TLI,
6940 unsigned Depth) {
6941 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
6942 SDValue Op0 = PRMT.getOperand(0);
6943 SDValue Op1 = PRMT.getOperand(1);
6944 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
6945 if (!SelectorConst)
6946 return SDValue();
6947
6948 unsigned Mode = PRMT.getConstantOperandVal(3);
6949 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
6950
6951 // Try to simplify the PRMT to one of the inputs if the used bytes are all
6952 // from the same input in the correct order.
6953 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
6954 const unsigned SelBits = (4 - LeadingBytes) * 4;
6955 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
6956 return Op0;
6957 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
6958 return Op1;
6959
6960 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
6961
6962 // Attempt to avoid multi-use ops if we don't need anything from them.
6963 SDValue DemandedOp0 =
6964 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
6965 SDValue DemandedOp1 =
6966 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
6967
6968 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
6969 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
6970 if ((DemandedOp0 && DemandedOp0 != Op0) ||
6971 (DemandedOp1 && DemandedOp1 != Op1)) {
6972 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
6973 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
6974 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
6975 }
6976
6977 return SDValue();
6978}
6979
6980bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
6981 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
6982 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
6983 Known.resetAll();
6984
6985 switch (Op.getOpcode()) {
6986 case NVPTXISD::PRMT:
6987 if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG,
6988 *this, Depth)) {
6989 TLO.CombineTo(Op, Result);
6990 return true;
6991 }
6992 break;
6993 default:
6994 break;
6995 }
6996
6997 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
6998 return false;
6999}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition Compiler.h:404
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
static std::optional< NVPTXISD::NodeType > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1330
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1130
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition ArrayRef.h:156
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
This class represents a function call, abstracting a target machine's calling convention.
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:521
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3155
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition ISDOpcodes.h:270
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SSUBO
Same as [SU]ADDO, but for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:379
@ SMULO
Same as [SU]ADDO, but for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
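A short sketch of how these generic opcodes are consumed when building or combining DAG nodes. It assumes a SelectionDAG &DAG, an SDLoc DL, two f16 SDValues A and B, and an i16 SDValue I16 are already in scope:

  // Pack two f16 scalars into a v2f16, pull one element back out, and widen
  // an i16 to i32 -- all expressed with the generic ISD opcodes listed above.
  SDValue Vec  = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, A, B);
  SDValue Lo   = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Vec,
                             DAG.getIntPtrConstant(0, DL));
  SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, I16);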
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDEF.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CALL
This node represents a PTX call instruction.
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ UNPACK_VECTOR
This node is the inverse of NVPTX::BUILD_VECTOR.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ DeclareScalarParam
These nodes represent a parameter declaration.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ BUILD_VECTOR
This node is similar to ISD::BUILD_VECTOR except that the output may be implicitly bitcast to a scalar.
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:251
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:262
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
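Illustrative use of the range helpers referenced above (zip, enumerate, all_of) on small value-type lists; the data is made up for the sketch:

  SmallVector<EVT, 4> ValueVTs = {MVT::i32, MVT::i32, MVT::f32};
  SmallVector<uint64_t, 4> Offsets = {0, 4, 8};

  for (auto [VT, Off] : zip(ValueVTs, Offsets))   // lock-step walk
    (void)Off;

  for (auto [Idx, VT] : enumerate(ValueVTs))      // element plus its index
    (void)Idx;

  bool AllScalar = all_of(ValueVTs,               // whole-range predicate
                          [](EVT VT) { return !VT.isVector(); });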
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
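The standard casting idioms, assuming a const Value *V is in scope:

  if (const auto *CI = dyn_cast<ConstantInt>(V))  // null if V is not a ConstantInt
    (void)CI->getZExtValue();
  bool IsArg = isa<Argument>(V);                  // type query only
  const auto *I = cast<Instruction>(V);           // asserts; use only when certain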
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1948
unsigned promoteScalarArgumentSize(unsigned size)
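A brief sketch, assuming the usual NVPTX convention that scalar parameters narrower than 32 bits are widened before being declared in .param space:

  unsigned DeclaredBits = promoteScalarArgumentSize(8);  // expected: 32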
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:548
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
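Sketch: splitting an aggregate type into the EVTs and byte offsets of its leaves. It assumes a const TargetLowering &TLI, a const DataLayout &DL, and a Type *RetTy are in scope:

  SmallVector<EVT, 8> ValueVTs;
  SmallVector<TypeSize, 8> Offsets;
  ComputeValueVTs(TLI, DL, RetTy, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
  // ValueVTs[i] is the EVT of leaf i; Offsets[i] is its starting byte offset.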
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
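Illustrative use of the alignment and rounding helpers referenced in this listing:

  Align ParamAlign(8);
  uint64_t Padded = alignTo(/*Size=*/12, ParamAlign);          // 16
  Align AtOffset  = commonAlignment(ParamAlign, /*Offset=*/4); // Align(4)
  uint64_t Pow2   = PowerOf2Ceil(12);                          // 16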
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:266
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
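A few of the EVT queries above exercised together; Ctx is assumed to be an LLVMContext reference:

  EVT VT = EVT::getVectorVT(Ctx, MVT::f16, 4);
  bool Vec = VT.isVector();                      // true
  EVT Elt = VT.getVectorElementType();           // f16
  unsigned NumElts = VT.getVectorNumElements();  // 4
  EVT AsInt = VT.changeTypeToInteger();          // v4i16
  bool Wider = AsInt.bitsGT(MVT::i32);           // 64 bits > 32 bits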
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:233
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:219
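Illustrative KnownBits manipulation using the helpers above (values are made up for the sketch):

  KnownBits Lo(32), Hi(32);
  Hi.setAllZero();                       // suppose the high word is known zero
  KnownBits Full = Hi.concat(Lo);        // 64-bit value laid out as Hi:Lo
  unsigned BW = Full.getBitWidth();      // 64
  Full.insertBits(Lo, /*BitPosition=*/0);
  Full.resetAll();                       // forget everything again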
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...