//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "NVPTX.h"
#include "NVPTXISelDAGToDAG.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static cl::opt<int> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
             " 1: do it 2: do it aggressively"),
    cl::init(2));

static cl::opt<NVPTX::DivPrecisionLevel> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc(
        "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
    cl::values(
        clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
        clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
        clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2",
                   "Use IEEE Compliant F32 div.rnd if available (default)"),
        clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3",
                   "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
    cl::init(NVPTX::DivPrecisionLevel::IEEE754));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
/// does NOT use lg2.approx for log2, so this is disabled by default.
static cl::opt<bool> UseApproxLog2F32(
    "nvptx-approx-log2f32",
    cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
    cl::init(false));

static cl::opt<bool> ForceMinByValParamAlign(
    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."),
    cl::init(false));

NVPTX::DivPrecisionLevel
NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
                                    const SDNode &N) const {
  // If nvptx-prec-divf32=N is used on the command-line, always honor it
  if (UsePrecDivF32.getNumOccurrences() > 0)
    return UsePrecDivF32;

  const SDNodeFlags Flags = N.getFlags();
  if (Flags.hasApproximateFuncs())
    return NVPTX::DivPrecisionLevel::Approx;

  return NVPTX::DivPrecisionLevel::IEEE754;
}

bool NVPTXTargetLowering::usePrecSqrtF32(const SDNode *N) const {
  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
  if (UsePrecSqrtF32.getNumOccurrences() > 0)
    return UsePrecSqrtF32;

  if (N) {
    const SDNodeFlags Flags = N->getFlags();
    if (Flags.hasApproximateFuncs())
      return false;
  }

  return true;
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

154static bool IsPTXVectorType(MVT VT) {
155 switch (VT.SimpleTy) {
156 default:
157 return false;
158 case MVT::v2i1:
159 case MVT::v4i1:
160 case MVT::v2i8:
161 case MVT::v4i8:
162 case MVT::v8i8: // <2 x i8x4>
163 case MVT::v16i8: // <4 x i8x4>
164 case MVT::v2i16:
165 case MVT::v4i16:
166 case MVT::v8i16: // <4 x i16x2>
167 case MVT::v2i32:
168 case MVT::v4i32:
169 case MVT::v2i64:
170 case MVT::v2f16:
171 case MVT::v4f16:
172 case MVT::v8f16: // <4 x f16x2>
173 case MVT::v2bf16:
174 case MVT::v4bf16:
175 case MVT::v8bf16: // <4 x bf16x2>
176 case MVT::v2f32:
177 case MVT::v4f32:
178 case MVT::v2f64:
179 case MVT::v4i64:
180 case MVT::v4f64:
181 case MVT::v8i32:
182 case MVT::v8f32:
183 case MVT::v16f16: // <8 x f16x2>
184 case MVT::v16bf16: // <8 x bf16x2>
185 case MVT::v16i16: // <8 x i16x2>
186 case MVT::v32i8: // <8 x i8x4>
187 return true;
188 }
189}
190
// When legalizing vector loads/stores, this function is called, which does two
// things:
// 1. Determines whether the vector is something we want to custom lower;
//    std::nullopt is returned if we do not want to custom lower it.
// 2. If we do want to handle it, returns two parameters:
//    - unsigned int NumElts - The number of elements in the final vector
//    - EVT EltVT - The type of the elements in the final vector
static std::optional<std::pair<unsigned int, MVT>>
getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI,
                       unsigned AddressSpace) {
201 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
202
203 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
204 VectorEVT.getSizeInBits() == 256)
205 return {{4, MVT::i64}};
206
207 if (!VectorEVT.isSimple())
208 return std::nullopt;
209 const MVT VectorVT = VectorEVT.getSimpleVT();
210
211 if (!VectorVT.isVector()) {
212 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
213 return {{2, MVT::i64}};
214 return std::nullopt;
215 }
216
217 const MVT EltVT = VectorVT.getVectorElementType();
218 const unsigned NumElts = VectorVT.getVectorNumElements();
219
220 // The size of the PTX virtual register that holds a packed type.
221 unsigned PackRegSize;
222
223 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
224 // legal. We can (and should) split that into 2 stores of <2 x double> here
225 // but I'm leaving that as a TODO for now.
226 switch (VectorVT.SimpleTy) {
227 default:
228 return std::nullopt;
229
230 case MVT::v4i64:
231 case MVT::v4f64:
232 // This is a "native" vector type iff the address space is global and the
233 // target supports 256-bit loads/stores
234 if (!CanLowerTo256Bit)
235 return std::nullopt;
236 [[fallthrough]];
237 case MVT::v2i8:
238 case MVT::v2i64:
239 case MVT::v2f64:
240 // This is a "native" vector type
241 return std::pair(NumElts, EltVT);
242
243 case MVT::v16f16: // <8 x f16x2>
244 case MVT::v16bf16: // <8 x bf16x2>
245 case MVT::v16i16: // <8 x i16x2>
246 case MVT::v32i8: // <8 x i8x4>
247 // This can be upsized into a "native" vector type iff the address space is
248 // global and the target supports 256-bit loads/stores.
249 if (!CanLowerTo256Bit)
250 return std::nullopt;
251 [[fallthrough]];
252 case MVT::v2i16: // <1 x i16x2>
253 case MVT::v2f16: // <1 x f16x2>
254 case MVT::v2bf16: // <1 x bf16x2>
255 case MVT::v4i8: // <1 x i8x4>
256 case MVT::v4i16: // <2 x i16x2>
257 case MVT::v4f16: // <2 x f16x2>
258 case MVT::v4bf16: // <2 x bf16x2>
259 case MVT::v8i8: // <2 x i8x4>
260 case MVT::v8f16: // <4 x f16x2>
261 case MVT::v8bf16: // <4 x bf16x2>
262 case MVT::v8i16: // <4 x i16x2>
263 case MVT::v16i8: // <4 x i8x4>
264 PackRegSize = 32;
265 break;
266
267 case MVT::v8f32: // <4 x f32x2>
268 case MVT::v8i32: // <4 x i32x2>
269 // This is a "native" vector type iff the address space is global and the
270 // target supports 256-bit loads/stores
271 if (!CanLowerTo256Bit)
272 return std::nullopt;
273 [[fallthrough]];
274 case MVT::v2f32: // <1 x f32x2>
275 case MVT::v4f32: // <2 x f32x2>
276 case MVT::v2i32: // <1 x i32x2>
277 case MVT::v4i32: // <2 x i32x2>
278 if (!STI.hasF32x2Instructions())
279 return std::pair(NumElts, EltVT);
280 PackRegSize = 64;
281 break;
282 }
283
284 // If we reach here, then we can pack 2 or more elements into a single 32-bit
285 // or 64-bit PTX register and treat the vector as a new vector containing
286 // packed elements.
287
288 // Number of elements to pack in one word.
289 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
290
291 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
292}
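
// Worked example (added for illustration; not in the original source): for a
// load of MVT::v8f16 from the generic address space on a target without
// 256-bit vector loads, EltVT = f16, NumElts = 8, and PackRegSize = 32, so
// NPerReg = 32 / 16 = 2 and the result is {4, MVT::v2f16}: four 32-bit PTX
// registers, each holding one f16x2 pair.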
293
294/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
295/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
296/// the types as required by the calling convention (with special handling for
297/// i8s).
298/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
299/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
300/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               LLVMContext &Ctx, CallingConv::ID CallConv,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> &Offsets,
                               uint64_t StartingOffset = 0) {
306 SmallVector<EVT, 16> TempVTs;
307 SmallVector<uint64_t, 16> TempOffsets;
308 ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets,
309 StartingOffset);
310
311 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
312 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
313 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
314
315 // Since we actually can load/store b8, we need to ensure that we'll use
316 // the original sized type for any i8s or i8 vectors.
317 if (VT.getScalarType() == MVT::i8) {
318 if (RegisterVT == MVT::i16)
319 RegisterVT = MVT::i8;
320 else if (RegisterVT == MVT::v2i16)
321 RegisterVT = MVT::v2i8;
322 else
323 assert(RegisterVT == MVT::v4i8 &&
324 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
325 }
326
327 // TODO: This is horribly incorrect for cases where the vector elements are
328 // not a multiple of bytes (ex i1) and legal or i8. However, this problem
329 // has existed for as long as NVPTX has and no one has complained, so we'll
330 // leave it for now.
331 for (unsigned I : seq(NumRegs)) {
332 ValueVTs.push_back(RegisterVT);
333 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
334 }
335 }
336}
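
// Worked example (added for illustration; not in the original source): for
// Ty = { i8, i8, i32 }, ComputeValueVTs yields {i8, i8, i32} at offsets
// {0, 1, 4}; the calling-convention register type for i8 would be i16, but the
// code above switches it back to i8 so the stored size matches the IR type.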
337
338// We return an EVT that can hold N VTs
339// If the VT is a vector, the resulting EVT is a flat vector with the same
340// element type as VT's element type.
341static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
342 if (N == 1)
343 return VT;
344
345 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
346 VT.getVectorNumElements() * N)
347 : EVT::getVectorVT(C, VT, N);
348}
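
// Worked example (added for illustration; not in the original source):
// getVectorizedVT(f32, 4, Ctx) yields v4f32, while getVectorizedVT(v2f16, 2,
// Ctx) flattens to v4f16 rather than producing a vector of vectors; N == 1
// returns VT unchanged.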
349
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT,
                                         const SDLoc &dl, SelectionDAG &DAG) {
  if (V.getValueType() == VT) {
    assert(I == 0 && "Index must be 0 for scalar value");
    return V;
  }

  if (!VT.isVector())
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
                       DAG.getVectorIdxConstant(I, dl));

  return DAG.getNode(
      ISD::EXTRACT_SUBVECTOR, dl, VT, V,
      DAG.getVectorIdxConstant(I * VT.getVectorNumElements(), dl));
}
365
template <typename T>
static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
                                              SelectionDAG &DAG, T GetElement) {
  if (N == 1)
    return GetElement(0);

  SmallVector<SDValue, 8> Values;
  for (const unsigned I : llvm::seq(N)) {
    SDValue Val = GetElement(I);
    if (Val.getValueType().isVector())
      DAG.ExtractVectorElements(Val, Values);
    else
      Values.push_back(Val);
  }

  EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
                            Values.size());
  return DAG.getBuildVector(VT, dl, Values);
}
385
/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// Returns the promoted type: scalar integers are rounded up to the next
/// PTX register size; all other types are returned unchanged.
static EVT promoteScalarIntegerPTX(const EVT VT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      return MVT::i1;
    case 2:
    case 4:
    case 8:
      return MVT::i8;
    case 16:
      return MVT::i16;
    case 32:
      return MVT::i32;
    case 64:
      return MVT::i64;
    }
  }
  return VT;
}
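
// Worked example (added for illustration; not in the original source): an i1
// stays i1, an i7 argument is promoted to i8, and an i33 is promoted to i64;
// non-integer types and integers that already match a PTX register size
// (i8/i16/i32/i64) are returned unchanged.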
413
414// Check whether we can merge loads/stores of some of the pieces of a
415// flattened function parameter or return value into a single vector
416// load/store.
417//
418// The flattened parameter is represented as a list of EVTs and
419// offsets, and the whole structure is aligned to ParamAlignment. This
420// function determines whether we can load/store pieces of the
421// parameter starting at index Idx using a single vectorized op of
422// size AccessSize. If so, it returns the number of param pieces
423// covered by the vector op. Otherwise, it returns 1.
template <typename T>
static unsigned canMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
428
429 // Can't vectorize if param alignment is not sufficient.
430 if (ParamAlignment < AccessSize)
431 return 1;
432 // Can't vectorize if offset is not aligned.
433 if (Offsets[Idx] & (AccessSize - 1))
434 return 1;
435
436 EVT EltVT = ValueVTs[Idx];
437 unsigned EltSize = EltVT.getStoreSize();
438
439 // Element is too large to vectorize.
440 if (EltSize >= AccessSize)
441 return 1;
442
443 unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
445 if (AccessSize != EltSize * NumElts)
446 return 1;
447
448 // We don't have enough elements to vectorize.
449 if (Idx + NumElts > ValueVTs.size())
450 return 1;
451
452 // PTX ISA can only deal with 2- and 4-element vector ops.
453 if (NumElts != 4 && NumElts != 2)
454 return 1;
455
456 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
457 // Types do not match.
458 if (ValueVTs[j] != EltVT)
459 return 1;
460
461 // Elements are not contiguous.
462 if (Offsets[j] - Offsets[j - 1] != EltSize)
463 return 1;
464 }
465 // OK. We can vectorize ValueVTs[i..i+NumElts)
466 return NumElts;
467}
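
// Worked example (added for illustration; not in the original source): with
// ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12}, and a 16-byte
// ParamAlignment, a query at Idx = 0 with AccessSize = 16 returns 4 (one
// 128-bit access covers all four pieces); with only 8-byte alignment the same
// query returns 1 for AccessSize = 16 but 2 for AccessSize = 8.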
468
// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a list of
// chunk sizes (each 1, 2, or 4) describing how the pieces should be
// loaded/stored (i.e. as a scalar, or as part of a 2- or 4-element
// vector load/store); the chunk sizes sum to ValueVTs.size().
template <typename T>
static SmallVector<unsigned, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
                     bool IsVAArg = false) {
482 // Set vector size to match ValueVTs and mark all elements as
483 // scalars by default.
484
485 if (IsVAArg)
486 return SmallVector<unsigned>(ValueVTs.size(), 1);
487
488 SmallVector<unsigned, 16> VectorInfo;
489
490 const auto GetNumElts = [&](unsigned I) -> unsigned {
491 for (const unsigned AccessSize : {16, 8, 4, 2}) {
492 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
493 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
494 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
495 "Unexpected vectorization size");
496 if (NumElts != 1)
497 return NumElts;
498 }
499 return 1;
500 };
501
502 // Check what we can vectorize using 128/64/32-bit accesses.
503 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
504 const unsigned NumElts = GetNumElts(I);
505 VectorInfo.push_back(NumElts);
506 I += NumElts;
507 }
508 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
509 ValueVTs.size());
510 return VectorInfo;
511}
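
// Worked example (added for illustration; not in the original source): for the
// {f32, f32, f32, f32} parameter above, a 16-byte alignment yields {4} (a
// single v4 access), an 8-byte alignment yields {2, 2}, and a variadic
// argument always yields {1, 1, 1, 1} (all scalars).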
512
// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;

527 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
528 // condition branches.
529 setJumpIsExpensive(true);
530
531 // Wide divides are _very_ slow. Try to reduce the width of the divide if
532 // possible.
533 addBypassSlowDiv(64, 32);
534
  // By default, use the Source scheduling
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

541 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
542 LegalizeAction NoF16Action) {
543 bool IsOpSupported = STI.allowFP16Math();
544 switch (Op) {
545 // Several FP16 instructions are available on sm_80 only.
546 case ISD::FMINNUM:
547 case ISD::FMAXNUM:
548 case ISD::FMAXNUM_IEEE:
549 case ISD::FMINNUM_IEEE:
550 case ISD::FMAXIMUM:
551 case ISD::FMINIMUM:
552 case ISD::FMAXIMUMNUM:
553 case ISD::FMINIMUMNUM:
554 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
555 break;
556 case ISD::FEXP2:
557 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
558 break;
559 }
560 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
561 };
562
563 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
564 LegalizeAction NoBF16Action) {
565 bool IsOpSupported = STI.hasNativeBF16Support(Op);
567 Op, VT, IsOpSupported ? Action : NoBF16Action);
568 };
569
570 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
571 LegalizeAction NoI16x2Action) {
572 bool IsOpSupported = false;
573 // instructions are available on sm_90 only
574 switch (Op) {
575 case ISD::ADD:
576 case ISD::SMAX:
577 case ISD::SMIN:
578 case ISD::UMIN:
579 case ISD::UMAX:
580 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
581 break;
582 }
583 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
584 };
585
586 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
587 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
588 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
589 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
590 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
591 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
592 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
593 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
594 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
595 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
596 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
597 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
598
599 if (STI.hasF32x2Instructions()) {
600 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
601 addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
602 }
603
604 // Conversion to/from FP16/FP16x2 is always legal.
609
610 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
611 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
612 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
613
614 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
615 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
616
617 // Conversion to/from BFP16/BFP16x2 is always legal.
622
623 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
624 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
625 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
626 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
627
628 // Conversion to/from i16/i16x2 is always legal.
633
638
639 // No support for these operations with v2f32/v2i32
640 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
641 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
642
645 MVT::v2i32, Expand);
646
647 // Need custom lowering in case the index is dynamic.
648 if (STI.hasF32x2Instructions())
649 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
650 Custom);
651
652 // Custom conversions to/from v2i8.
653 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
654
655 // Only logical ops can be done on v4i8/v2i32 directly, others must be done
656 // elementwise.
673 {MVT::v4i8, MVT::v2i32}, Expand);
674
675 // Operations not directly supported by NVPTX.
676 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
677 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
678 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
680 setOperationAction(ISD::BR_CC, VT, Expand);
681 }
682
683 // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
684 setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
685
686 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
687 // For others we will expand to a SHL/SRA pair.
693 setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand);
694
701
704
706 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
707 Expand);
708
709 if (STI.hasHWROT32()) {
712 Custom);
713 }
714
716
717 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
718 setOperationAction(ISD::BRIND, MVT::Other, Expand);
719
  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
723
724 // FP extload/truncstore is not legal in PTX. We need to expand all these.
725 for (auto FloatVTs :
727 for (MVT ValVT : FloatVTs) {
728 for (MVT MemVT : FloatVTs) {
729 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
730 setTruncStoreAction(ValVT, MemVT, Expand);
731 }
732 }
733 }
734
735 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
736 // how they'll be lowered in ISel anyway, and by doing this a little earlier
737 // we allow for more DAG combine opportunities.
738 for (auto IntVTs :
740 for (MVT ValVT : IntVTs)
741 for (MVT MemVT : IntVTs)
742 if (isTypeLegal(ValVT))
743 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
744
745 // PTX does not support load / store predicate registers
746 setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
747 for (MVT VT : MVT::integer_valuetypes()) {
749 Promote);
750 setTruncStoreAction(VT, MVT::i1, Expand);
751 }
752
  // Disable generation of extload/truncstore for v2i32/v2i16/v2i8. The generic
754 // expansion for these nodes when they are unaligned is incorrect if the
755 // type is a vector.
756 //
757 // TODO: Fix the generic expansion for these nodes found in
758 // TargetLowering::expandUnalignedLoad/Store.
760 MVT::v2i8, Expand);
762 {MVT::v2i8, MVT::v2i16}, Expand);
763 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
764 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
765 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
766
767 // Register custom handling for illegal type loads/stores. We'll try to custom
768 // lower almost all illegal types and logic in the lowering will discard cases
769 // we can't handle.
770 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
772 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
773 setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
774
775 // Custom legalization for LDU intrinsics.
776 // TODO: The logic to lower these is not very robust and we should rewrite it.
777 // Perhaps LDU should not be represented as an intrinsic at all.
780 if (IsPTXVectorType(VT))
782
786 MVT::i1, Expand);
787
788 // This is legal in NVPTX
793
794 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
795 setOperationAction({ISD::STACKRESTORE, ISD::STACKSAVE}, MVT::Other, Custom);
796
797 // TRAP can be lowered to PTX trap
798 setOperationAction(ISD::TRAP, MVT::Other, Legal);
799 // DEBUGTRAP can be lowered to PTX brkpt
800 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
801
802 // Support varargs.
803 setOperationAction(ISD::VASTART, MVT::Other, Custom);
804 setOperationAction(ISD::VAARG, MVT::Other, Custom);
805 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
806 setOperationAction(ISD::VAEND, MVT::Other, Expand);
807
809 {MVT::i16, MVT::i32, MVT::i64}, Legal);
810
812 Promote);
815
816 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
817 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
818 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
819 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
820 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
821 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
822 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
823
824 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
825 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
826 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
827 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
828 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
829 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
830
831 // Other arithmetic and logic ops are unsupported.
835 {MVT::v2i16, MVT::v2i32}, Expand);
836
837 // v2i32 is not supported for any arithmetic operations
842 MVT::v2i32, Expand);
843
848 if (STI.getPTXVersion() >= 43) {
853 }
854
856 setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
859
860 // PTX does not directly support SELP of i1, so promote to i32 first
862
863 // PTX cannot multiply two i64s in a single instruction.
866
867 // We have some custom DAG combine patterns for these nodes
870 ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM,
871 ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM,
872 ISD::FMINIMUMNUM, ISD::MUL, ISD::SHL,
874 ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD,
875 ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND});
876
877 // setcc for f16x2 and bf16x2 needs special handling to prevent
878 // legalizer's attempt to scalarize it due to v2i1 not being legal.
879 if (STI.allowFP16Math() || STI.hasBF16Math())
881
882 // Vector reduction operations. These may be turned into shuffle or tree
883 // reductions depending on what instructions are available for each type.
885 MVT EltVT = VT.getVectorElementType();
886 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
887 setOperationAction({ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMIN,
888 ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM},
889 VT, Custom);
890 }
891 }
892
893 // Promote fp16 arithmetic if fp16 hardware isn't available or the
894 // user passed --nvptx-no-fp16-math. The flag is useful because,
895 // although sm_53+ GPUs have some sort of FP16 support in
896 // hardware, only sm_53 and sm_60 have full implementation. Others
897 // only have token amount of hardware and are likely to run faster
898 // by using fp32 units instead.
899 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
900 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
901 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
902 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
903 // bf16 must be promoted to f32.
904 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
905 if (getOperationAction(Op, MVT::bf16) == Promote)
906 AddPromotedToType(Op, MVT::bf16, MVT::f32);
907 setOperationAction(Op, MVT::v2f32,
908 STI.hasF32x2Instructions() ? Legal : Expand);
909 }
910
911 // On SM80, we select add/mul/sub as fma to avoid promotion to float
912 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
913 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
914 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
916 }
917 }
918 }
919
920 // f16/f16x2 neg was introduced in PTX 60, SM_53.
921 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
922 STI.getPTXVersion() >= 60 &&
923 STI.allowFP16Math();
924 for (const auto &VT : {MVT::f16, MVT::v2f16})
925 setOperationAction(ISD::FNEG, VT,
926 IsFP16FP16x2NegAvailable ? Legal : Expand);
927
928 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
929 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
930 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
931 // (would be) Library functions.
932
933 // These map to conversion instructions for scalar FP types.
934 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
935 ISD::FROUNDEVEN, ISD::FTRUNC}) {
936 setOperationAction(Op, MVT::f16, Legal);
937 setOperationAction(Op, MVT::f32, Legal);
938 setOperationAction(Op, MVT::f64, Legal);
939 setOperationAction(Op, MVT::v2f16, Expand);
940 setOperationAction(Op, MVT::v2bf16, Expand);
941 setOperationAction(Op, MVT::v2f32, Expand);
942 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
943 if (getOperationAction(Op, MVT::bf16) == Promote)
944 AddPromotedToType(Op, MVT::bf16, MVT::f32);
945 }
946
947 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
948 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
949 }
950 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
951 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
952 setOperationAction(ISD::FP_EXTEND, VT, Custom);
954 }
955 }
956
957 // Expand v2f32 = fp_extend
958 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
959 // Expand v2[b]f16 = fp_round v2f32
960 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
961
962 // sm_80 only has conversions between f32 and bf16. Custom lower all other
963 // bf16 conversions.
964 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
965 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
968 VT, Custom);
969 }
972 MVT::bf16, Custom);
973 }
974
975 setOperationAction(ISD::FROUND, MVT::f16, Promote);
976 setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
977 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
978 setOperationAction(ISD::FROUND, MVT::f32, Custom);
979 setOperationAction(ISD::FROUND, MVT::f64, Custom);
980 setOperationAction(ISD::FROUND, MVT::bf16, Promote);
981 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
982
983 // 'Expand' implements FCOPYSIGN without calling an external library.
990
991 // These map to corresponding instructions for f32/f64. f16 must be
992 // promoted to f32. v2f16 is expanded to f16, which is then promoted
993 // to f32.
994 for (const auto &Op :
995 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FTANH}) {
996 setOperationAction(Op, MVT::f16, Promote);
997 setOperationAction(Op, MVT::f32, Legal);
998 // only div/rem/sqrt are legal for f64
999 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
1000 setOperationAction(Op, MVT::f64, Legal);
1001 }
1002 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
1003 setOperationAction(Op, MVT::bf16, Promote);
1004 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1005 }
1006 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
1007
1008 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
1009 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
1010 if (STI.getPTXVersion() >= 65) {
1011 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
1012 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
1013 } else {
1014 setOperationAction(ISD::FABS, MVT::f16, Promote);
1015 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
1016 }
1017 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
1018 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
1019 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
1020 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
1021
1022 for (const auto &Op :
1023 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}) {
1024 setOperationAction(Op, MVT::f32, Legal);
1025 setOperationAction(Op, MVT::f64, Legal);
1026 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1027 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1028 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1029 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1030 if (getOperationAction(Op, MVT::bf16) == Promote)
1031 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1032 setOperationAction(Op, MVT::v2f32, Expand);
1033 }
1034 bool SupportsF32MinMaxNaN =
1035 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1036 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1037 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1038 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1039 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1040 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1041 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1042 setOperationAction(Op, MVT::v2f32, Expand);
1043 }
1044
1045 // Custom lowering for inline asm with 128-bit operands
1048
1049 // FEXP2 support:
1050 // - f32
1051 // - f16/f16x2 (sm_70+, PTX 7.0+)
1052 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1053 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1054 setOperationAction(ISD::FEXP2, MVT::f32, Legal);
1055 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1056 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1057 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1058 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1059 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1060
1061 // FLOG2 supports f32 only
1062 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1063 if (UseApproxLog2F32) {
1064 setOperationAction(ISD::FLOG2, MVT::f32, Legal);
1065 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1066 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1067 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1068 Expand);
1069 }
1070
1071 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1072
1073 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1074
1075 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1076 // type, we need to custom lower it.
1077 setOperationAction({ISD::ATOMIC_CMP_SWAP, ISD::ATOMIC_SWAP}, MVT::i128,
1078 Custom);
1079
1080 // Now deduce the information based on the above mentioned
1081 // actions
1082 computeRegisterProperties(STI.getRegisterInfo());
1083
1084 // PTX support for 16-bit CAS is emulated. Only use 32+
1085 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1086 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1088
1089 // Custom lowering for tcgen05.ld vector operands
1091 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1092 MVT::v32i32, MVT::v64i32, MVT::v128i32},
1093 Custom);
1094
1095 // Custom lowering for tcgen05.st vector operands
1097 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1098 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},
1099 Custom);
1100
1101 // Enable custom lowering for the following:
1102 // * MVT::i128 - clusterlaunchcontrol
1103 // * MVT::i32 - prmt
1104 // * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics
1105 // * MVT::Other - internal.addrspace.wrap
1107 {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
1108}
1109
1110const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
1111
1112#define MAKE_CASE(V) \
1113 case V: \
1114 return #V;
1115
1116 switch ((NVPTXISD::NodeType)Opcode) {
1118 break;
1119
1172 MAKE_CASE(
1174 MAKE_CASE(
1186 MAKE_CASE(
1188 MAKE_CASE(
1195 }
1196 return nullptr;
1197
1198#undef MAKE_CASE
1199}
1200
TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
      VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}
1208
1210 int Enabled, int &ExtraSteps,
1211 bool &UseOneConst,
1212 bool Reciprocal) const {
1215 return SDValue();
1216
1217 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1218 ExtraSteps = 0;
1219
1220 SDLoc DL(Operand);
1221 EVT VT = Operand.getValueType();
1222 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1223
1224 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1225 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1226 DAG.getConstant(IID, DL, MVT::i32), Operand);
1227 };
1228
1229 // The sqrt and rsqrt refinement processes assume we always start out with an
1230 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1231 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1232 // any refinement, we must return a regular sqrt.
1233 if (Reciprocal || ExtraSteps > 0) {
1234 if (VT == MVT::f32)
1235 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1236 : Intrinsic::nvvm_rsqrt_approx_f);
1237 else if (VT == MVT::f64)
1238 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1239 else
1240 return SDValue();
1241 } else {
1242 if (VT == MVT::f32)
1243 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1244 : Intrinsic::nvvm_sqrt_approx_f);
1245 else {
1246 // There's no sqrt.approx.f64 instruction, so we emit
1247 // reciprocal(rsqrt(x)). This is faster than
1248 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1249 // x * rsqrt(x).)
1250 return DAG.getNode(
1252 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1253 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1254 }
1255 }
1256}
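
// Worked example (added for illustration; not in the original source): an f32
// reciprocal-sqrt estimate with FTZ enabled becomes a call to
// llvm.nvvm.rsqrt.approx.ftz.f, a plain f32 sqrt estimate uses
// llvm.nvvm.sqrt.approx.f, and a plain f64 sqrt estimate is emitted as
// rcp.approx.ftz.d(rsqrt.approx.d(x)) since PTX has no sqrt.approx.f64.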
1257
std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    std::optional<unsigned> FirstVAArg, const CallBase &CB,
    unsigned UniqueCallSite) const {
1263 auto PtrVT = getPointerTy(DL);
1264
1265 std::string Prototype;
1266 raw_string_ostream O(Prototype);
1267 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1268
1269 if (RetTy->isVoidTy()) {
1270 O << "()";
1271 } else {
1272 O << "(";
1273 if (shouldPassAsArray(RetTy)) {
1274 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1275 O << ".param .align " << RetAlign.value() << " .b8 _["
1276 << DL.getTypeAllocSize(RetTy) << "]";
1277 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1278 unsigned size = 0;
1279 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1280 size = ITy->getBitWidth();
1281 } else {
1282 assert(RetTy->isFloatingPointTy() &&
1283 "Floating point type expected here");
1284 size = RetTy->getPrimitiveSizeInBits();
1285 }
1286 // PTX ABI requires all scalar return values to be at least 32
1287 // bits in size. fp16 normally uses .b16 as its storage type in
1288 // PTX, so its size must be adjusted here, too.
      size = promoteScalarArgumentSize(size);

1291 O << ".param .b" << size << " _";
1292 } else if (isa<PointerType>(RetTy)) {
1293 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1294 } else {
1295 llvm_unreachable("Unknown return type");
1296 }
1297 O << ") ";
1298 }
1299 O << "_ (";
1300
1301 bool first = true;
1302
1303 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1304 auto AllOuts = ArrayRef(Outs);
1305 for (const unsigned I : llvm::seq(NumArgs)) {
1306 const auto ArgOuts =
1307 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1308 AllOuts = AllOuts.drop_front(ArgOuts.size());
1309
1310 Type *Ty = Args[I].Ty;
1311 if (!first) {
1312 O << ", ";
1313 }
1314 first = false;
1315
1316 if (ArgOuts[0].Flags.isByVal()) {
1317 // Indirect calls need strict ABI alignment so we disable optimizations by
1318 // not providing a function to optimize.
1319 Type *ETy = Args[I].IndirectType;
1320 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1321 Align ParamByValAlign =
1322 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1323
1324 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1325 << ArgOuts[0].Flags.getByValSize() << "]";
1326 } else {
1327 if (shouldPassAsArray(Ty)) {
1328 Align ParamAlign =
1329 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1330 O << ".param .align " << ParamAlign.value() << " .b8 _["
1331 << DL.getTypeAllocSize(Ty) << "]";
1332 continue;
1333 }
1334 // i8 types in IR will be i16 types in SDAG
1335 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1336 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1337 "type mismatch between callee prototype and arguments");
1338 // scalar type
1339 unsigned sz = 0;
1340 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1341 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1342 } else if (isa<PointerType>(Ty)) {
1343 sz = PtrVT.getSizeInBits();
1344 } else {
1345 sz = Ty->getPrimitiveSizeInBits();
1346 }
1347 O << ".param .b" << sz << " _";
1348 }
1349 }
1350
1351 if (FirstVAArg)
1352 O << (first ? "" : ",") << " .param .align "
1353 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1354 O << ")";
1355 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1356 O << " .noreturn";
1357 O << ";";
1358
1359 return Prototype;
1360}
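
// Worked example (added for illustration; not in the original source): for an
// indirect call of type "i32 (float, ptr)" with 64-bit pointers and unique
// call site 1, the emitted string looks roughly like
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);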
1361
1363 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1364 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1365}
1366
1367Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1368 unsigned Idx,
1369 const DataLayout &DL) const {
1370 if (!CB) {
1371 // CallSite is zero, fallback to ABI type alignment
1372 return DL.getABITypeAlign(Ty);
1373 }
1374
1375 const Function *DirectCallee = CB->getCalledFunction();
1376
1377 if (!DirectCallee) {
1378 // We don't have a direct function symbol, but that may be because of
1379 // constant cast instructions in the call.
1380
1381 // With bitcast'd call targets, the instruction will be the call
1382 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1383 // Check if we have call alignment metadata
1384 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1385 return StackAlign.value();
1386 }
1387 DirectCallee = getMaybeBitcastedCallee(CB);
1388 }
1389
1390 // Check for function alignment information if we found that the
1391 // ultimate target is a Function
1392 if (DirectCallee)
1393 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1394
1395 // Call is indirect, fall back to the ABI type alignment
1396 return DL.getABITypeAlign(Ty);
1397}
1398
1400 const GlobalAddressSDNode *Func) {
1401 if (!Func)
1402 return false;
1403 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1404 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1405 return false;
1406}
1407
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG,
                                      const DataLayout &DL,
                                      const TargetLowering &TL) {
1411 if (Ptr->getOpcode() == ISD::FrameIndex) {
1412 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1415
1417 }
1418
  // Peel off an addrspacecast to generic and load directly from the specific
  // address space.
1421 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1422 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1423 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1424 Ptr = ASC->getOperand(0);
1425 return MachinePointerInfo(ASC->getSrcAddressSpace());
1426 }
1427 }
1428
1429 return MachinePointerInfo();
1430}
1431
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags) {
  if (Flags.isSExt())
1434 return ISD::SIGN_EXTEND;
1435 if (Flags.isZExt())
1436 return ISD::ZERO_EXTEND;
1437 return ISD::ANY_EXTEND;
1438}
1439
static SDValue correctParamType(SDValue V, EVT ExpectedVT,
                                ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1442 SDLoc dl) {
1443 const EVT ActualVT = V.getValueType();
1444 assert((ActualVT == ExpectedVT ||
1445 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1446 "Non-integer argument type size mismatch");
1447 if (ExpectedVT.bitsGT(ActualVT))
1448 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1449 if (ExpectedVT.bitsLT(ActualVT))
1450 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1451
1452 return V;
1453}
1454
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       SmallVectorImpl<SDValue> &InVals) const {

  if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
    report_fatal_error(
        "Support for variadic functions (unsized array parameter) introduced "
        "in PTX ISA version 6.0 and requires target sm_30.");
1462
1463 SelectionDAG &DAG = CLI.DAG;
1464 SDLoc dl = CLI.DL;
1465 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1466 SDValue Callee = CLI.Callee;
1467 ArgListTy &Args = CLI.getArgs();
1468 Type *RetTy = CLI.RetTy;
1469 const CallBase *CB = CLI.CB;
1470 const DataLayout &DL = DAG.getDataLayout();
1471 LLVMContext &Ctx = *DAG.getContext();
1472
1473 const auto GetI32 = [&](const unsigned I) {
1474 return DAG.getConstant(I, dl, MVT::i32);
1475 };
1476
1477 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1478 const SDValue CallChain = CLI.Chain;
1479 const SDValue StartChain =
1480 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1481 SDValue DeclareGlue = StartChain.getValue(1);
1482
1483 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1484
1485 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1486 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1487 // loaded/stored using i16, so it's handled here as well.
1488 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1489 SDValue Declare =
1490 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1491 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1492 CallPrereqs.push_back(Declare);
1493 DeclareGlue = Declare.getValue(1);
1494 return Declare;
1495 };
1496
1497 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1498 unsigned Size) {
1499 SDValue Declare = DAG.getNode(
1500 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1501 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1502 CallPrereqs.push_back(Declare);
1503 DeclareGlue = Declare.getValue(1);
1504 return Declare;
1505 };
1506
1507 // Variadic arguments.
1508 //
1509 // Normally, for each argument, we declare a param scalar or a param
1510 // byte array in the .param space, and store the argument value to that
1511 // param scalar or array starting at offset 0.
1512 //
1513 // In the case of the first variadic argument, we declare a vararg byte array
1514 // with size 0. The exact size of this array isn't known at this point, so
1515 // it'll be patched later. All the variadic arguments will be stored to this
1516 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1517 // initially set to 0, so it can be used for non-variadic arguments (which use
1518 // 0 offset) to simplify the code.
1519 //
1520 // After all vararg is processed, 'VAOffset' holds the size of the
1521 // vararg byte array.
1522 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1523 "Non-VarArg function with extra arguments");
1524
1525 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1526 unsigned VAOffset = 0; // current offset in the param array
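
  // Worked example (added for illustration; not in the original source): for a
  // variadic callee with one fixed argument and a single extra i32 argument,
  // param 0 is declared and stored as usual, while the i32 goes into the
  // zero-sized vararg byte array declared just below; it is stored at
  // VAOffset 0, VAOffset advances to 4, and the declared size is patched to 4
  // afterwards.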
1527
1528 const SDValue VADeclareParam =
1529 CLI.Args.size() > FirstVAArg
1530 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1531 Align(STI.getMaxRequiredAlignment()), 0)
1532 : SDValue();
1533
1534 // Args.size() and Outs.size() need not match.
1535 // Outs.size() will be larger
1536 // * if there is an aggregate argument with multiple fields (each field
1537 // showing up separately in Outs)
1538 // * if there is a vector argument with more than typical vector-length
1539 // elements (generally if more than 4) where each vector element is
1540 // individually present in Outs.
1541 // So a different index should be used for indexing into Outs/OutVals.
1542 // See similar issue in LowerFormalArguments.
1543 auto AllOuts = ArrayRef(CLI.Outs);
1544 auto AllOutVals = ArrayRef(CLI.OutVals);
1545 assert(AllOuts.size() == AllOutVals.size() &&
1546 "Outs and OutVals must be the same size");
  // Declare the .params or .reg needed to pass values
  // to the function
1549 for (const auto E : llvm::enumerate(Args)) {
1550 const auto ArgI = E.index();
1551 const auto Arg = E.value();
1552 const auto ArgOuts =
1553 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1554 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1555 AllOuts = AllOuts.drop_front(ArgOuts.size());
1556 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1557
1558 const bool IsVAArg = (ArgI >= FirstVAArg);
1559 const bool IsByVal = Arg.IsByVal;
1560
1561 const SDValue ParamSymbol =
1562 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1563
1564 assert((!IsByVal || Arg.IndirectType) &&
1565 "byval arg must have indirect type");
1566 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1567
1568 const Align ArgAlign = [&]() {
1569 if (IsByVal) {
1570 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1571 // so we don't need to worry whether it's naturally aligned or not.
1572 // See TargetLowering::LowerCallTo().
1573 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1575 InitialAlign, DL);
1576 }
1577 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1578 }();
1579
1580 const unsigned TySize = DL.getTypeAllocSize(ETy);
1581 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1582 "type size mismatch");
1583
1584 const SDValue ArgDeclare = [&]() {
1585 if (IsVAArg)
1586 return VADeclareParam;
1587
1588 if (IsByVal || shouldPassAsArray(Arg.Ty))
1589 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1590
1591 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1592 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1593 "Only int and float types are supported as non-array arguments");
1594
1595 return MakeDeclareScalarParam(ParamSymbol, TySize);
1596 }();
1597
1598 if (IsByVal) {
1599 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1600 SDValue SrcPtr = ArgOutVals[0];
1601 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1602 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1603
1604 if (IsVAArg)
1605 VAOffset = alignTo(VAOffset, ArgAlign);
1606
1607 SmallVector<EVT, 4> ValueVTs, MemVTs;
1609 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1610
1611 unsigned J = 0;
1612 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1613 for (const unsigned NumElts : VI) {
1614 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1615 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1616 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1617 SDValue SrcLoad =
1618 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1619
1620 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1621 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1622 SDValue ParamAddr =
1623 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1624 SDValue StoreParam =
1625 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1627 CallPrereqs.push_back(StoreParam);
1628
1629 J += NumElts;
1630 }
1631 if (IsVAArg)
1632 VAOffset += TySize;
1633 } else {
1636 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1637 VAOffset);
1638 assert(VTs.size() == Offsets.size() && "Size mismatch");
1639 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1640
1641 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1642 // than 32-bits are sign extended or zero extended, depending on
1643 // whether they are signed or unsigned types. This case applies
1644 // only to scalar parameters and not to aggregate values.
1645 const bool ExtendIntegerParam =
1646 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1647
1648 const auto GetStoredValue = [&](const unsigned I) {
1649 SDValue StVal = ArgOutVals[I];
1651 StVal.getValueType() &&
1652 "OutVal type should always be legal");
1653
1654 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1655 const EVT StoreVT =
1656 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1657
1658 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1659 };
1660
1661 unsigned J = 0;
1662 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1663 for (const unsigned NumElts : VI) {
1664 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1665
1666 unsigned Offset;
1667 if (IsVAArg) {
1668 // TODO: We may need to support vector types that can be passed
1669 // as scalars in variadic arguments.
1670 assert(NumElts == 1 &&
1671 "Vectorization should be disabled for vaargs.");
1672
1673 // Align each part of the variadic argument to their type.
1674 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1675 Offset = VAOffset;
1676
1677 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1678 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1679 } else {
1680 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1681 Offset = Offsets[J];
1682 }
1683
1684 SDValue Ptr =
1685 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1686
1687 const MaybeAlign CurrentAlign = ExtendIntegerParam
1688 ? MaybeAlign(std::nullopt)
1689 : commonAlignment(ArgAlign, Offset);
1690
1691 SDValue Val =
1692 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1693 return GetStoredValue(J + K);
1694 });
1695
1696 SDValue StoreParam =
1697 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1699 CallPrereqs.push_back(StoreParam);
1700
1701 J += NumElts;
1702 }
1703 }
1704 }
1705
1706 // Handle Result
1707 if (!Ins.empty()) {
1708 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1709 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1710 if (shouldPassAsArray(RetTy)) {
1711 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1712 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1713 } else {
1714 MakeDeclareScalarParam(RetSymbol, ResultSize);
1715 }
1716 }
1717
1718 // Set the size of the vararg param byte array if the callee is a variadic
1719 // function and the variadic part is not empty.
1720 if (VADeclareParam) {
1721 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1722 VADeclareParam.getOperand(1),
1723 VADeclareParam.getOperand(2), GetI32(VAOffset),
1724 VADeclareParam.getOperand(4)};
1725 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1726 VADeclareParam->getVTList(), DeclareParamOps);
1727 }
1728
1729 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1730 // If the type of the callsite does not match that of the function, convert
1731 // the callsite to an indirect call.
1732 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1733
1734 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1735 // between them we must rely on the call site value which is valid for
1736 // indirect calls but is always null for libcalls.
1737 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1738
1739 if (isa<ExternalSymbolSDNode>(Callee)) {
1740 Function* CalleeFunc = nullptr;
1741
1742 // Try to find the callee in the current module.
1743 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1744 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1745
1746 // Set the "libcall callee" attribute to indicate that the function
1747 // must always have a declaration.
1748 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1749 }
1750
1751 if (IsIndirectCall) {
1752 // This is indirect function call case : PTX requires a prototype of the
1753 // form
1754 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of the call
1756 // instruction.
1757 // The prototype is embedded in a string and put as the operand for a
1758 // CallPrototype SDNode which will print out to the value of the string.
1759 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1760 std::string Proto =
1761 getPrototype(DL, RetTy, Args, CLI.Outs,
1762 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1763 UniqueCallSite);
1764 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1765 const SDValue PrototypeDeclare = DAG.getNode(
1766 NVPTXISD::CallPrototype, dl, MVT::Other,
1767 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1768 CallPrereqs.push_back(PrototypeDeclare);
1769 }
1770
1771 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1772 const unsigned NumArgs =
1773 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1774 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1775 /// NumParams, Callee, Proto)
1776 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1777 const SDValue Call = DAG.getNode(
1778 NVPTXISD::CALL, dl, MVT::Other,
1779 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1780 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1781
1782 SmallVector<SDValue, 16> LoadChains{Call};
1783 SmallVector<SDValue, 16> ProxyRegOps;
1784 if (!Ins.empty()) {
1787 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1788 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1789
1790 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1791 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1792
1793 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1794 // 32-bits are sign extended or zero extended, depending on whether
1795 // they are signed or unsigned types.
1796 const bool ExtendIntegerRetVal =
1797 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1798
1799 unsigned I = 0;
1800 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1801 for (const unsigned NumElts : VI) {
1802 const MaybeAlign CurrentAlign =
1803 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1804 : commonAlignment(RetAlign, Offsets[I]);
1805
1806 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1807 const EVT LoadVT =
1808 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1809 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1810 SDValue Ptr =
1811 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1812
1813 SDValue R =
1814 DAG.getLoad(VecVT, dl, Call, Ptr,
1815 MachinePointerInfo(ADDRESS_SPACE_PARAM),
1816 CurrentAlign);
1817 LoadChains.push_back(R.getValue(1));
1818 for (const unsigned J : llvm::seq(NumElts))
1819 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1820 I += NumElts;
1821 }
1822 }
1823
1824 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1825 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1826 UniqueCallSite + 1, SDValue(), dl);
1827
1828 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1829 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1830 // dangling.
1831 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1832 SDValue Proxy =
1833 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1834 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1835 InVals.push_back(Ret);
1836 }
1837
1838 // set IsTailCall to false for now, until we figure out how to express
1839 // tail call optimization in PTX
1840 CLI.IsTailCall = false;
1841 return CallEnd;
1842}
1843
1844SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1845 SelectionDAG &DAG) const {
1846
1847 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1848 const Function &Fn = DAG.getMachineFunction().getFunction();
1849
1850 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1851 Fn,
1852 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1853 "requires target sm_52.",
1854 SDLoc(Op).getDebugLoc()));
1855 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1856 Op.getOperand(0)};
1857 return DAG.getMergeValues(Ops, SDLoc());
1858 }
1859
1860 SDLoc DL(Op.getNode());
1861 SDValue Chain = Op.getOperand(0);
1862 SDValue Size = Op.getOperand(1);
1863 uint64_t Align = Op.getConstantOperandVal(2);
1864
1865 // The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1866 // the default stack alignment should be used.
1867 if (Align == 0)
1869
1870 // The size operand of the PTX alloca instruction is 64-bit for m64, 32-bit for m32.
1871 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1872
1873 SDValue Alloc =
1874 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1875 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1876 DAG.getTargetConstant(Align, DL, MVT::i32)});
1877
1878 SDValue ASC = DAG.getAddrSpaceCast(
1879 DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL,
1880 ADDRESS_SPACE_GENERIC);
1881 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1882}
1883
1884SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
1885 SelectionDAG &DAG) const {
1886 SDLoc DL(Op.getNode());
1887 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1888 const Function &Fn = DAG.getMachineFunction().getFunction();
1889
1890 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1891 Fn,
1892 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1893 ">= sm_52.",
1894 DL.getDebugLoc()));
1895 return Op.getOperand(0);
1896 }
1897
1898 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1899 SDValue Chain = Op.getOperand(0);
1900 SDValue Ptr = Op.getOperand(1);
1901 SDValue ASC = DAG.getAddrSpaceCast(DL, LocalVT, Ptr, ADDRESS_SPACE_GENERIC,
1902 ADDRESS_SPACE_LOCAL);
1903 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1904}
1905
1906SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op,
1907 SelectionDAG &DAG) const {
1908 SDLoc DL(Op.getNode());
1909 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1910 const Function &Fn = DAG.getMachineFunction().getFunction();
1911
1912 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
1913 Fn,
1914 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1915 "sm_52.",
1916 DL.getDebugLoc()));
1917 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1918 return DAG.getMergeValues(Ops, DL);
1919 }
1920
1921 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1922 SDValue Chain = Op.getOperand(0);
1923 SDValue SS =
1924 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1925 SDValue ASC = DAG.getAddrSpaceCast(
1926 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1927 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1928}
1929
1930// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1931// (see LegalizeDAG.cpp). This is slow and uses local memory.
1932// We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
1933SDValue
1934NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1935 SDNode *Node = Op.getNode();
1936 SDLoc dl(Node);
1937 SmallVector<SDValue, 8> Ops;
1938 unsigned NumOperands = Node->getNumOperands();
1939 for (unsigned i = 0; i < NumOperands; ++i) {
1940 SDValue SubOp = Node->getOperand(i);
1941 EVT VVT = SubOp.getNode()->getValueType(0);
1942 EVT EltVT = VVT.getVectorElementType();
1943 unsigned NumSubElem = VVT.getVectorNumElements();
1944 for (unsigned j = 0; j < NumSubElem; ++j) {
1945 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1946 DAG.getIntPtrConstant(j, dl)));
1947 }
1948 }
1949 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1950}
1951
1952static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL,
1953 SelectionDAG &DAG,
1954 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1955 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1956 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1957 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1958 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1959}
1960
1961static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL,
1962 SelectionDAG &DAG,
1963 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1964 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1965}
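// Illustrative note on PRMT semantics (a summary of the PTX ISA description of
// prmt.b32 in its default mode, not a new definition): the two i32 operands A
// and B form an 8-byte pool {B3,B2,B1,B0,A3,A2,A1,A0}; each selector nibble k
// (0..3, low to high) picks the pool byte that becomes result byte k. For
// example, selector 0x3210 reproduces A and 0x7654 reproduces B.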
1966
1967/// Reduces the elements using the scalar operations provided. The operations
1968/// are sorted descending in number of inputs they take. The flags on the
1969/// original reduction operation will be propagated to each scalar operation.
1970/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
1971/// used in ExpandReductions and SelectionDAG.
1972static SDValue buildTreeReduction(
1973 const SmallVector<SDValue> &Elements, EVT EltTy,
1974 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1975 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1976 // Build the reduction tree at each level, starting with all the elements.
1977 SmallVector<SDValue> Level = Elements;
1978
1979 unsigned OpIdx = 0;
1980 while (Level.size() > 1) {
1981 // Try to reduce this level using the current operator.
1982 const auto [Op, NumInputs] = Ops[OpIdx];
1983
1984 // Build the next level by partially reducing all elements.
1985 SmallVector<SDValue> ReducedLevel;
1986 unsigned I = 0, E = Level.size();
1987 for (; I + NumInputs <= E; I += NumInputs) {
1988 // Reduce elements in groups of [NumInputs], as much as possible.
1989 ReducedLevel.push_back(DAG.getNode(
1990 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1991 }
1992
1993 if (I < E) {
1994 // Handle leftover elements.
1995
1996 if (ReducedLevel.empty()) {
1997 // We didn't reduce anything at this level. We need to pick a smaller
1998 // operator.
1999 ++OpIdx;
2000 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
2001 continue;
2002 }
2003
2004 // We reduced some things but there's still more left, meaning the
2005 // operator's number of inputs doesn't evenly divide this level size. Move
2006 // these elements to the next level.
2007 for (; I < E; ++I)
2008 ReducedLevel.push_back(Level[I]);
2009 }
2010
2011 // Process the next level.
2012 Level = ReducedLevel;
2013 }
2014
2015 return *Level.begin();
2016}
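// Worked example (illustrative): reducing 7 elements with
// Ops = {(fmax3, 3), (fmax, 2)}:
//   level 0: e0 e1 e2 e3 e4 e5 e6
//   level 1: fmax3(e0,e1,e2)  fmax3(e3,e4,e5)  e6   <- leftover e6 carried up
//   level 2: fmax3(l1a, l1b, e6)
// Nearby elements are grouped together, unlike the shuffle-based reduction.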
2017
2018// Get scalar reduction opcode
2019static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
2020 switch (ReductionOpcode) {
2021 case ISD::VECREDUCE_FMAX:
2022 return ISD::FMAXNUM;
2023 case ISD::VECREDUCE_FMIN:
2024 return ISD::FMINNUM;
2025 case ISD::VECREDUCE_FMAXIMUM:
2026 return ISD::FMAXIMUM;
2027 case ISD::VECREDUCE_FMINIMUM:
2028 return ISD::FMINIMUM;
2029 default:
2030 llvm_unreachable("unhandled reduction opcode");
2031 }
2032}
2033
2034/// Get 3-input scalar reduction opcode
2035static std::optional<NVPTXISD::NodeType>
2036getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
2037 switch (ReductionOpcode) {
2038 case ISD::VECREDUCE_FMAX:
2039 return NVPTXISD::FMAXNUM3;
2040 case ISD::VECREDUCE_FMIN:
2041 return NVPTXISD::FMINNUM3;
2042 case ISD::VECREDUCE_FMAXIMUM:
2043 return NVPTXISD::FMAXIMUM3;
2044 case ISD::VECREDUCE_FMINIMUM:
2045 return NVPTXISD::FMINIMUM3;
2046 default:
2047 return std::nullopt;
2048 }
2049}
2050
2051/// Lower reductions to either a sequence of operations or a tree if
2052/// reassociations are allowed. This method will use larger operations like
2053/// max3/min3 when the target supports them.
2054SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
2055 SelectionDAG &DAG) const {
2056 SDLoc DL(Op);
2057 const SDNodeFlags Flags = Op->getFlags();
2058 SDValue Vector = Op.getOperand(0);
2059
2060 const unsigned Opcode = Op->getOpcode();
2061 const EVT EltTy = Vector.getValueType().getVectorElementType();
2062
2063 // Whether we can use 3-input min/max when expanding the reduction.
2064 const bool CanUseMinMax3 =
2065 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2066 STI.getPTXVersion() >= 88 &&
2067 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2068 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2069
2070 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2071 // number of inputs they take.
2072 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2073
2074 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2075 CanUseMinMax3 && Opcode3Elem)
2076 ScalarOps.push_back({*Opcode3Elem, 3});
2077 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2078
2080 DAG.ExtractVectorElements(Vector, Elements);
2081
2082 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2083}
2084
2085SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2086 // Handle bitcasting from v2i8 without hitting the default promotion
2087 // strategy which goes through stack memory.
2088 EVT FromVT = Op->getOperand(0)->getValueType(0);
2089 if (FromVT != MVT::v2i8) {
2090 return Op;
2091 }
2092
2093 // Pack vector elements into i16 and bitcast to final type
2094 SDLoc DL(Op);
2095 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2096 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2097 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2098 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2099 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2100 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2101 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2102 SDValue AsInt = DAG.getNode(
2103 ISD::OR, DL, MVT::i16,
2104 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2105 EVT ToVT = Op->getValueType(0);
2106 return DAG.getBitcast(ToVT, AsInt);
2107}
2108
2109// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2110// would get lowered as two constant loads and vector-packing move.
2111// Instead we want just a constant move:
2112// mov.b32 %r2, 0x40003C00
2113SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2114 SelectionDAG &DAG) const {
2115 EVT VT = Op->getValueType(0);
2116 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2117 return Op;
2118 SDLoc DL(Op);
2119
2120 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2121 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2122 isa<ConstantFPSDNode>(Operand);
2123 })) {
2124 if (VT != MVT::v4i8)
2125 return Op;
2126 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2127 // to optimize calculation of constant parts.
2128 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2129 uint64_t SelectionValue) -> SDValue {
2130 SDValue L = Left;
2131 SDValue R = Right;
2132 if (Cast) {
2133 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2134 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2135 }
2136 return getPRMT(L, R, SelectionValue, DL, DAG);
2137 };
2138 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2139 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2140 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2141 return DAG.getBitcast(VT, PRMT3210);
2142 }
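// Illustrative example of the selectors used above: with byte values b0..b3 in
// the low byte of each (extended) operand,
//   PRMT(b0, b1, 0x3340) packs {b1, b0} into result bytes 1:0 (bytes 3:2 are
//   don't-care and get overwritten), and
//   PRMT(lo, hi, 0x5410) then merges bytes 1:0 of each half, yielding the i32
//   value (b3 << 24) | (b2 << 16) | (b1 << 8) | b0, which is bitcast to v4i8.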
2143
2144 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2145 auto GetOperand = [](SDValue Op, int N) -> APInt {
2146 const SDValue &Operand = Op->getOperand(N);
2147 EVT VT = Op->getValueType(0);
2148 if (Operand->isUndef())
2149 return APInt(32, 0);
2150 APInt Value;
2151 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2152 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2153 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2154 Value = Operand->getAsAPIntVal();
2155 else
2156 llvm_unreachable("Unsupported type");
2157 // i8 values are carried around as i16, so we need to zero out the upper
2158 // bits so they do not get in the way of combining individual byte values.
2159 if (VT == MVT::v4i8)
2160 Value = Value.trunc(8);
2161 return Value.zext(32);
2162 };
2163
2164 // Construct a 32-bit constant by shifting into place smaller values
2165 // (elements of the vector type VT).
2166 // For example, if VT has 2 elements, then N == 2:
2167 // ShiftAmount = 32 / N = 16
2168 // Value |= Op0 (b16) << 0
2169 // Value |= Op1 (b16) << 16
2170 // If N == 4:
2171 // ShiftAmount = 32 / N = 8
2172 // Value |= Op0 (b8) << 0
2173 // Value |= Op1 (b8) << 8
2174 // Value |= Op2 (b8) << 16
2175 // Value |= Op3 (b8) << 24
2176 // ...etc
2177 APInt Value(32, 0);
2178 const unsigned NumElements = VT.getVectorNumElements();
2179 assert(32 % NumElements == 0 && "must evenly divide bit length");
2180 const unsigned ShiftAmount = 32 / NumElements;
2181 for (unsigned ElementNo : seq(NumElements))
2182 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2183 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2184 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2185}
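// For example (illustrative): BUILD_VECTOR v4i8 <0x11, 0x22, 0x33, 0x44>
// becomes the single constant 0x44332211 (element 0 in the lowest byte), and
// BUILD_VECTOR v2i16 <0x0001, 0x0002> becomes 0x00020001, each emitted as one
// mov.b32 as described in the comment above LowerBUILD_VECTOR.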
2186
2187SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2188 SelectionDAG &DAG) const {
2189 SDValue Index = Op->getOperand(1);
2190 SDValue Vector = Op->getOperand(0);
2191 SDLoc DL(Op);
2192 EVT VectorVT = Vector.getValueType();
2193
2194 if (VectorVT == MVT::v4i8) {
2195 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2196 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2197 DAG.getConstant(0x7770, DL, MVT::i32));
2198 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2199 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2200 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2201 SDNodeFlags Flags;
2202 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2203 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2204 Ext->setFlags(Flags);
2205 return Ext;
2206 }
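// Illustrative example for the PRMT above: extracting lane 2 of a v4i8 gives
// Selector = 0x7770 | 2 = 0x7772, so result byte 0 is byte 2 of the vector and
// bytes 1..3 come from byte 7 of the {0, vector} pool, i.e. zero; the lane is
// therefore zero-extended into the low 8 bits of an i32 before the final
// any-extend/truncate to the result type.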
2207
2208 // Constant index will be matched by tablegen.
2209 if (isa<ConstantSDNode>(Index.getNode()))
2210 return Op;
2211
2212 // Extract individual elements and select one of them.
2213 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2214 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2215 EVT EltVT = VectorVT.getVectorElementType();
2216
2217 SDLoc dl(Op.getNode());
2218 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2219 DAG.getIntPtrConstant(0, dl));
2220 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2221 DAG.getIntPtrConstant(1, dl));
2222 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2223 ISD::SETEQ);
2224}
2225
2226SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2227 SelectionDAG &DAG) const {
2228 SDValue Vector = Op->getOperand(0);
2229 EVT VectorVT = Vector.getValueType();
2230
2231 if (VectorVT != MVT::v4i8)
2232 return Op;
2233 SDLoc DL(Op);
2234 SDValue Value = Op->getOperand(1);
2235 if (Value->isUndef())
2236 return Vector;
2237
2238 SDValue Index = Op->getOperand(2);
2239
2240 SDValue BFI =
2241 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2242 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2243 DAG.getNode(ISD::MUL, DL, MVT::i32,
2244 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2245 DAG.getConstant(8, DL, MVT::i32)),
2246 DAG.getConstant(8, DL, MVT::i32)});
2247 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2248}
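// Illustrative example for the BFI above: inserting a value into lane 2 of a
// v4i8 multiplies the index by 8, so the NVPTXISD::BFI node inserts the low 8
// bits of the value at bit offset 16 of the i32-typed vector (a bfi.b32 with
// position 16 and length 8 in PTX terms).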
2249
2250SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2251 SelectionDAG &DAG) const {
2252 SDValue V1 = Op.getOperand(0);
2253 EVT VectorVT = V1.getValueType();
2254 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2255 return Op;
2256
2257 // Lower shuffle to PRMT instruction.
2258 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2259 SDValue V2 = Op.getOperand(1);
2260 uint32_t Selector = 0;
2261 for (auto I : llvm::enumerate(SVN->getMask())) {
2262 if (I.value() != -1) // -1 is a placeholder for undef.
2263 Selector |= (I.value() << (I.index() * 4));
2264 }
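// Illustrative example: a v4i8 shuffle mask <3, 2, 1, 0> (reversing one input)
// yields Selector = 0x0123, i.e. result byte 0 takes source byte 3, byte 1
// takes byte 2, and so on; masks that index into the second operand select
// from bytes 4..7 of the {V2, V1} pool.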
2265
2266 SDLoc DL(Op);
2267 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2268 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2269 return DAG.getBitcast(Op.getValueType(), PRMT);
2270}
2271/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2272/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2273/// amount, or
2274/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2275/// amount.
2276SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2277 SelectionDAG &DAG) const {
2278 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2279 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2280
2281 EVT VT = Op.getValueType();
2282 unsigned VTBits = VT.getSizeInBits();
2283 SDLoc dl(Op);
2284 SDValue ShOpLo = Op.getOperand(0);
2285 SDValue ShOpHi = Op.getOperand(1);
2286 SDValue ShAmt = Op.getOperand(2);
2287 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2288
2289 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2290 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2291 // {dHi, dLo} = {aHi, aLo} >> Amt
2292 // dHi = aHi >> Amt
2293 // dLo = shf.r.clamp aLo, aHi, Amt
2294
2295 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2296 SDValue Lo =
2297 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2298
2299 SDValue Ops[2] = { Lo, Hi };
2300 return DAG.getMergeValues(Ops, dl);
2301 }
2302 else {
2303 // {dHi, dLo} = {aHi, aLo} >> Amt
2304 // - if (Amt>=size) then
2305 // dLo = aHi >> (Amt-size)
2306 // dHi = aHi >> Amt (this is either all 0 or all 1)
2307 // else
2308 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2309 // dHi = aHi >> Amt
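// Worked example (illustrative), with 32-bit halves and Amt = 40 for a logical
// shift right: since Amt >= size, dLo = aHi >> (40 - 32) = aHi >> 8 and
// dHi = aHi >> 40, which the target clamps to a full-width shift, i.e. all 0
// (or all sign bits for the arithmetic variant), as noted above.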
2310
2311 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2312 DAG.getConstant(VTBits, dl, MVT::i32),
2313 ShAmt);
2314 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2315 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2316 DAG.getConstant(VTBits, dl, MVT::i32));
2317 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2318 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2319 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2320
2321 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2322 DAG.getConstant(VTBits, dl, MVT::i32),
2323 ISD::SETGE);
2324 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2325 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2326
2327 SDValue Ops[2] = { Lo, Hi };
2328 return DAG.getMergeValues(Ops, dl);
2329 }
2330}
2331
2332/// LowerShiftLeftParts - Lower SHL_PARTS, which
2333/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2334/// amount, or
2335/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2336/// amount.
2337SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2338 SelectionDAG &DAG) const {
2339 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2340 assert(Op.getOpcode() == ISD::SHL_PARTS);
2341
2342 EVT VT = Op.getValueType();
2343 unsigned VTBits = VT.getSizeInBits();
2344 SDLoc dl(Op);
2345 SDValue ShOpLo = Op.getOperand(0);
2346 SDValue ShOpHi = Op.getOperand(1);
2347 SDValue ShAmt = Op.getOperand(2);
2348
2349 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2350 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2351 // {dHi, dLo} = {aHi, aLo} << Amt
2352 // dHi = shf.l.clamp aLo, aHi, Amt
2353 // dLo = aLo << Amt
2354
2355 SDValue Hi =
2356 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2357 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2358
2359 SDValue Ops[2] = { Lo, Hi };
2360 return DAG.getMergeValues(Ops, dl);
2361 }
2362 else {
2363 // {dHi, dLo} = {aHi, aLo} << Amt
2364 // - if (Amt>=size) then
2365 // dLo = aLo << Amt (all 0)
2366 // dLo = aLo << (Amt-size)
2367 // else
2368 // dLo = aLo << Amt
2369 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2370
2371 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2372 DAG.getConstant(VTBits, dl, MVT::i32),
2373 ShAmt);
2374 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2375 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2376 DAG.getConstant(VTBits, dl, MVT::i32));
2377 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2378 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2379 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2380
2381 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2382 DAG.getConstant(VTBits, dl, MVT::i32),
2383 ISD::SETGE);
2384 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2385 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2386
2387 SDValue Ops[2] = { Lo, Hi };
2388 return DAG.getMergeValues(Ops, dl);
2389 }
2390}
2391
2392/// If the types match, convert the generic copysign to the NVPTXISD version,
2393/// otherwise bail, ensuring that mismatched cases are properly expanded.
2394SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2395 SelectionDAG &DAG) const {
2396 EVT VT = Op.getValueType();
2397 SDLoc DL(Op);
2398
2399 SDValue In1 = Op.getOperand(0);
2400 SDValue In2 = Op.getOperand(1);
2401 EVT SrcVT = In2.getValueType();
2402
2403 if (!SrcVT.bitsEq(VT))
2404 return SDValue();
2405
2406 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2407}
2408
2409SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2410 EVT VT = Op.getValueType();
2411
2412 if (VT == MVT::f32)
2413 return LowerFROUND32(Op, DAG);
2414
2415 if (VT == MVT::f64)
2416 return LowerFROUND64(Op, DAG);
2417
2418 llvm_unreachable("unhandled type");
2419}
2420
2421// This is the rounding method used in CUDA libdevice, in C-like code:
2422// float roundf(float A)
2423// {
2424// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2425// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2426// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2427// }
2428SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2429 SelectionDAG &DAG) const {
2430 SDLoc SL(Op);
2431 SDValue A = Op.getOperand(0);
2432 EVT VT = Op.getValueType();
2433
2434 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2435
2436 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2437 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2438 const unsigned SignBitMask = 0x80000000;
2439 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2440 DAG.getConstant(SignBitMask, SL, MVT::i32));
2441 const unsigned PointFiveInBits = 0x3F000000;
2442 SDValue PointFiveWithSignRaw =
2443 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2444 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2445 SDValue PointFiveWithSign =
2446 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2447 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2448 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2449
2450 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2451 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2452 SDValue IsLarge =
2453 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2454 ISD::SETOGT);
2455 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2456
2457 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2458 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2459 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2460 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2461 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2462}
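// Worked example (illustrative) for LowerFROUND32 above: for A = 2.5f the
// sequence computes trunc(2.5 + 0.5) = 3.0 and neither guard fires, so
// round(2.5) = 3.0 (round-half-away-from-zero). The abs(A) < 0.5 guard matters
// for inputs like the largest float below 0.5, where A + 0.5 rounds up to 1.0
// in f32 arithmetic and trunc would then return 1.0 instead of the correct 0.0.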
2463
2464// The implementation of round(double) is similar to that of round(float) in
2465// that they both separate the value range into three regions and use a method
2466// specific to the region to round the values. However, round(double) first
2467// calculates the round of the absolute value and then adds the sign back while
2468// round(float) directly rounds the value with sign.
2469SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2470 SelectionDAG &DAG) const {
2471 SDLoc SL(Op);
2472 SDValue A = Op.getOperand(0);
2473 EVT VT = Op.getValueType();
2474
2475 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2476
2477 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2478 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2479 DAG.getConstantFP(0.5, SL, VT));
2480 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2481
2482 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2483 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2484 SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2485 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2486 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2487 DAG.getConstantFP(0, SL, VT),
2488 RoundedA);
2489
2490 // Add sign to rounded_A
2491 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2492 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2493
2494 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2495 SDValue IsLarge =
2496 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2497 ISD::SETOGT);
2498 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2499}
2500
2501static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG) {
2502 EVT VT = N->getValueType(0);
2503 EVT NVT = MVT::f32;
2504 if (VT.isVector()) {
2505 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2506 }
2507 SDLoc DL(N);
2508 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2509 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2510 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2511 return DAG.getFPExtendOrRound(Res, DL, VT);
2512}
2513
2514SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2515 SelectionDAG &DAG) const {
2516 if (useF32FTZ(DAG.getMachineFunction())) {
2517 return PromoteBinOpToF32(Op.getNode(), DAG);
2518 }
2519 return Op;
2520}
2521
2522SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2523 SelectionDAG &DAG) const {
2524 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2525
2526 if (Op.getValueType() == MVT::bf16) {
2527 SDLoc Loc(Op);
2528 return DAG.getNode(
2529 ISD::FP_ROUND, Loc, MVT::bf16,
2530 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2531 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2532 }
2533
2534 // Everything else is considered legal.
2535 return Op;
2536}
2537
2538SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2539 SelectionDAG &DAG) const {
2540 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2541
2542 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2543 SDLoc Loc(Op);
2544 return DAG.getNode(
2545 Op.getOpcode(), Loc, Op.getValueType(),
2546 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2547 }
2548
2549 // Everything else is considered legal.
2550 return Op;
2551}
2552
2553SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2554 SelectionDAG &DAG) const {
2555 EVT NarrowVT = Op.getValueType();
2556 SDValue Wide = Op.getOperand(0);
2557 EVT WideVT = Wide.getValueType();
2558 if (NarrowVT.getScalarType() == MVT::bf16) {
2559 const TargetLowering *TLI = STI.getTargetLowering();
2560 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2561 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2562 }
2563 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2564 // This combination was the first to support f32 -> bf16.
2565 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2566 if (WideVT.getScalarType() == MVT::f32) {
2567 return Op;
2568 }
2569 if (WideVT.getScalarType() == MVT::f64) {
2570 SDLoc Loc(Op);
2571 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2572 // the hardware f32 -> bf16 instruction.
2573 SDValue rod = TLI->expandRoundInexactToOdd(
2574 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2575 : MVT::f32,
2576 Wide, Loc, DAG);
2577 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2578 }
2579 }
2580 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2581 }
2582 }
2583
2584 // Everything else is considered legal.
2585 return Op;
2586}
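// Note (illustrative reasoning, not new behavior): lowering f64 -> bf16 as a
// plain f64 -> f32 round-to-nearest followed by f32 -> bf16 can double-round;
// rounding the f64 to f32 with round-to-odd first keeps the "sticky"
// information in the f32's low bit, so the final f32 -> bf16 rounding above
// produces the correctly rounded bf16 result.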
2587
2588SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2589 SelectionDAG &DAG) const {
2590 SDValue Narrow = Op.getOperand(0);
2591 EVT NarrowVT = Narrow.getValueType();
2592 EVT WideVT = Op.getValueType();
2593 if (NarrowVT.getScalarType() == MVT::bf16) {
2594 if (WideVT.getScalarType() == MVT::f32 &&
2595 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2596 SDLoc Loc(Op);
2597 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2598 }
2599 if (WideVT.getScalarType() == MVT::f64 &&
2600 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2601 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2602 : MVT::f32;
2603 SDLoc Loc(Op);
2604 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2605 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2606 } else {
2607 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2608 }
2609 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2610 }
2611 }
2612
2613 // Everything else is considered legal.
2614 return Op;
2615}
2616
2617static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2618 SDLoc DL(Op);
2619 if (Op.getValueType() != MVT::v2i16)
2620 return Op;
2621 EVT EltVT = Op.getValueType().getVectorElementType();
2622 SmallVector<SDValue> VecElements;
2623 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2624 SmallVector<SDValue> ScalarArgs;
2625 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2626 [&](const SDUse &O) {
2627 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2628 O.get(), DAG.getIntPtrConstant(I, DL));
2629 });
2630 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2631 }
2632 SDValue V =
2633 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2634 return V;
2635}
2636
2637static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
2638 SDNode *N = Op.getNode();
2639 SDLoc DL(N);
2640 SmallVector<SDValue, 8> Ops;
2641
2642 // split the vector argument
2643 for (size_t I = 0; I < N->getNumOperands(); I++) {
2644 SDValue Val = N->getOperand(I);
2645 EVT ValVT = Val.getValueType();
2646 if (ValVT.isVector()) {
2647 EVT EltVT = ValVT.getVectorElementType();
2648 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2649 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2650 DAG.getIntPtrConstant(J, DL)));
2651 } else
2652 Ops.push_back(Val);
2653 }
2654
2655 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2656 SDValue Tcgen05StNode =
2657 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2658 MemSD->getMemoryVT(), MemSD->getMemOperand());
2659
2660 return Tcgen05StNode;
2661}
2662
2663static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2664 switch (IID) {
2665 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2666 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1;
2667 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2668 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2;
2669 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2670 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2671 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2672 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2673 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2674 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2675 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2676 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2677 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2678 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2679 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2680 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2681 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2682 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2683 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2684 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2685 case Intrinsic::
2686 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2687 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2688 case Intrinsic::
2689 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2690 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2691 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2692 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1;
2693 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2694 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2;
2695 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2696 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2697 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2698 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2699 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2700 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2701 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2702 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2703 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2704 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2705 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2706 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2707 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2708 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2709 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2710 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2711 case Intrinsic::
2712 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2713 return NVPTXISD::
2714 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2715 case Intrinsic::
2716 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2717 return NVPTXISD::
2718 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2719 };
2720 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2721}
2722
2723static SDValue lowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG) {
2724 SDNode *N = Op.getNode();
2725 SDLoc DL(N);
2726 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2727
2728 SmallVector<SDValue, 8> Ops;
2729 // split the vector argument
2730 for (size_t I = 0; I < N->getNumOperands(); I++) {
2731 if (I == 1)
2732 continue; // skip IID
2733 SDValue Val = N->getOperand(I);
2734 EVT ValVT = Val.getValueType();
2735 if (ValVT.isVector()) {
2736 EVT EltVT = ValVT.getVectorElementType();
2737 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2738 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2739 DAG.getIntPtrConstant(J, DL)));
2740 } else
2741 Ops.push_back(Val);
2742 }
2743
2744 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2745 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2746 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2747 MemSD->getMemoryVT(), MemSD->getMemOperand());
2748
2749 return Tcgen05MMANode;
2750}
2751
2752// Lower vector return type of tcgen05.ld intrinsics
2753static std::optional<std::pair<SDValue, SDValue>>
2754lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
2755 SDLoc DL(N);
2756 EVT ResVT = N->getValueType(0);
2757 if (!ResVT.isVector())
2758 return {}; // already legalized.
2759
2760 const unsigned NumElts = ResVT.getVectorNumElements();
2761
2762 // Create the return type of the instructions
2763 SmallVector<EVT, 5> ListVTs;
2764 for (unsigned i = 0; i < NumElts; ++i)
2765 ListVTs.push_back(MVT::i32);
2766
2767 ListVTs.push_back(N->getValueType(1)); // Chain
2768
2769 SDVTList ResVTs = DAG.getVTList(ListVTs);
2770
2771 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
2772 N->getOperand(2)};
2773
2774 if (HasOffset) {
2775 Ops.push_back(N->getOperand(3)); // offset
2776 Ops.push_back(N->getOperand(4)); // Pack flag
2777 } else
2778 Ops.push_back(N->getOperand(3)); // Pack flag
2779
2780 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
2781 SDValue NewNode =
2782 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, ResVTs, Ops,
2783 MemSD->getMemoryVT(), MemSD->getMemOperand());
2784
2785 // split the vector result
2786 SmallVector<SDValue, 4> ScalarRes;
2787 for (unsigned i = 0; i < NumElts; ++i) {
2788 SDValue Res = NewNode.getValue(i);
2789 ScalarRes.push_back(Res);
2790 }
2791
2792 SDValue Chain = NewNode.getValue(NumElts);
2793 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
2794 return {{BuildVector, Chain}};
2795}
2796
2797static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
2798 SDNode *N = Op.getNode();
2799 SDValue Intrin = N->getOperand(1);
2800
2801 // Get the intrinsic ID
2802 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2803 switch (IntrinNo) {
2804 default:
2805 break;
2806 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2807 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2808 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2809 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2810 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2811 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2812 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2813 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2814 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2815 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2816 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2817 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2818 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2819 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2820 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2821 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2822 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2823 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2824 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2825 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2826 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2827 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2828 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2829 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2830 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2831 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2832 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2833 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2834 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2835 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2836 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2837 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2838 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2839 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2840 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2841 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2842 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2843 return lowerTcgen05St(Op, DAG);
2844 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2845 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2846 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2847 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2848 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2849 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2850 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2851 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2852 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2853 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2854 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2855 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2856 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2857 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2858 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2859 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2860 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2861 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2862 case Intrinsic::
2863 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2864 case Intrinsic::
2865 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2866 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2867 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2868 case Intrinsic::
2869 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2870 case Intrinsic::
2871 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2872 return lowerTcgen05MMADisableOutputLane(Op, DAG);
2873 }
2874 return Op;
2875}
2876
2878 SelectionDAG &DAG) {
2879
2880 SDNode *N = Op.getNode();
2881 if (N->getOperand(1).getValueType() != MVT::i128) {
2882 // Return if the operand is already lowered.
2883 return SDValue();
2884 }
2885
2886 unsigned IID =
2887 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2888 auto Opcode = [&]() {
2889 switch (IID) {
2890 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2892 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2894 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2896 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2898 default:
2899 llvm_unreachable("unsupported/unhandled intrinsic");
2900 }
2901 }();
2902
2903 SDLoc DL(N);
2904 SDValue TryCancelResponse = N->getOperand(1);
2905 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2906 SDValue TryCancelResponse0 =
2907 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2908 DAG.getIntPtrConstant(0, DL));
2909 SDValue TryCancelResponse1 =
2910 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2911 DAG.getIntPtrConstant(1, DL));
2912
2913 return DAG.getNode(Opcode, DL, N->getVTList(),
2914 {TryCancelResponse0, TryCancelResponse1});
2915}
2916
2917static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) {
2918 SDNode *N = Op.getNode();
2919 SDLoc DL(N);
2920 SDValue F32Vec = N->getOperand(1);
2921 SDValue RBits = N->getOperand(2);
2922
2923 unsigned IntrinsicID = N->getConstantOperandVal(0);
2924
2925 // Extract the 4 float elements from the vector
2926 SmallVector<SDValue, 6> Ops;
2927 for (unsigned i = 0; i < 4; ++i)
2928 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec,
2929 DAG.getIntPtrConstant(i, DL)));
2930
2932
2933 auto [OpCode, RetTy, CvtModeFlag] =
2934 [&]() -> std::tuple<NVPTXISD::NodeType, MVT::SimpleValueType, uint32_t> {
2935 switch (IntrinsicID) {
2936 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
2937 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8,
2938 CvtMode::RS | CvtMode::RELU_FLAG};
2939 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
2940 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2941 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
2942 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8,
2943 CvtMode::RS | CvtMode::RELU_FLAG};
2944 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
2945 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2946 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
2947 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8,
2948 CvtMode::RS | CvtMode::RELU_FLAG};
2949 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
2950 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2951 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
2952 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8,
2953 CvtMode::RS | CvtMode::RELU_FLAG};
2954 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
2955 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2956 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
2957 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16,
2958 CvtMode::RS | CvtMode::RELU_FLAG};
2959 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
2960 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, CvtMode::RS};
2961 default:
2962 llvm_unreachable("unsupported/unhandled intrinsic");
2963 }
2964 }();
2965
2966 Ops.push_back(RBits);
2967 Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32));
2968
2969 return DAG.getNode(OpCode, DL, RetTy, Ops);
2970}
2971
2972static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) {
2973 const unsigned Mode = [&]() {
2974 switch (Op->getConstantOperandVal(0)) {
2975 case Intrinsic::nvvm_prmt:
2976 return NVPTX::PTXPrmtMode::NONE;
2977 case Intrinsic::nvvm_prmt_b4e:
2978 return NVPTX::PTXPrmtMode::B4E;
2979 case Intrinsic::nvvm_prmt_ecl:
2980 return NVPTX::PTXPrmtMode::ECL;
2981 case Intrinsic::nvvm_prmt_ecr:
2982 return NVPTX::PTXPrmtMode::ECR;
2983 case Intrinsic::nvvm_prmt_f4e:
2984 return NVPTX::PTXPrmtMode::F4E;
2985 case Intrinsic::nvvm_prmt_rc16:
2986 return NVPTX::PTXPrmtMode::RC16;
2987 case Intrinsic::nvvm_prmt_rc8:
2988 return NVPTX::PTXPrmtMode::RC8;
2989 default:
2990 llvm_unreachable("unsupported/unhandled intrinsic");
2991 }
2992 }();
2993 SDLoc DL(Op);
2994 SDValue A = Op->getOperand(1);
2995 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
2996 : DAG.getConstant(0, DL, MVT::i32);
2997 SDValue Selector = (Op->op_end() - 1)->get();
2998 return getPRMT(A, B, Selector, DL, DAG, Mode);
2999}
3000
3001static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) {
3002 switch (Op->getConstantOperandVal(1)) {
3003 default:
3004 return Op;
3005
3006 // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
3007 // lower them through LowerOperation() instead of ReplaceNodeResults().
3008 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
3009 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
3010 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
3011 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
3012 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3013 return SDValue();
3014
3015 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
3016 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
3017 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3018 return SDValue();
3019 }
3020}
3021
3022static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) {
3023 switch (Op->getConstantOperandVal(0)) {
3024 default:
3025 return Op;
3026 case Intrinsic::nvvm_prmt:
3027 case Intrinsic::nvvm_prmt_b4e:
3028 case Intrinsic::nvvm_prmt_ecl:
3029 case Intrinsic::nvvm_prmt_ecr:
3030 case Intrinsic::nvvm_prmt_f4e:
3031 case Intrinsic::nvvm_prmt_rc16:
3032 case Intrinsic::nvvm_prmt_rc8:
3033 return lowerPrmtIntrinsic(Op, DAG);
3034 case Intrinsic::nvvm_internal_addrspace_wrap:
3035 return Op.getOperand(1);
3036 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
3037 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
3038 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
3039 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
3041 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
3042 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
3043 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
3044 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
3045 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
3046 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
3047 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
3048 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
3049 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
3050 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
3051 return lowerCvtRSIntrinsics(Op, DAG);
3052 }
3053}
3054
3055// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
3056// Lower these into a node returning the correct type which is zero-extended
3057// back to the correct size.
3058static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
3059 SDValue V = Op->getOperand(0);
3060 assert(V.getValueType() == MVT::i64 &&
3061 "Unexpected CTLZ/CTPOP type to legalize");
3062
3063 SDLoc DL(Op);
3064 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
3065 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
3066}
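// For illustration: PTX popc.b64 and clz.b64 write a 32-bit result register,
// e.g. roughly
//   popc.b64 %r1, %rd1;
//   cvt.u64.u32 %rd2, %r1;
// which is why the 64-bit node above is emitted as a 32-bit-typed count that
// is then zero-extended back to i64.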
3067
3068static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL,
3069 unsigned Opcode, SelectionDAG &DAG) {
3070 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
3071
3072 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
3073 if (!AmtConst)
3074 return SDValue();
3075 const auto Amt = AmtConst->getZExtValue() & 63;
3076
3077 SDValue UnpackA =
3078 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
3079 SDValue UnpackB =
3080 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
3081
3082 // The architecture is little endian: 0 = low bits, 1 = high bits
3083 SDValue ALo = UnpackA.getValue(0);
3084 SDValue AHi = UnpackA.getValue(1);
3085 SDValue BLo = UnpackB.getValue(0);
3086 SDValue BHi = UnpackB.getValue(1);
3087
3088 // The bitfield consists of { AHi : ALo : BHi : BLo }
3089 //
3090 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
3091 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
3092 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
3093 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
3094 //
3095 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
3096 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
3097 // on the direction. Amt = 32 can be implemented by a packing and unpacking
3098 // move to select and arrange the 32bit values. For simplicity, these cases
3099 // are not handled here explicitly and instead we rely on DAGCombiner to
3100 // remove the no-op funnel shifts we insert.
3101 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
3102 ? std::make_tuple(AHi, ALo, BHi)
3103 : std::make_tuple(ALo, BHi, BLo);
3104
3105 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
3106 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
3107 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
3108
3109 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
3110}
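// Worked example (illustrative): FSHL of {A, B} by Amt = 40. Since Amt >= 32,
// the selected window is { ALo : BHi : BLo }, so with NewAmt = 40 & 31 = 8:
//   RHi = fshl(ALo, BHi, 8)   and   RLo = fshl(BHi, BLo, 8),
// and the two 32-bit halves are repacked into the i64 result.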
3111
3112static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) {
3113 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
3114 SDLoc(Op), Op->getOpcode(), DAG);
3115}
3116
3117static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
3118 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
3119 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
3120 SDLoc(Op), Opcode, DAG);
3121}
3122
3123static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG) {
3124 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
3125 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
3126 // the semantics of LLVM's frem.
3127 SDLoc DL(Op);
3128 SDValue X = Op->getOperand(0);
3129 SDValue Y = Op->getOperand(1);
3130 EVT Ty = Op.getValueType();
3131 SDNodeFlags Flags = Op->getFlags();
3132
3133 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
3134 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
3135 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
3137 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
3139
3140 if (Flags.hasNoInfs())
3141 return Sub;
3142
3143 // If Y is infinite, return X
3144 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
3145 SDValue Inf =
3146 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
3147 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
3148 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
3149}
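// Worked example (illustrative): frem(5.5, 2.0) computes
// 5.5 - trunc(5.5 / 2.0) * 2.0 = 5.5 - 2.0 * 2.0 = 1.5, and for an infinite
// divisor the final select returns X unchanged, matching LLVM's frem
// semantics as noted above.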
3150
3151static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) {
3152 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
3153
3154 SDValue Cond = Op->getOperand(0);
3155 SDValue TrueVal = Op->getOperand(1);
3156 SDValue FalseVal = Op->getOperand(2);
3157 SDLoc DL(Op);
3158
3159 // If both operands are truncated, we push the select through the truncates.
3160 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
3161 FalseVal.getOpcode() == ISD::TRUNCATE) {
3162 TrueVal = TrueVal.getOperand(0);
3163 FalseVal = FalseVal.getOperand(0);
3164
3165 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3166 ? TrueVal.getValueType()
3167 : FalseVal.getValueType();
3168 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3169 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3170 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3171 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3172 }
3173
3174 // Otherwise, expand the select into a series of logical operations. These
3175 // often can be folded into other operations either by us or ptxas.
3176 TrueVal = DAG.getFreeze(TrueVal);
3177 FalseVal = DAG.getFreeze(FalseVal);
3178 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3179 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3180 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3181 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3182 return Or;
3183}
3184
3185SDValue
3186NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3187 switch (Op.getOpcode()) {
3188 case ISD::RETURNADDR:
3189 return SDValue();
3190 case ISD::FRAMEADDR:
3191 return SDValue();
3192 case ISD::ADDRSPACECAST:
3193 return LowerADDRSPACECAST(Op, DAG);
3194 case ISD::INTRINSIC_W_CHAIN:
3195 return lowerIntrinsicWChain(Op, DAG);
3196 case ISD::INTRINSIC_WO_CHAIN:
3197 return lowerIntrinsicWOChain(Op, DAG);
3198 case ISD::INTRINSIC_VOID:
3199 return lowerIntrinsicVoid(Op, DAG);
3200 case ISD::BUILD_VECTOR:
3201 return LowerBUILD_VECTOR(Op, DAG);
3202 case ISD::BITCAST:
3203 return LowerBITCAST(Op, DAG);
3204 case ISD::EXTRACT_SUBVECTOR:
3205 return Op;
3206 case ISD::EXTRACT_VECTOR_ELT:
3207 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3208 case ISD::INSERT_VECTOR_ELT:
3209 return LowerINSERT_VECTOR_ELT(Op, DAG);
3210 case ISD::VECTOR_SHUFFLE:
3211 return LowerVECTOR_SHUFFLE(Op, DAG);
3212 case ISD::CONCAT_VECTORS:
3213 return LowerCONCAT_VECTORS(Op, DAG);
3214 case ISD::VECREDUCE_FMAX:
3215 case ISD::VECREDUCE_FMIN:
3216 case ISD::VECREDUCE_FMAXIMUM:
3217 case ISD::VECREDUCE_FMINIMUM:
3218 return LowerVECREDUCE(Op, DAG);
3219 case ISD::STORE:
3220 return LowerSTORE(Op, DAG);
3221 case ISD::LOAD:
3222 return LowerLOAD(Op, DAG);
3223 case ISD::SHL_PARTS:
3224 return LowerShiftLeftParts(Op, DAG);
3225 case ISD::SRA_PARTS:
3226 case ISD::SRL_PARTS:
3227 return LowerShiftRightParts(Op, DAG);
3228 case ISD::SELECT:
3229 return lowerSELECT(Op, DAG);
3230 case ISD::FROUND:
3231 return LowerFROUND(Op, DAG);
3232 case ISD::FCOPYSIGN:
3233 return LowerFCOPYSIGN(Op, DAG);
3234 case ISD::SINT_TO_FP:
3235 case ISD::UINT_TO_FP:
3236 return LowerINT_TO_FP(Op, DAG);
3237 case ISD::FP_TO_SINT:
3238 case ISD::FP_TO_UINT:
3239 return LowerFP_TO_INT(Op, DAG);
3240 case ISD::FP_ROUND:
3241 return LowerFP_ROUND(Op, DAG);
3242 case ISD::FP_EXTEND:
3243 return LowerFP_EXTEND(Op, DAG);
3244 case ISD::BR_JT:
3245 return LowerBR_JT(Op, DAG);
3246 case ISD::VAARG:
3247 return LowerVAARG(Op, DAG);
3248 case ISD::VASTART:
3249 return LowerVASTART(Op, DAG);
3250 case ISD::FSHL:
3251 case ISD::FSHR:
3252 return lowerFSH(Op, DAG);
3253 case ISD::ROTL:
3254 case ISD::ROTR:
3255 return lowerROT(Op, DAG);
3256 case ISD::ABS:
3257 case ISD::SMIN:
3258 case ISD::SMAX:
3259 case ISD::UMIN:
3260 case ISD::UMAX:
3261 case ISD::ADD:
3262 case ISD::SUB:
3263 case ISD::MUL:
3264 case ISD::SHL:
3265 case ISD::SREM:
3266 case ISD::UREM:
3267 return LowerVectorArith(Op, DAG);
3268 case ISD::DYNAMIC_STACKALLOC:
3269 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3270 case ISD::STACKRESTORE:
3271 return LowerSTACKRESTORE(Op, DAG);
3272 case ISD::STACKSAVE:
3273 return LowerSTACKSAVE(Op, DAG);
3274 case ISD::CopyToReg:
3275 return LowerCopyToReg_128(Op, DAG);
3276 case ISD::FADD:
3277 case ISD::FSUB:
3278 case ISD::FMUL:
3279 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3280 return PromoteBinOpIfF32FTZ(Op, DAG);
3281 case ISD::CTPOP:
3282 case ISD::CTLZ:
3283 return lowerCTLZCTPOP(Op, DAG);
3284 case ISD::FREM:
3285 return lowerFREM(Op, DAG);
3286
3287 default:
3288 llvm_unreachable("Custom lowering not defined for operation");
3289 }
3290}
3291
3292SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
3293 SDLoc DL(Op);
3294 SDValue Chain = Op.getOperand(0);
3295 const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
3296 SDValue Index = Op.getOperand(2);
3297
3298 unsigned JId = JT->getIndex();
3299 const MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
3300 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
3301
3302 SDValue IdV = DAG.getConstant(JId, DL, MVT::i32);
3303
3304 // Generate BrxStart node
3305 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
3306 Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV);
3307
3308 // Generate BrxItem nodes
3309 assert(!MBBs.empty());
3310 for (MachineBasicBlock *MBB : MBBs.drop_back())
3311 Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0),
3312 DAG.getBasicBlock(MBB), Chain.getValue(1));
3313
3314 // Generate BrxEnd nodes
3315 SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
3316 IdV, Chain.getValue(1)};
3317 SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps);
3318
3319 return BrxEnd;
3320}
3321
3322// This will prevent AsmPrinter from trying to print the jump tables itself.
3323unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
3324 return MachineJumpTableInfo::EK_Inline;
3325}
3326
3327SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3328 SelectionDAG &DAG) const {
3329 const AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
3330 unsigned SrcAS = N->getSrcAddressSpace();
3331 unsigned DestAS = N->getDestAddressSpace();
3332 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3333 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3334 // Shared and SharedCluster can be converted to each other through generic
3335 // space
3336 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3337 DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER) ||
3338 (SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER &&
3339 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3340 SDLoc DL(Op.getNode());
3341 const MVT GenerictVT =
3342 getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
3343 SDValue GenericConversion = DAG.getAddrSpaceCast(
3344 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3345 SDValue SharedClusterConversion =
3346 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3347 ADDRESS_SPACE_GENERIC, DestAS);
3348 return SharedClusterConversion;
3349 }
3350
3351 return DAG.getUNDEF(Op.getValueType());
3352 }
3353
3354 return Op;
3355}
3356
3357// This function is almost a copy of SelectionDAG::expandVAArg().
3358// The only diff is that this one produces loads from local address space.
3359SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3360 const TargetLowering *TLI = STI.getTargetLowering();
3361 SDLoc DL(Op);
3362
3363 SDNode *Node = Op.getNode();
3364 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3365 EVT VT = Node->getValueType(0);
3366 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3367 SDValue Tmp1 = Node->getOperand(0);
3368 SDValue Tmp2 = Node->getOperand(1);
3369 const MaybeAlign MA(Node->getConstantOperandVal(3));
3370
3371 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3372 Tmp1, Tmp2, MachinePointerInfo(V));
3373 SDValue VAList = VAListLoad;
3374
3375 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3376 VAList = DAG.getNode(
3377 ISD::ADD, DL, VAList.getValueType(), VAList,
3378 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3379
3380 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3381 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3382 VAList.getValueType()));
3383 }
3384
3385 // Increment the pointer, VAList, to the next vaarg
3386 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3387 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
3388 DL, VAList.getValueType()));
3389
3390 // Store the incremented VAList to the legalized pointer
3391 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3392 MachinePointerInfo(V));
3393
3394 const Value *SrcV = Constant::getNullValue(
3395 PointerType::get(*DAG.getContext(), ADDRESS_SPACE_LOCAL));
3396
3397 // Load the actual argument out of the pointer VAList
3398 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3399}
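// Conceptually, for a va_arg of type Ty with over-alignment A and store size
// S the sequence built above is (sketch):
//   ap      = *ap_ptr                  // load the current va_list cursor
//   ap      = (ap + A - 1) & -A        // only if A > min stack arg alignment
//   *ap_ptr = ap + S                   // bump the cursor past this argument
//   result  = load Ty, local ap        // read the argument from local space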
3400
3401SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3402 const TargetLowering *TLI = STI.getTargetLowering();
3403 SDLoc DL(Op);
3404 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3405
3406 // Store the address of unsized array <function>_vararg[] in the ap object.
3407 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3408
3409 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3410 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3411 MachinePointerInfo(SV));
3412}
3413
3414/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
3415static std::optional<std::pair<SDValue, SDValue>>
3416replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
3417 LoadSDNode *LD = cast<LoadSDNode>(N);
3418 const EVT ResVT = LD->getValueType(0);
3419 const EVT MemVT = LD->getMemoryVT();
3420
3421 // If we're doing sign/zero extension as part of the load, avoid lowering to
3422 // a LoadV node. TODO: consider relaxing this restriction.
3423 if (ResVT != MemVT)
3424 return std::nullopt;
3425
3426 const auto NumEltsAndEltVT =
3427 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3428 if (!NumEltsAndEltVT)
3429 return std::nullopt;
3430 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3431
3432 Align Alignment = LD->getAlign();
3433 const auto &TD = DAG.getDataLayout();
3434 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3435 if (Alignment < PrefAlign) {
3436 // This load is not sufficiently aligned, so bail out and let this vector
3437 // load be scalarized. Note that we may still be able to emit smaller
3438 // vector loads. For example, if we are loading a <4 x float> with an
3439 // alignment of 8, this check will fail but the legalizer will try again
3440 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3441 return std::nullopt;
3442 }
3443
3444 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3445 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3446 // loaded type to i16 and propagate the "real" type as the memory type.
3447 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3448
3449 unsigned Opcode;
3450 switch (NumElts) {
3451 default:
3452 return std::nullopt;
3453 case 2:
3454 Opcode = NVPTXISD::LoadV2;
3455 break;
3456 case 4:
3457 Opcode = NVPTXISD::LoadV4;
3458 break;
3459 case 8:
3460 Opcode = NVPTXISD::LoadV8;
3461 break;
3462 }
3463 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3464 ListVTs.push_back(MVT::Other);
3465 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3466
3467 SDLoc DL(LD);
3468
3469 // Copy regular operands
3470 SmallVector<SDValue, 8> OtherOps(LD->ops());
3471
3472 // The select routine does not have access to the LoadSDNode instance, so
3473 // pass along the extension information
3474 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
3475
3476 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3477 LD->getMemOperand());
3478
3479 SmallVector<SDValue> ScalarRes;
3480 if (EltVT.isVector()) {
3482 assert(NumElts * EltVT.getVectorNumElements() ==
3483 ResVT.getVectorNumElements());
3484 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3485 // into individual elements.
3486 for (const unsigned I : llvm::seq(NumElts)) {
3487 SDValue SubVector = NewLD.getValue(I);
3488 DAG.ExtractVectorElements(SubVector, ScalarRes);
3489 }
3490 } else {
3491 for (const unsigned I : llvm::seq(NumElts)) {
3492 SDValue Res = NewLD.getValue(I);
3493 if (LoadEltVT != EltVT)
3494 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3495 ScalarRes.push_back(Res);
3496 }
3497 }
3498
3499 SDValue LoadChain = NewLD.getValue(NumElts);
3500
3501 const MVT BuildVecVT =
3502 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3503 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3504 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3505
3506 return {{LoadValue, LoadChain}};
3507}
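// For example (sketch): a sufficiently aligned load of <4 x float> becomes
//   {f32, f32, f32, f32, ch} = NVPTXISD::LoadV4 chain, ptr, ...
// followed by a BUILD_VECTOR of the four scalar results, while <8 x half> is
// loaded as four v2f16 sub-vectors that are split back into scalars and
// bitcast to the original type.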
3508
3509static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
3510 SmallVectorImpl<SDValue> &Results,
3511 const NVPTXSubtarget &STI) {
3512 if (auto Res = replaceLoadVector(N, DAG, STI))
3513 Results.append({Res->first, Res->second});
3514}
3515
3517 const NVPTXSubtarget &STI) {
3518 if (auto Res = replaceLoadVector(N, DAG, STI))
3519 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3520 return SDValue();
3521}
3522
3523// v = ld i1* addr
3524// =>
3525// v1 = ld i8* addr (-> i16)
3526// v = trunc i16 to i1
3527static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
3528 SDLoc dl(LD);
3529 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3530 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3531 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3532 LD->getBasePtr(), LD->getPointerInfo(),
3533 MVT::i8, LD->getAlign(),
3534 LD->getMemOperand()->getFlags());
3535 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3536 // The legalizer (the caller) is expecting two values from the legalized
3537 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3538 // in LegalizeDAG.cpp which also uses MergeValues.
3539 return DAG.getMergeValues({result, LD->getChain()}, dl);
3540}
3541
3542SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3543 LoadSDNode *LD = cast<LoadSDNode>(Op);
3544
3545 if (Op.getValueType() == MVT::i1)
3546 return lowerLOADi1(LD, DAG);
3547
3548 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3549 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3550 // we allow for more DAG combine opportunities.
3551 if (LD->getExtensionType() == ISD::EXTLOAD) {
3552 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3553 "Unexpected fpext-load");
3554 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3555 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3556 LD->getMemOperand());
3557 }
3558
3559 llvm_unreachable("Unexpected custom lowering for load");
3560}
3561
3562static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG,
3563 const NVPTXSubtarget &STI) {
3564 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3565 SDValue Val = N->getOperand(1);
3566 SDLoc DL(N);
3567 const EVT ValVT = Val.getValueType();
3568 const EVT MemVT = N->getMemoryVT();
3569
3570 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3571 // TODO: consider relaxing this restriction.
3572 if (ValVT != MemVT)
3573 return SDValue();
3574
3575 const auto NumEltsAndEltVT =
3576 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3577 if (!NumEltsAndEltVT)
3578 return SDValue();
3579 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3580
3581 const DataLayout &TD = DAG.getDataLayout();
3582
3583 Align Alignment = N->getAlign();
3584 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3585 if (Alignment < PrefAlign) {
3586 // This store is not sufficiently aligned, so bail out and let this vector
3587 // store be scalarized. Note that we may still be able to emit smaller
3588 // vector stores. For example, if we are storing a <4 x float> with an
3589 // alignment of 8, this check will fail but the legalizer will try again
3590 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3591 return SDValue();
3592 }
3593
3594 unsigned Opcode;
3595 switch (NumElts) {
3596 default:
3597 return SDValue();
3598 case 2:
3599 Opcode = NVPTXISD::StoreV2;
3600 break;
3601 case 4:
3602 Opcode = NVPTXISD::StoreV4;
3603 break;
3604 case 8:
3605 Opcode = NVPTXISD::StoreV8;
3606 break;
3607 }
3608
3609 SmallVector<SDValue, 8> Ops;
3610
3611 // First is the chain
3612 Ops.push_back(N->getOperand(0));
3613
3614 // Then the split values
3615 if (EltVT.isVector()) {
3617 assert(NumElts * EltVT.getVectorNumElements() ==
3618 ValVT.getVectorNumElements());
3619 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3620 // stored as b32s
3621 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3622 for (const unsigned I : llvm::seq(NumElts)) {
3623 SmallVector<SDValue, 4> SubVectorElts;
3624 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3625 NumEltsPerSubVector);
3626 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3627 }
3628 } else {
3629 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3630 for (const unsigned I : llvm::seq(NumElts)) {
3631 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3632 DAG.getIntPtrConstant(I, DL));
3633
3634 // Since StoreV2 is a target node, we cannot rely on DAG type
3635 // legalization. Therefore, we must ensure the type is legal. For i1 and
3636 // i8, we set the stored type to i16 and propagate the "real" type as the
3637 // memory type.
3638 if (EltVT.getSizeInBits() < 16)
3639 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3640 Ops.push_back(ExtVal);
3641 }
3642 }
3643
3644 // Then any remaining arguments
3645 Ops.append(N->op_begin() + 2, N->op_end());
3646
3647 SDValue NewSt =
3648 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3649 N->getMemoryVT(), N->getMemOperand());
3650
3651 // return DCI.CombineTo(N, NewSt, true);
3652 return NewSt;
3653}
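// For example (sketch): a sufficiently aligned store of <8 x half> becomes
//   NVPTXISD::StoreV4 chain, v2f16, v2f16, v2f16, v2f16, ptr, ...
// i.e. four b32 operands, while sub-16-bit scalar elements are any-extended
// to i16 with the original type kept as the memory VT.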
3654
3655SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3656 StoreSDNode *Store = cast<StoreSDNode>(Op);
3657 EVT VT = Store->getMemoryVT();
3658
3659 if (VT == MVT::i1)
3660 return LowerSTOREi1(Op, DAG);
3661
3662 // Lower store of any other vector type, including v2f32 as we want to break
3663 // it apart since this is not a widely-supported type.
3664 return lowerSTOREVector(Op, DAG, STI);
3665}
3666
3667// st i1 v, addr
3668// =>
3669 // v1 = zext v to i16
3670// st.u8 i16, addr
3671SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3672 SDNode *Node = Op.getNode();
3673 SDLoc dl(Node);
3674 StoreSDNode *ST = cast<StoreSDNode>(Node);
3675 SDValue Tmp1 = ST->getChain();
3676 SDValue Tmp2 = ST->getBasePtr();
3677 SDValue Tmp3 = ST->getValue();
3678 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3679 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3680 SDValue Result =
3681 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3682 ST->getAlign(), ST->getMemOperand()->getFlags());
3683 return Result;
3684}
3685
3686SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3687 SelectionDAG &DAG) const {
3688 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3689 // operand so that it can pass the legalization.
3690
3691 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3692 "Custom lowering for 128-bit CopyToReg only");
3693
3694 SDNode *Node = Op.getNode();
3695 SDLoc DL(Node);
3696
3697 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3698 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3699 DAG.getIntPtrConstant(0, DL));
3700 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3701 DAG.getIntPtrConstant(1, DL));
3702
3703 SmallVector<SDValue, 5> NewOps(Op.getNumOperands() + 1);
3704 SmallVector<EVT, 3> ResultsType(Node->values());
3705
3706 NewOps[0] = Op->getOperand(0); // Chain
3707 NewOps[1] = Op->getOperand(1); // Dst Reg
3708 NewOps[2] = Lo; // Lower 64-bit
3709 NewOps[3] = Hi; // Higher 64-bit
3710 if (Op.getNumOperands() == 4)
3711 NewOps[4] = Op->getOperand(3); // Glue if exists
3712
3713 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3714}
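// For example (sketch), an i128 copy
//   CopyToReg ch, reg, i128:v [, glue]
// is rewritten as
//   CopyToReg ch, reg, i64:lo, i64:hi [, glue]
// where lo/hi come from a v2i64 bitcast of the original value.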
3715
3716unsigned NVPTXTargetLowering::getNumRegisters(
3717 LLVMContext &Context, EVT VT,
3718 std::optional<MVT> RegisterVT = std::nullopt) const {
3719 if (VT == MVT::i128 && RegisterVT == MVT::i128)
3720 return 1;
3721 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
3722}
3723
3724bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3725 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
3726 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
3727 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
3728 Parts[0] = Val;
3729 return true;
3730 }
3731 return false;
3732}
3733
3734// This creates target external symbol for a function parameter.
3735// Name of the symbol is composed from its index and the function name.
3736// Negative index corresponds to special parameter (unsized array) used for
3737// passing variable arguments.
3738SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
3739 EVT T) const {
3740 StringRef SavedStr = nvTM->getStrPool().save(
3742 return DAG.getExternalSymbol(SavedStr.data(), T);
3743}
3744
3745SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
3746 EVT T) const {
3747 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
3748 return DAG.getExternalSymbol(SavedStr.data(), T);
3749}
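// For example (sketch, assuming a function named "foo"): formal parameter 2
// maps to a symbol such as "foo_param_2", the negative vararg index maps to
// the unsized array "foo_vararg" referenced by LowerVASTART above, and
// outgoing call-site parameters are simply "param0", "param1", ...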
3750
3751SDValue NVPTXTargetLowering::LowerFormalArguments(
3752 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3753 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3754 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3755 const DataLayout &DL = DAG.getDataLayout();
3756 LLVMContext &Ctx = *DAG.getContext();
3757 auto PtrVT = getPointerTy(DAG.getDataLayout());
3758
3759 const Function &F = DAG.getMachineFunction().getFunction();
3760
3761 SDValue Root = DAG.getRoot();
3762 SmallVector<SDValue, 16> OutChains;
3763
3764 // The number of IR arguments (F.args()) and Ins.size() need not match.
3765 // Ins.size() will be larger
3766 // * if there is an aggregate argument with multiple fields (each field
3767 // showing up separately in Ins)
3768 // * if there is a vector argument with more than typical vector-length
3769 // elements (generally if more than 4) where each vector element is
3770 // individually present in Ins.
3771 // So a different index should be used for indexing into Ins.
3772 // See similar issue in LowerCall.
3773
3774 auto AllIns = ArrayRef(Ins);
3775 for (const auto &Arg : F.args()) {
3776 const auto ArgIns = AllIns.take_while(
3777 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
3778 AllIns = AllIns.drop_front(ArgIns.size());
3779
3780 Type *Ty = Arg.getType();
3781
3782 if (ArgIns.empty())
3783 report_fatal_error("Empty parameter types are not supported");
3784
3785 if (Arg.use_empty()) {
3786 // argument is dead
3787 for (const auto &In : ArgIns) {
3788 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
3789 InVals.push_back(DAG.getUNDEF(In.VT));
3790 }
3791 continue;
3792 }
3793
3794 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
3795
3796 // In the following cases, assign a node order of "ArgNo + 1"
3797 // to newly created nodes. The SDNodes for params have to
3798 // appear in the same order as their order of appearance
3799 // in the original function. "ArgNo + 1" holds that order.
3800 if (Arg.hasByValAttr()) {
3801 // Param has ByVal attribute
3802 // Return MoveParam(param symbol).
3803 // Ideally, the param symbol can be returned directly,
3804 // but when SDNode builder decides to use it in a CopyToReg(),
3805 // machine instruction fails because TargetExternalSymbol
3806 // (not lowered) is target dependent, and CopyToReg assumes
3807 // the source is lowered.
3808 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
3809 const auto &ByvalIn = ArgIns[0];
3810 assert(getValueType(DL, Ty) == ByvalIn.VT &&
3811 "Ins type did not match function type");
3812 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
3813
3814 SDValue P;
3815 if (isKernelFunction(F)) {
3816 P = ArgSymbol;
3817 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3818 } else {
3819 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
3820 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3821 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
3822 ADDRESS_SPACE_GENERIC);
3823 }
3824 InVals.push_back(P);
3825 } else {
3826 SmallVector<EVT, 16> VTs;
3827 SmallVector<uint64_t, 16> Offsets;
3828 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
3829 assert(VTs.size() == ArgIns.size() && "Size mismatch");
3830 assert(VTs.size() == Offsets.size() && "Size mismatch");
3831
3832 const Align ArgAlign = getFunctionArgumentAlignment(
3833 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
3834
3835 unsigned I = 0;
3836 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
3837 for (const unsigned NumElts : VI) {
3838 // i1 is loaded/stored as i8
3839 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
3840 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
3841
3842 SDValue VecAddr = DAG.getObjectPtrOffset(
3843 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
3844
3845 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
3846 SDValue P =
3847 DAG.getLoad(VecVT, dl, Root, VecAddr,
3851 P.getNode()->setIROrder(Arg.getArgNo() + 1);
3852 for (const unsigned J : llvm::seq(NumElts)) {
3853 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
3854
3855 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
3856 DAG, dl);
3857 InVals.push_back(Elt);
3858 }
3859 I += NumElts;
3860 }
3861 }
3862 }
3863
3864 if (!OutChains.empty())
3865 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
3866
3867 return Chain;
3868}
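// For example (sketch): a <4 x float> argument typically appears in Ins as
// four f32 pieces; with 16-byte parameter alignment they are re-vectorized
// into a single v4f32 load from the parameter symbol and then split back
// into four scalar values pushed onto InVals.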
3869
3870SDValue
3871NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3872 bool isVarArg,
3873 const SmallVectorImpl<ISD::OutputArg> &Outs,
3874 const SmallVectorImpl<SDValue> &OutVals,
3875 const SDLoc &dl, SelectionDAG &DAG) const {
3876 const Function &F = DAG.getMachineFunction().getFunction();
3877 Type *RetTy = F.getReturnType();
3878
3879 if (RetTy->isVoidTy()) {
3880 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
3881 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3882 }
3883
3884 const DataLayout &DL = DAG.getDataLayout();
3885 LLVMContext &Ctx = *DAG.getContext();
3886
3887 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
3888 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
3889
3890 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3891 // 32-bits are sign extended or zero extended, depending on whether
3892 // they are signed or unsigned types.
3893 const bool ExtendIntegerRetVal =
3894 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
3895
3898 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
3899 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3900
3901 const auto GetRetVal = [&](unsigned I) -> SDValue {
3902 SDValue RetVal = OutVals[I];
3903 assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
3904 RetVal.getValueType() &&
3905 "OutVal type should always be legal");
3906
3907 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
3908 const EVT StoreVT =
3909 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
3910 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
3911 };
3912
3913 unsigned I = 0;
3914 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
3915 for (const unsigned NumElts : VI) {
3916 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
3917 ? MaybeAlign(std::nullopt)
3918 : commonAlignment(RetAlign, Offsets[I]);
3919
3921 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
3922
3923 SDValue Ptr =
3924 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
3925
3926 Chain = DAG.getStore(Chain, dl, Val, Ptr,
3928
3929 I += NumElts;
3930 }
3931
3932 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3933}
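// For example (sketch): returning an i8 from a device function stores a
// 32-bit extended value to [func_retval0] per the interoperability rule
// quoted above, while larger aggregates are written piecewise at the offsets
// computed by ComputePTXValueVTs.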
3934
3935void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3936 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3937 SelectionDAG &DAG) const {
3938 if (Constraint.size() > 1)
3939 return;
3940 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3941}
3942
3943// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3944// TgtMemIntrinsic
3945// because we need the information that is only available in the "Value" type
3946// of destination
3947// pointer. In particular, the address space information.
3948bool NVPTXTargetLowering::getTgtMemIntrinsic(
3949 IntrinsicInfo &Info, const CallInst &I,
3950 MachineFunction &MF, unsigned Intrinsic) const {
3951 switch (Intrinsic) {
3952 default:
3953 return false;
3954 case Intrinsic::nvvm_match_all_sync_i32p:
3955 case Intrinsic::nvvm_match_all_sync_i64p:
3956 Info.opc = ISD::INTRINSIC_W_CHAIN;
3957 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3958 // in order to model data exchange with other threads, but perform no real
3959 // memory accesses.
3960 Info.memVT = MVT::i1;
3961
3962 // Our result depends on both our and other thread's arguments.
3963 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3964 return true;
3965 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3966 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3967 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3968 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3969 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3970 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3971 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3972 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3973 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3974 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3975 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3976 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3977 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3978 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3979 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3980 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3981 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3982 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3983 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3984 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3985 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3986 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3987 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3988 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3989 Info.opc = ISD::INTRINSIC_W_CHAIN;
3990 Info.memVT = MVT::v8f16;
3991 Info.ptrVal = I.getArgOperand(0);
3992 Info.offset = 0;
3993 Info.flags = MachineMemOperand::MOLoad;
3994 Info.align = Align(16);
3995 return true;
3996 }
3997 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3998 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3999 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4000 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4001 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4002 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4003 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4004 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4005 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4006 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4007 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4008 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4009 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4010 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4011 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4012 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4013 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4014 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4015 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4016 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4017 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4018 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4019 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4020 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4021 Info.opc = ISD::INTRINSIC_W_CHAIN;
4022 Info.memVT = MVT::v2i32;
4023 Info.ptrVal = I.getArgOperand(0);
4024 Info.offset = 0;
4025 Info.flags = MachineMemOperand::MOLoad;
4026 Info.align = Align(8);
4027 return true;
4028 }
4029
4030 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4031 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4032 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4033 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4034 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4035 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4036 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4037 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4038 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4039 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4040 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4041 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4042 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4043 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4044 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4045 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4046
4047 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4048 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4049 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4050 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4051 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4052 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4053 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4054 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4055 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4056 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4057 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4058 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4059 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4060 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4061 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4062 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4063 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4064 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
4065 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
4066 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
4067 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
4068 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
4069 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
4070 Info.opc = ISD::INTRINSIC_W_CHAIN;
4071 Info.memVT = MVT::v4i32;
4072 Info.ptrVal = I.getArgOperand(0);
4073 Info.offset = 0;
4074 Info.flags = MachineMemOperand::MOLoad;
4075 Info.align = Align(16);
4076 return true;
4077 }
4078
4079 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4080 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4081 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4082 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4083 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4084 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4085 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4086 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4087
4088 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4089 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4090 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4091 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4092 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4093 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4094 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4095 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4096 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4097 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4098 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4099 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4100 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4101 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4102 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4103 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4104 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4105 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4106 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4107 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4108 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4109 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
4110 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
4111 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
4112 Info.opc = ISD::INTRINSIC_W_CHAIN;
4113 Info.memVT = MVT::i32;
4114 Info.ptrVal = I.getArgOperand(0);
4115 Info.offset = 0;
4116 Info.flags = MachineMemOperand::MOLoad;
4117 Info.align = Align(4);
4118 return true;
4119 }
4120
4121 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4122 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4123 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4124 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4125 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4126 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4127 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4128 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4129 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4130 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4131 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4132 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4133 Info.opc = ISD::INTRINSIC_W_CHAIN;
4134 Info.memVT = MVT::v4f16;
4135 Info.ptrVal = I.getArgOperand(0);
4136 Info.offset = 0;
4137 Info.flags = MachineMemOperand::MOLoad;
4138 Info.align = Align(16);
4139 return true;
4140 }
4141
4142 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4143 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4144 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4145 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4146 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4147 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4148 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4149 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4150 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4151 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4152 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4153 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4154 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4155 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4156 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4157 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4158 Info.opc = ISD::INTRINSIC_W_CHAIN;
4159 Info.memVT = MVT::v8f32;
4160 Info.ptrVal = I.getArgOperand(0);
4161 Info.offset = 0;
4162 Info.flags = MachineMemOperand::MOLoad;
4163 Info.align = Align(16);
4164 return true;
4165 }
4166
4167 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4168 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4169 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4170 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4171
4172 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4173 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4174 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4175 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4176
4177 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4178 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4179 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4180 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4181 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4182 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4183 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4184 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4185 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4186 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4187 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4188 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4189 Info.opc = ISD::INTRINSIC_W_CHAIN;
4190 Info.memVT = MVT::v8i32;
4191 Info.ptrVal = I.getArgOperand(0);
4192 Info.offset = 0;
4193 Info.flags = MachineMemOperand::MOLoad;
4194 Info.align = Align(16);
4195 return true;
4196 }
4197
4198 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4199 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4200 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4201 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4202 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4203 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4204 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4205 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4206 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4207 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4208 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4209 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4210 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4211 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4212 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4213 Info.opc = ISD::INTRINSIC_W_CHAIN;
4214 Info.memVT = MVT::v2i32;
4215 Info.ptrVal = I.getArgOperand(0);
4216 Info.offset = 0;
4217 Info.flags = MachineMemOperand::MOLoad;
4218 Info.align = Align(8);
4219 return true;
4220 }
4221
4222 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4223 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4224 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4225 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4226
4227 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4228 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4229 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4230 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4231 Info.opc = ISD::INTRINSIC_W_CHAIN;
4232 Info.memVT = MVT::f64;
4233 Info.ptrVal = I.getArgOperand(0);
4234 Info.offset = 0;
4235 Info.flags = MachineMemOperand::MOLoad;
4236 Info.align = Align(8);
4237 return true;
4238 }
4239
4240 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4241 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4242 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4243 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4244 Info.opc = ISD::INTRINSIC_W_CHAIN;
4245 Info.memVT = MVT::v2f64;
4246 Info.ptrVal = I.getArgOperand(0);
4247 Info.offset = 0;
4248 Info.flags = MachineMemOperand::MOLoad;
4249 Info.align = Align(16);
4250 return true;
4251 }
4252
4253 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4254 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4255 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4256 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4257 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4258 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4259 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4260 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4261 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4262 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4263 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4264 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4265 Info.opc = ISD::INTRINSIC_VOID;
4266 Info.memVT = MVT::v4f16;
4267 Info.ptrVal = I.getArgOperand(0);
4268 Info.offset = 0;
4269 Info.flags = MachineMemOperand::MOStore;
4270 Info.align = Align(16);
4271 return true;
4272 }
4273
4274 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4275 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4276 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4277 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4278 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4279 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4280 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4281 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4282 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4283 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4284 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4285 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4286 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4287 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4288 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4289 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4290 Info.opc = ISD::INTRINSIC_VOID;
4291 Info.memVT = MVT::v8f32;
4292 Info.ptrVal = I.getArgOperand(0);
4293 Info.offset = 0;
4294 Info.flags = MachineMemOperand::MOStore;
4295 Info.align = Align(16);
4296 return true;
4297 }
4298
4299 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4300 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4301 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4302 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4303 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4304 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4305 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4306 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4307 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4308 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4309 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4310 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4311 Info.opc = ISD::INTRINSIC_VOID;
4312 Info.memVT = MVT::v8i32;
4313 Info.ptrVal = I.getArgOperand(0);
4314 Info.offset = 0;
4315 Info.flags = MachineMemOperand::MOStore;
4316 Info.align = Align(16);
4317 return true;
4318 }
4319
4320 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4321 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4322 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4323 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4324 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4325 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4326 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4327 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4328 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4329 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4330 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4331 Info.opc = ISD::INTRINSIC_VOID;
4332 Info.memVT = MVT::v2i32;
4333 Info.ptrVal = I.getArgOperand(0);
4334 Info.offset = 0;
4335 Info.flags = MachineMemOperand::MOStore;
4336 Info.align = Align(8);
4337 return true;
4338 }
4339
4340 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4341 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4342 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4343 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4344 Info.opc = ISD::INTRINSIC_VOID;
4345 Info.memVT = MVT::v2f64;
4346 Info.ptrVal = I.getArgOperand(0);
4347 Info.offset = 0;
4348 Info.flags = MachineMemOperand::MOStore;
4349 Info.align = Align(16);
4350 return true;
4351 }
4352
4353 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4354 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4355 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4356 Info.opc = ISD::INTRINSIC_VOID;
4357 Info.memVT = MVT::i32;
4358 Info.ptrVal = I.getArgOperand(0);
4359 Info.offset = 0;
4360 Info.flags = MachineMemOperand::MOStore;
4361 Info.align = Align(4);
4362 return true;
4363 }
4364
4365 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4366 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4367 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4368 Info.opc = ISD::INTRINSIC_VOID;
4369 Info.memVT = MVT::v4i32;
4370 Info.ptrVal = I.getArgOperand(0);
4371 Info.offset = 0;
4372 Info.flags = MachineMemOperand::MOStore;
4373 Info.align = Align(16);
4374 return true;
4375 }
4376
4377 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4378 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4379 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4380 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4381 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4382 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4383 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4384 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4385 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4386 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4387 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4388 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4389 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4390 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4391 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4392 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4393 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4394 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4395 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4396 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4397 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4398 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4399 auto &DL = I.getDataLayout();
4400 Info.opc = ISD::INTRINSIC_W_CHAIN;
4401 Info.memVT = getValueType(DL, I.getType());
4402 Info.ptrVal = I.getArgOperand(0);
4403 Info.offset = 0;
4404 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4405 Info.align.reset();
4406 return true;
4407 }
4408
4409 case Intrinsic::nvvm_prefetch_tensormap: {
4410 auto &DL = I.getDataLayout();
4411 Info.opc = ISD::INTRINSIC_VOID;
4412 Info.memVT = getPointerTy(DL);
4413 Info.ptrVal = I.getArgOperand(0);
4414 Info.offset = 0;
4415 Info.flags =
4417 Info.align.reset();
4418 return true;
4419 }
4420
4421 case Intrinsic::nvvm_ldu_global_i:
4422 case Intrinsic::nvvm_ldu_global_f:
4423 case Intrinsic::nvvm_ldu_global_p: {
4424 Info.opc = ISD::INTRINSIC_W_CHAIN;
4425 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4426 Info.ptrVal = I.getArgOperand(0);
4427 Info.offset = 0;
4428 Info.flags = MachineMemOperand::MOLoad;
4429 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4430
4431 return true;
4432 }
4433 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4434 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4435 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4436 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4437 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4438 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4439 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4440 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4441 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4442 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4443 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4444 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4445 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4446 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4447 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4448 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4449 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4450 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4451 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4452 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4453 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4454 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4455 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4456 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4457 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4458 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4459 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4460 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4461 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4462 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4463 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4464 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4465 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4466 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4467 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4468 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4469 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4470 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4471 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4472 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4473 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4474 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4475 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4476 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4477 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4478 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4479 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4480 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4481 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4482 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4483 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4484 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4485 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4486 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4487 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4488 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4489 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4490 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4491 Info.opc = ISD::INTRINSIC_W_CHAIN;
4492 Info.memVT = MVT::v4f32;
4493 Info.ptrVal = nullptr;
4494 Info.offset = 0;
4495 Info.flags = MachineMemOperand::MOLoad;
4496 Info.align = Align(16);
4497 return true;
4498
4499 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4500 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4501 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4502 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4503 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4504 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4505 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4506 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4507 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4508 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4509 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4510 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4511 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4512 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4513 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4514 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4515 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4516 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4517 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4518 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4519 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4520 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4521 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4522 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4523 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4524 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4525 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4526 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4527 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4528 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4529 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4530 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4531 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4532 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4533 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4534 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4535 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4536 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4537 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4538 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4539 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4540 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4541 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4542 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4543 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4544 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4545 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4546 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4547 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4548 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4549 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4550 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4551 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4552 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4553 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4554 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4555 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4556 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4557 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4558 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4559 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4560 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4561 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4562 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4563 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4564 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4565 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4566 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4567 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4568 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4569 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4570 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4571 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4572 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4573 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4574 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4575 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4576 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4577 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4578 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4579 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4580 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4581 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4582 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4583 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4584 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4585 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4586 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4587 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4588 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4589 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4590 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4591 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4592 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4593 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4594 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4595 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4596 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4597 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4598 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4599 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4600 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4601 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4602 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4603 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4604 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4605 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4606 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4607 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4608 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4609 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4610 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4611 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4612 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4613 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4614 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4615 Info.opc = ISD::INTRINSIC_W_CHAIN;
4616 Info.memVT = MVT::v4i32;
4617 Info.ptrVal = nullptr;
4618 Info.offset = 0;
4619 Info.flags = MachineMemOperand::MOLoad;
4620 Info.align = Align(16);
4621 return true;
4622
4623 case Intrinsic::nvvm_suld_1d_i8_clamp:
4624 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4625 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4626 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4627 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4628 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4629 case Intrinsic::nvvm_suld_2d_i8_clamp:
4630 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4631 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4632 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4633 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4634 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4635 case Intrinsic::nvvm_suld_3d_i8_clamp:
4636 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4637 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4638 case Intrinsic::nvvm_suld_1d_i8_trap:
4639 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4640 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4641 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4642 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4643 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4644 case Intrinsic::nvvm_suld_2d_i8_trap:
4645 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4646 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4647 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4648 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4649 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4650 case Intrinsic::nvvm_suld_3d_i8_trap:
4651 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4652 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4653 case Intrinsic::nvvm_suld_1d_i8_zero:
4654 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4655 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4656 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4657 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4658 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4659 case Intrinsic::nvvm_suld_2d_i8_zero:
4660 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4661 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4662 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4663 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4664 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4665 case Intrinsic::nvvm_suld_3d_i8_zero:
4666 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4667 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4668 Info.opc = ISD::INTRINSIC_W_CHAIN;
4669 Info.memVT = MVT::i8;
4670 Info.ptrVal = nullptr;
4671 Info.offset = 0;
4672 Info.flags = MachineMemOperand::MOLoad;
4673 Info.align = Align(16);
4674 return true;
4675
4676 case Intrinsic::nvvm_suld_1d_i16_clamp:
4677 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4678 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4679 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4680 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4681 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4682 case Intrinsic::nvvm_suld_2d_i16_clamp:
4683 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4684 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4685 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4686 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4687 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4688 case Intrinsic::nvvm_suld_3d_i16_clamp:
4689 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4690 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4691 case Intrinsic::nvvm_suld_1d_i16_trap:
4692 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4693 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4694 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4695 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4696 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4697 case Intrinsic::nvvm_suld_2d_i16_trap:
4698 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4699 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4700 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4701 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4702 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4703 case Intrinsic::nvvm_suld_3d_i16_trap:
4704 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4705 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4706 case Intrinsic::nvvm_suld_1d_i16_zero:
4707 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4708 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4709 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4710 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4711 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4712 case Intrinsic::nvvm_suld_2d_i16_zero:
4713 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4714 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4715 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4716 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4717 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4718 case Intrinsic::nvvm_suld_3d_i16_zero:
4719 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4720 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4721 Info.opc = ISD::INTRINSIC_W_CHAIN;
4722 Info.memVT = MVT::i16;
4723 Info.ptrVal = nullptr;
4724 Info.offset = 0;
4725 Info.flags = MachineMemOperand::MOLoad;
4726 Info.align = Align(16);
4727 return true;
4728
4729 case Intrinsic::nvvm_suld_1d_i32_clamp:
4730 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4731 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4732 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4733 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4734 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4735 case Intrinsic::nvvm_suld_2d_i32_clamp:
4736 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4737 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4738 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4739 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4740 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4741 case Intrinsic::nvvm_suld_3d_i32_clamp:
4742 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4743 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4744 case Intrinsic::nvvm_suld_1d_i32_trap:
4745 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4746 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4747 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4748 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4749 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4750 case Intrinsic::nvvm_suld_2d_i32_trap:
4751 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4752 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4753 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4754 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4755 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4756 case Intrinsic::nvvm_suld_3d_i32_trap:
4757 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4758 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4759 case Intrinsic::nvvm_suld_1d_i32_zero:
4760 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4761 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4762 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4763 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4764 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4765 case Intrinsic::nvvm_suld_2d_i32_zero:
4766 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4767 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4768 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4769 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4770 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4771 case Intrinsic::nvvm_suld_3d_i32_zero:
4772 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4773 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4774 Info.opc = ISD::INTRINSIC_W_CHAIN;
4775 Info.memVT = MVT::i32;
4776 Info.ptrVal = nullptr;
4777 Info.offset = 0;
4778 Info.flags = MachineMemOperand::MOLoad;
4779 Info.align = Align(16);
4780 return true;
4781
4782 case Intrinsic::nvvm_suld_1d_i64_clamp:
4783 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4784 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4785 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4786 case Intrinsic::nvvm_suld_2d_i64_clamp:
4787 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4788 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4789 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4790 case Intrinsic::nvvm_suld_3d_i64_clamp:
4791 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4792 case Intrinsic::nvvm_suld_1d_i64_trap:
4793 case Intrinsic::nvvm_suld_1d_v2i64_trap:
4794 case Intrinsic::nvvm_suld_1d_array_i64_trap:
4795 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4796 case Intrinsic::nvvm_suld_2d_i64_trap:
4797 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4798 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4799 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4800 case Intrinsic::nvvm_suld_3d_i64_trap:
4801 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4802 case Intrinsic::nvvm_suld_1d_i64_zero:
4803 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4804 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4805 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4806 case Intrinsic::nvvm_suld_2d_i64_zero:
4807 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4808 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4809 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4810 case Intrinsic::nvvm_suld_3d_i64_zero:
4811 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4812 Info.opc = ISD::INTRINSIC_W_CHAIN;
4813 Info.memVT = MVT::i64;
4814 Info.ptrVal = nullptr;
4815 Info.offset = 0;
4816 Info.flags = MachineMemOperand::MOLoad;
4817 Info.align = Align(16);
4818 return true;
4819
4820 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
4821 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
4822 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
4823 Info.opc = ISD::INTRINSIC_W_CHAIN;
4824 Info.memVT = MVT::v1i32;
4825 Info.ptrVal = I.getArgOperand(0);
4826 Info.offset = 0;
4827 Info.flags = MachineMemOperand::MOLoad;
4828 Info.align.reset();
4829 return true;
4830 }
4831
4832 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
4833 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
4834 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
4835 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: {
4836 Info.opc = ISD::INTRINSIC_W_CHAIN;
4837 Info.memVT = MVT::v2i32;
4838 Info.ptrVal = I.getArgOperand(0);
4839 Info.offset = 0;
4840 Info.flags = MachineMemOperand::MOLoad;
4841 Info.align.reset();
4842 return true;
4843 }
4844
4845 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
4846 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
4847 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
4848 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
4849 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: {
4850 Info.opc = ISD::INTRINSIC_W_CHAIN;
4851 Info.memVT = MVT::v4i32;
4852 Info.ptrVal = I.getArgOperand(0);
4853 Info.offset = 0;
4854 Info.flags = MachineMemOperand::MOLoad;
4855 Info.align.reset();
4856 return true;
4857 }
4858
4859 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
4860 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
4861 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
4862 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
4863 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: {
4864 Info.opc = ISD::INTRINSIC_W_CHAIN;
4865 Info.memVT = MVT::v8i32;
4866 Info.ptrVal = I.getArgOperand(0);
4867 Info.offset = 0;
4868 Info.flags = MachineMemOperand::MOLoad;
4869 Info.align.reset();
4870 return true;
4871 }
4872
4873 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
4874 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
4875 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
4876 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
4877 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: {
4878 Info.opc = ISD::INTRINSIC_W_CHAIN;
4879 Info.memVT = MVT::v16i32;
4880 Info.ptrVal = I.getArgOperand(0);
4881 Info.offset = 0;
4882 Info.flags = MachineMemOperand::MOLoad;
4883 Info.align.reset();
4884 return true;
4885 }
4886
4887 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
4888 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
4889 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
4890 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
4891 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: {
4892 Info.opc = ISD::INTRINSIC_W_CHAIN;
4893 Info.memVT = MVT::v32i32;
4894 Info.ptrVal = I.getArgOperand(0);
4895 Info.offset = 0;
4896 Info.flags = MachineMemOperand::MOLoad;
4897 Info.align.reset();
4898 return true;
4899 }
4900
4901 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
4902 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
4903 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
4904 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
4905 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: {
4906 Info.opc = ISD::INTRINSIC_W_CHAIN;
4907 Info.memVT = MVT::v64i32;
4908 Info.ptrVal = I.getArgOperand(0);
4909 Info.offset = 0;
4910 Info.flags = MachineMemOperand::MOLoad;
4911 Info.align.reset();
4912 return true;
4913 }
4914
4915 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
4916 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
4917 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
4918 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
4919 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
4920 Info.opc = ISD::INTRINSIC_W_CHAIN;
4921 Info.memVT = MVT::v128i32;
4922 Info.ptrVal = I.getArgOperand(0);
4923 Info.offset = 0;
4924 Info.flags = MachineMemOperand::MOLoad;
4925 Info.align.reset();
4926 return true;
4927 }
4928
4929 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
4930 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
4931 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
4932 Info.opc = ISD::INTRINSIC_VOID;
4933 Info.memVT = MVT::i32;
4934 Info.ptrVal = I.getArgOperand(0);
4935 Info.offset = 0;
4936 Info.flags = MachineMemOperand::MOStore;
4937 Info.align.reset();
4938 return true;
4939 }
4940
4941 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
4942 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
4943 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
4944 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
4945 Info.opc = ISD::INTRINSIC_VOID;
4946 Info.memVT = MVT::v2i32;
4947 Info.ptrVal = I.getArgOperand(0);
4948 Info.offset = 0;
4949 Info.flags = MachineMemOperand::MOStore;
4950 Info.align.reset();
4951 return true;
4952 }
4953
4954 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
4955 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
4956 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
4957 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
4958 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
4959 Info.opc = ISD::INTRINSIC_VOID;
4960 Info.memVT = MVT::v4i32;
4961 Info.ptrVal = I.getArgOperand(0);
4962 Info.offset = 0;
4963 Info.flags = MachineMemOperand::MOStore;
4964 Info.align.reset();
4965 return true;
4966 }
4967
4968 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
4969 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
4970 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
4971 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
4972 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
4973 Info.opc = ISD::INTRINSIC_VOID;
4974 Info.memVT = MVT::v8i32;
4975 Info.ptrVal = I.getArgOperand(0);
4976 Info.offset = 0;
4977 Info.flags = MachineMemOperand::MOStore;
4978 Info.align.reset();
4979 return true;
4980 }
4981
4982 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
4983 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
4984 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
4985 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
4986 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
4987 Info.opc = ISD::INTRINSIC_VOID;
4988 Info.memVT = MVT::v16i32;
4989 Info.ptrVal = I.getArgOperand(0);
4990 Info.offset = 0;
4991 Info.flags = MachineMemOperand::MOStore;
4992 Info.align.reset();
4993 return true;
4994 }
4995
4996 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
4997 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
4998 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
4999 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
5000 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
5001 Info.opc = ISD::INTRINSIC_VOID;
5002 Info.memVT = MVT::v32i32;
5003 Info.ptrVal = I.getArgOperand(0);
5004 Info.offset = 0;
5005 Info.flags = MachineMemOperand::MOStore;
5006 Info.align.reset();
5007 return true;
5008 }
5009
5010 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
5011 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
5012 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
5013 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
5014 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
5015 Info.opc = ISD::INTRINSIC_VOID;
5016 Info.memVT = MVT::v64i32;
5017 Info.ptrVal = I.getArgOperand(0);
5018 Info.offset = 0;
5019 Info.flags = MachineMemOperand::MOStore;
5020 Info.align.reset();
5021 return true;
5022 }
5023
5024 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
5025 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
5026 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
5027 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
5028 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
5029 Info.opc = ISD::INTRINSIC_VOID;
5030 Info.memVT = MVT::v128i32;
5031 Info.ptrVal = I.getArgOperand(0);
5032 Info.offset = 0;
5033 Info.flags = MachineMemOperand::MOStore;
5034 Info.align.reset();
5035 return true;
5036 }
5037 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
5038 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
5039 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
5040 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
5041 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
5042 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
5043 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
5044 case Intrinsic::
5045 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
5046 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
5047 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
5048 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
5049 case Intrinsic::
5050 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
5051 // We are reading and writing back to TMem
5052 Info.opc = ISD::INTRINSIC_VOID;
5053 Info.memVT = MVT::v4i32;
5054 Info.ptrVal = I.getArgOperand(0);
5055 Info.offset = 0;
5056 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
5057 Info.align = Align(16);
5058 return true;
5059 }
5060
5061 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
5062 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
5063 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
5064 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
5065 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
5066 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
5067 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
5068 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
5069 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
5070 case Intrinsic::
5071 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
5072 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
5073 case Intrinsic::
5074 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
5075 // We are reading and writing back to TMem
5076 Info.opc = ISD::INTRINSIC_VOID;
5077 Info.memVT = MVT::v8i32;
5078 Info.ptrVal = I.getArgOperand(0);
5079 Info.offset = 0;
5080 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
5081 Info.align = Align(16);
5082 return true;
5083 }
5084 }
5085 return false;
5086}
5087
5088/// getFunctionParamOptimizedAlign - since function arguments are passed via
5089/// .param space, we may want to increase their alignment in a way that
5090/// ensures that we can effectively vectorize their loads & stores. We can
5091 /// increase alignment only if the function has internal or private
5092 /// linkage, as for other linkage types callers may already rely on the default
5093/// alignment. To allow using 128-bit vectorized loads/stores, this function
5094/// ensures that alignment is 16 or greater.
5095 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5096 const Function *F, Type *ArgTy, const DataLayout &DL) const {
5097 // Capping the alignment to 128 bytes as that is the maximum alignment
5098 // supported by PTX.
5099 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5100
5101 // If a function has linkage different from internal or private, we
5102 // must use default ABI alignment as external users rely on it. Same
5103 // for a function that may be called from a function pointer.
5104 if (!F || !F->hasLocalLinkage() ||
5105 F->hasAddressTaken(/*Users=*/nullptr,
5106 /*IgnoreCallbackUses=*/false,
5107 /*IgnoreAssumeLikeCalls=*/true,
5108 /*IgnoreLLVMUsed=*/true))
5109 return ABITypeAlign;
5110
5111 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5112 return std::max(Align(16), ABITypeAlign);
5113}
5114
5115/// Helper for computing alignment of a device function byval parameter.
5116 Align NVPTXTargetLowering::getFunctionByValParamAlign(
5117 const Function *F, Type *ArgTy, Align InitialAlign,
5118 const DataLayout &DL) const {
5119 Align ArgAlign = InitialAlign;
5120 // Try to increase alignment to enhance vectorization options.
5121 if (F)
5122 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5123
5124 // Old ptxas versions have a bug. When PTX code takes the address of a
5125 // byval parameter with alignment < 4, ptxas generates code to
5126 // spill the argument into memory. Alas, on sm_50+, ptxas generates
5127 // SASS code that fails with misaligned access. To work around
5128 // the problem, make sure that we align byval parameters by at
5129 // least 4. This bug seems to be fixed at least starting from
5130 // ptxas > 9.0.
5131 // TODO: remove this after verifying the bug is not reproduced
5132 // on non-deprecated ptxas versions.
5133 if (ForceMinByValParamAlign)
5134 ArgAlign = std::max(ArgAlign, Align(4));
5135
5136 return ArgAlign;
5137}
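// Illustrative summary: the final byval alignment is the maximum of the
// caller-provided alignment, the vectorization-friendly alignment computed by
// getFunctionParamOptimizedAlign (16 for local, non-address-taken functions),
// and, when the workaround above is active, Align(4).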
5138
5139// Helper for getting a function parameter name. Name is composed from
5140// its index and the function name. Negative index corresponds to special
5141// parameter (unsized array) used for passing variable arguments.
5142 std::string NVPTXTargetLowering::getParamName(const Function *F,
5143 int Idx) const {
5144 std::string ParamName;
5145 raw_string_ostream ParamStr(ParamName);
5146
5147 ParamStr << getTargetMachine().getSymbol(F)->getName();
5148 if (Idx < 0)
5149 ParamStr << "_vararg";
5150 else
5151 ParamStr << "_param_" << Idx;
5152
5153 return ParamName;
5154}
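// Example: for a function whose symbol is "foo", parameter 2 is named
// "foo_param_2", while the variadic parameter (Idx < 0) is named "foo_vararg".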
5155
5156/// isLegalAddressingMode - Return true if the addressing mode represented
5157/// by AM is legal for this target, for a load/store of the specified type.
5158/// Used to guide target specific optimizations, like loop strength reduction
5159/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5160/// (CodeGenPrepare.cpp)
5161 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5162 const AddrMode &AM, Type *Ty,
5163 unsigned AS, Instruction *I) const {
5164 // AddrMode - This represents an addressing mode of:
5165 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5166 //
5167 // The legal address modes are
5168 // - [avar]
5169 // - [areg]
5170 // - [areg+immoff]
5171 // - [immAddr]
5172
5173 // immoff must fit in a signed 32-bit int
5174 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5175 return false;
5176
5177 if (AM.BaseGV)
5178 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5179
5180 switch (AM.Scale) {
5181 case 0: // "r", "r+i" or "i" is allowed
5182 break;
5183 case 1:
5184 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5185 return false;
5186 // Otherwise we have r+i.
5187 break;
5188 default:
5189 // No scale > 1 is allowed
5190 return false;
5191 }
5192 return true;
5193}
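// Examples: [gv], [reg], [reg+imm] (with a 32-bit signed immediate) and [imm]
// are accepted, while [gv+imm], [reg+reg] and any scaled-register form are
// rejected by the checks above.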
5194
5195//===----------------------------------------------------------------------===//
5196// NVPTX Inline Assembly Support
5197//===----------------------------------------------------------------------===//
5198
5199/// getConstraintType - Given a constraint letter, return the type of
5200/// constraint it is for this target.
5201 NVPTXTargetLowering::ConstraintType
5202 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5203 if (Constraint.size() == 1) {
5204 switch (Constraint[0]) {
5205 default:
5206 break;
5207 case 'b':
5208 case 'r':
5209 case 'h':
5210 case 'c':
5211 case 'l':
5212 case 'f':
5213 case 'd':
5214 case 'q':
5215 case '0':
5216 case 'N':
5217 return C_RegisterClass;
5218 }
5219 }
5220 return TargetLowering::getConstraintType(Constraint);
5221}
5222
5223std::pair<unsigned, const TargetRegisterClass *>
5224 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5225 StringRef Constraint,
5226 MVT VT) const {
5227 if (Constraint.size() == 1) {
5228 switch (Constraint[0]) {
5229 case 'b':
5230 return std::make_pair(0U, &NVPTX::B1RegClass);
5231 case 'c':
5232 case 'h':
5233 return std::make_pair(0U, &NVPTX::B16RegClass);
5234 case 'r':
5235 case 'f':
5236 return std::make_pair(0U, &NVPTX::B32RegClass);
5237 case 'l':
5238 case 'N':
5239 case 'd':
5240 return std::make_pair(0U, &NVPTX::B64RegClass);
5241 case 'q': {
5242 if (STI.getSmVersion() < 70)
5243 report_fatal_error("Inline asm with 128 bit operands is only "
5244 "supported for sm_70 and higher!");
5245 return std::make_pair(0U, &NVPTX::B128RegClass);
5246 }
5247 }
5248 }
5249 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5250}
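// Summary of the mapping above: 'b' -> 1-bit predicate, 'c'/'h' -> 16-bit,
// 'r'/'f' -> 32-bit, 'l'/'N'/'d' -> 64-bit, and 'q' -> 128-bit registers
// (sm_70+ only).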
5251
5252//===----------------------------------------------------------------------===//
5253// NVPTX DAG Combining
5254//===----------------------------------------------------------------------===//
5255
5256 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5257 CodeGenOptLevel OptLevel) const {
5258 // Always honor command-line argument
5259 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5260 return FMAContractLevelOpt > 0;
5261
5262 // Do not contract if we're not optimizing the code.
5263 if (OptLevel == CodeGenOptLevel::None)
5264 return false;
5265
5266 // Honor TargetOptions flags that explicitly say fusion is okay.
5267 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5268 return true;
5269
5270 return false;
5271}
5272
5273static bool isConstZero(const SDValue &Operand) {
5274 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5275 return Const && Const->getZExtValue() == 0;
5276}
5277
5278/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5279/// operands N0 and N1. This is a helper for PerformADDCombine that is
5280/// called with the default operands, and if that fails, with commuted
5281/// operands.
5282static SDValue
5283 PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5284 TargetLowering::DAGCombinerInfo &DCI) {
5285 EVT VT = N0.getValueType();
5286
5287 // Since integer multiply-add costs the same as integer multiply
5288 // but is more costly than integer add, do the fusion only when
5289 // the mul is only used in the add.
5290 // TODO: this may not be true for later architectures, consider relaxing this
5291 if (!N0.getNode()->hasOneUse())
5292 return SDValue();
5293
5294 // fold (add (select cond, 0, (mul a, b)), c)
5295 // -> (select cond, c, (add (mul a, b), c))
5296 //
5297 if (N0.getOpcode() == ISD::SELECT) {
5298 unsigned ZeroOpNum;
5299 if (isConstZero(N0->getOperand(1)))
5300 ZeroOpNum = 1;
5301 else if (isConstZero(N0->getOperand(2)))
5302 ZeroOpNum = 2;
5303 else
5304 return SDValue();
5305
5306 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5307 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5308 return SDValue();
5309
5310 SDLoc DL(N);
5311 SDValue Mul =
5312 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5313 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
5314 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5315 ((ZeroOpNum == 1) ? N1 : MAD),
5316 ((ZeroOpNum == 1) ? MAD : N1));
5317 }
5318
5319 return SDValue();
5320}
5321
5322static SDValue
5323 PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5324 TargetLowering::DAGCombinerInfo &DCI,
5325 CodeGenOptLevel OptLevel) {
5326 EVT VT = N0.getValueType();
5327 if (N0.getOpcode() == ISD::FMUL) {
5328 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5329 &DCI.DAG.getTargetLoweringInfo());
5330 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5331 (N->getFlags().hasAllowContract() &&
5332 N0->getFlags().hasAllowContract())))
5333 return SDValue();
5334
5335 // For floating point:
5336 // Do the fusion only when the mul has less than 5 uses and all
5337 // are add.
5338 // The heuristic is that if a use is not an add, then that use
5339 // cannot be fused into fma, therefore mul is still needed anyway.
5340 // If there are more than 4 uses, even if they are all add, fusing
5341 // them will increase register pressure.
5342 //
5343 int numUses = 0;
5344 int nonAddCount = 0;
5345 for (const SDNode *User : N0.getNode()->users()) {
5346 numUses++;
5347 if (User->getOpcode() != ISD::FADD)
5348 ++nonAddCount;
5349 if (numUses >= 5)
5350 return SDValue();
5351 }
5352 if (nonAddCount) {
5353 int orderNo = N->getIROrder();
5354 int orderNo2 = N0.getNode()->getIROrder();
5355 // Simple heuristic here for estimating potential register
5356 // pressure: the difference in IR order approximates the distance
5357 // between def and use, and the longer that distance,
5358 // the more likely it is to cause register pressure.
5359 if (orderNo - orderNo2 < 500)
5360 return SDValue();
5361
5362 // Now, check if at least one of the FMUL's operands is live beyond the
5363 // node N, which guarantees that the FMA will not increase register
5364 // pressure at node N.
5365 bool opIsLive = false;
5366 const SDNode *left = N0.getOperand(0).getNode();
5367 const SDNode *right = N0.getOperand(1).getNode();
5368
5369 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5370 opIsLive = true;
5371
5372 if (!opIsLive)
5373 for (const SDNode *User : left->users()) {
5374 int orderNo3 = User->getIROrder();
5375 if (orderNo3 > orderNo) {
5376 opIsLive = true;
5377 break;
5378 }
5379 }
5380
5381 if (!opIsLive)
5382 for (const SDNode *User : right->users()) {
5383 int orderNo3 = User->getIROrder();
5384 if (orderNo3 > orderNo) {
5385 opIsLive = true;
5386 break;
5387 }
5388 }
5389
5390 if (!opIsLive)
5391 return SDValue();
5392 }
5393
5394 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5395 N0.getOperand(1), N1);
5396 }
5397
5398 return SDValue();
5399}
5400
5401/// Fold unpacking movs into a load by increasing the number of return values.
5402///
5403/// ex:
5404/// L: v2f16,ch = load <p>
5405/// a: f16 = extractelt L:0, 0
5406/// b: f16 = extractelt L:0, 1
5407/// use(a, b)
5408///
5409/// ...is turned into...
5410///
5411/// L: f16,f16,ch = LoadV2 <p>
5412/// use(L:0, L:1)
5413static SDValue
5414 combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
5415 // Don't run this optimization before the legalizer
5416 if (!DCI.isAfterLegalizeDAG())
5417 return SDValue();
5418
5419 EVT ElementVT = N->getValueType(0);
5420 // Avoid non-packed types and v4i8
5421 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5422 return SDValue();
5423
5424 SmallVector<SDNode *> DeadCopyToRegs;
5425
5426 // Check whether all outputs are either used by an extractelt or are
5427 // glue/chain nodes
5428 if (!all_of(N->uses(), [&](SDUse &U) {
5429 // Skip glue, chain nodes
5430 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5431 return true;
5432 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5433 if (N->getOpcode() != ISD::LOAD)
5434 return true;
5435 // Since this is an ISD::LOAD, check all extractelts are used. If
5436 // any are not used, we don't want to defeat another optimization that
5437 // will narrow the load.
5438 //
5439 // For example:
5440 //
5441 // L: v2f16,ch = load <p>
5442 // e0: f16 = extractelt L:0, 0
5443 // e1: f16 = extractelt L:0, 1 <-- unused
5444 // store e0
5445 //
5446 // Can be optimized by DAGCombiner to:
5447 //
5448 // L: f16,ch = load <p>
5449 // store L:0
5450 return !U.getUser()->use_empty();
5451 }
5452
5453 // Otherwise, this use prevents us from splitting a value.
5454 return false;
5455 }))
5456 return SDValue();
5457
5458 auto *LD = cast<MemSDNode>(N);
5459 SDLoc DL(LD);
5460
5461 // the new opcode after we double the number of operands
5462 NVPTXISD::NodeType Opcode;
5463 SmallVector<SDValue> Operands(LD->ops());
5464 unsigned OldNumOutputs; // non-glue, non-chain outputs
5465 switch (LD->getOpcode()) {
5466 case ISD::LOAD:
5467 OldNumOutputs = 1;
5468 // Any packed type is legal, so the legalizer will not have lowered
5469 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5470 // here.
5471 Opcode = NVPTXISD::LoadV2;
5472 Operands.push_back(DCI.DAG.getIntPtrConstant(
5473 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5474 break;
5475 case NVPTXISD::LoadV2:
5476 OldNumOutputs = 2;
5477 Opcode = NVPTXISD::LoadV4;
5478 break;
5479 case NVPTXISD::LoadV4:
5480 // V8 is only supported for f32. Don't forget, we're not changing the load
5481 // size here. This is already a 256-bit load.
5482 if (ElementVT != MVT::v2f32)
5483 return SDValue();
5484 OldNumOutputs = 4;
5485 Opcode = NVPTXISD::LoadV8;
5486 break;
5487 case NVPTXISD::LoadV8:
5488 // PTX doesn't support the next doubling of outputs
5489 return SDValue();
5490 }
5491
5492 // the non-glue, non-chain outputs in the new load
5493 const unsigned NewNumOutputs = OldNumOutputs * 2;
5494 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5495 // add remaining chain and glue values
5496 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5497
5498 // Create the new load
5499 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5500 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5501 LD->getMemOperand());
5502
5503 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5504 // the outputs the same. These nodes will be optimized away in later
5505 // DAGCombiner iterations.
5506 SmallVector<SDValue> Results;
5507 for (unsigned I : seq(OldNumOutputs))
5508 Results.push_back(DCI.DAG.getBuildVector(
5509 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5510 // Add remaining chain and glue nodes
5511 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5512 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5513
5514 return DCI.DAG.getMergeValues(Results, DL);
5515}
5516
5517/// Fold packing movs into a store.
5518///
5519/// ex:
5520/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5521/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5522/// StoreV2 v1, v2
5523///
5524/// ...is turned into...
5525///
5526/// StoreV4 a, b, c, d
5527 static SDValue combinePackingMovIntoStore(SDNode *N,
5528 TargetLowering::DAGCombinerInfo &DCI,
5529 unsigned Front, unsigned Back) {
5530 // We want to run this as late as possible since other optimizations may
5531 // eliminate the BUILD_VECTORs.
5532 if (!DCI.isAfterLegalizeDAG())
5533 return SDValue();
5534
5535 // Get the type of the operands being stored.
5536 EVT ElementVT = N->getOperand(Front).getValueType();
5537
5538 // Avoid non-packed types and v4i8
5539 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5540 return SDValue();
5541
5542 auto *ST = cast<MemSDNode>(N);
5543
5544 // The new opcode after we double the number of operands.
5545 NVPTXISD::NodeType Opcode;
5546 switch (N->getOpcode()) {
5547 case ISD::STORE:
5548 // Any packed type is legal, so the legalizer will not have lowered
5549 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5550 // it here.
5551 Opcode = NVPTXISD::StoreV2;
5552 break;
5553 case NVPTXISD::StoreV2:
5554 Opcode = NVPTXISD::StoreV4;
5555 break;
5556 case NVPTXISD::StoreV4:
5557 // V8 is only supported for f32. Don't forget, we're not changing the store
5558 // size here. This is already a 256-bit store.
5559 if (ElementVT != MVT::v2f32)
5560 return SDValue();
5561 Opcode = NVPTXISD::StoreV8;
5562 break;
5563 case NVPTXISD::StoreV8:
5564 // PTX doesn't support the next doubling of operands
5565 return SDValue();
5566 default:
5567 llvm_unreachable("Unhandled store opcode");
5568 }
5569
5570 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5571 // their elements.
5572 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5573 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5574 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5575 return SDValue();
5576
5577 // If the operand has multiple uses, this optimization can increase register
5578 // pressure.
5579 if (!BV.hasOneUse())
5580 return SDValue();
5581
5582 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
5583 // any signs they may be folded by some other pattern or rule.
5584 for (SDValue Op : BV->ops()) {
5585 // Peek through bitcasts
5586 if (Op.getOpcode() == ISD::BITCAST)
5587 Op = Op.getOperand(0);
5588
5589 // This may be folded into a PRMT.
5590 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
5591 Op->getOperand(0).getValueType() == MVT::i32)
5592 return SDValue();
5593
5594 // This may be folded into cvt.bf16x2
5595 if (Op.getOpcode() == ISD::FP_ROUND)
5596 return SDValue();
5597 }
5598 Operands.append({BV.getOperand(0), BV.getOperand(1)});
5599 }
5600 Operands.append(N->op_end() - Back, N->op_end());
5601
5602 // Now we replace the store
5603 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
5604 ST->getMemoryVT(), ST->getMemOperand());
5605}
5606
5607 static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5608 const NVPTXSubtarget &STI) {
5609
5610 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
5611 // Here is our chance to custom lower a store with a non-simple type.
5612 // Unfortunately, we can't do this in the legalizer because there is no
5613 // way to setOperationAction for a non-simple type.
5614 auto *ST = cast<StoreSDNode>(N);
5615 if (!ST->getValue().getValueType().isSimple())
5616 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
5617 }
5618
5619 return combinePackingMovIntoStore(N, DCI, 1, 2);
5620}
5621
5622 static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5623 const NVPTXSubtarget &STI) {
5624 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
5625 // Here is our chance to custom lower a load with a non-simple type.
5626 // Unfortunately, we can't do this in the legalizer because there is no
5627 // way to setOperationAction for a non-simple type.
5628 if (!N->getValueType(0).isSimple())
5629 return lowerLoadVector(N, DCI.DAG, STI);
5630 }
5631
5632 return combineUnpackingMovIntoLoad(N, DCI);
5633}
5634
5635/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5636///
5637 static SDValue PerformADDCombine(SDNode *N,
5638 TargetLowering::DAGCombinerInfo &DCI,
5639 CodeGenOptLevel OptLevel) {
5640 if (OptLevel == CodeGenOptLevel::None)
5641 return SDValue();
5642
5643 SDValue N0 = N->getOperand(0);
5644 SDValue N1 = N->getOperand(1);
5645
5646 // Skip non-integer, non-scalar case
5647 EVT VT = N0.getValueType();
5648 if (VT.isVector() || VT != MVT::i32)
5649 return SDValue();
5650
5651 // First try with the default operand order.
5652 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
5653 return Result;
5654
5655 // If that didn't work, try again with the operands commuted.
5656 return PerformADDCombineWithOperands(N, N1, N0, DCI);
5657}
5658
5659/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5660///
5661 static SDValue PerformFADDCombine(SDNode *N,
5662 TargetLowering::DAGCombinerInfo &DCI,
5663 CodeGenOptLevel OptLevel) {
5664 SDValue N0 = N->getOperand(0);
5665 SDValue N1 = N->getOperand(1);
5666
5667 EVT VT = N0.getValueType();
5668 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
5669 return SDValue();
5670
5671 // First try with the default operand order.
5672 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
5673 return Result;
5674
5675 // If that didn't work, try again with the operands commuted.
5676 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
5677}
5678
5679/// Get 3-input version of a 2-input min/max opcode
5680static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode) {
5681 switch (MinMax2Opcode) {
5682 case ISD::FMAXNUM:
5683 case ISD::FMAXIMUMNUM:
5684 return NVPTXISD::FMAXNUM3;
5685 case ISD::FMINNUM:
5686 case ISD::FMINIMUMNUM:
5687 return NVPTXISD::FMINNUM3;
5688 case ISD::FMAXIMUM:
5689 return NVPTXISD::FMAXIMUM3;
5690 case ISD::FMINIMUM:
5691 return NVPTXISD::FMINIMUM3;
5692 default:
5693 llvm_unreachable("Invalid 2-input min/max opcode");
5694 }
5695}
5696
5697/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
5698/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
5699 static SDValue PerformFMinMaxCombine(SDNode *N,
5700 TargetLowering::DAGCombinerInfo &DCI,
5701 unsigned PTXVersion, unsigned SmVersion) {
5702
5703 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
5704 EVT VT = N->getValueType(0);
5705 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
5706 return SDValue();
5707
5708 SDValue Op0 = N->getOperand(0);
5709 SDValue Op1 = N->getOperand(1);
5710 unsigned MinMaxOp2 = N->getOpcode();
5711 NVPTXISD::NodeType MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
5712
5713 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
5714 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
5715 SDValue A = Op0.getOperand(0);
5716 SDValue B = Op0.getOperand(1);
5717 SDValue C = Op1;
5718 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5719 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
5720 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
5721 SDValue A = Op0;
5722 SDValue B = Op1.getOperand(0);
5723 SDValue C = Op1.getOperand(1);
5724 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
5725 }
5726 return SDValue();
5727}
5728
5729 static SDValue PerformREMCombine(SDNode *N,
5730 TargetLowering::DAGCombinerInfo &DCI,
5731 CodeGenOptLevel OptLevel) {
5732 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5733
5734 // Don't do anything at less than -O2.
5735 if (OptLevel < CodeGenOptLevel::Default)
5736 return SDValue();
5737
5738 SelectionDAG &DAG = DCI.DAG;
5739 SDLoc DL(N);
5740 EVT VT = N->getValueType(0);
5741 bool IsSigned = N->getOpcode() == ISD::SREM;
5742 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5743
5744 const SDValue &Num = N->getOperand(0);
5745 const SDValue &Den = N->getOperand(1);
5746
5747 for (const SDNode *U : Num->users()) {
5748 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
5749 U->getOperand(1) == Den) {
5750 // Num % Den -> Num - (Num / Den) * Den
5751 return DAG.getNode(ISD::SUB, DL, VT, Num,
5752 DAG.getNode(ISD::MUL, DL, VT,
5753 DAG.getNode(DivOpc, DL, VT, Num, Den),
5754 Den));
5755 }
5756 }
5757 return SDValue();
5758}
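// For example, when a function computes both x / y and x % y, the remainder is
// rewritten as x - (x / y) * y so that the expensive division is emitted once
// and shared between the two results.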
5759
5760// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
5761 static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
5762 CodeGenOptLevel OptLevel) {
5763 if (OptLevel == CodeGenOptLevel::None)
5764 return SDValue();
5765
5766 SDValue Op = N->getOperand(0);
5767 if (!Op.hasOneUse())
5768 return SDValue();
5769 EVT ToVT = N->getValueType(0);
5770 EVT FromVT = Op.getValueType();
5771 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
5772 (ToVT == MVT::i64 && FromVT == MVT::i32)))
5773 return SDValue();
5774 if (!(Op.getOpcode() == ISD::MUL ||
5775 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
5776 return SDValue();
5777
5778 SDLoc DL(N);
5779 unsigned ExtOpcode = N->getOpcode();
5780 unsigned Opcode = 0;
5781 if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
5782 Opcode = NVPTXISD::MUL_WIDE_SIGNED;
5783 else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
5784 Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
5785 else
5786 return SDValue();
5787 SDValue RHS = Op.getOperand(1);
5788 if (Op.getOpcode() == ISD::SHL) {
5789 const auto ShiftAmt = Op.getConstantOperandVal(1);
5790 const auto MulVal = APInt(ToVT.getSizeInBits(), 1) << ShiftAmt;
5791 RHS = DCI.DAG.getConstant(MulVal, DL, ToVT);
5792 }
5793 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
5794}
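// Example of the transform above: (i64 (zext nuw (mul i32 %a, %b))) becomes a
// single widening multiply (mul.wide.u32), and a left shift by a constant is
// treated as a multiply by the corresponding power of two before widening.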
5795
5796 enum OperandSignedness {
5797 Signed = 0,
5798 Unsigned,
5799 Unknown
5800 };
5801 
5802/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5803/// that can be demoted to \p OptSize bits without loss of information. The
5804/// signedness of the operand, if determinable, is placed in \p S.
5805 static bool IsMulWideOperandDemotable(SDValue Op,
5806 unsigned OptSize,
5807 OperandSignedness &S) {
5808 S = Unknown;
5809
5810 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5811 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5812 EVT OrigVT = Op.getOperand(0).getValueType();
5813 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5814 S = Signed;
5815 return true;
5816 }
5817 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5818 EVT OrigVT = Op.getOperand(0).getValueType();
5819 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5820 S = Unsigned;
5821 return true;
5822 }
5823 }
5824
5825 return false;
5826}
5827
5828/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5829/// be demoted to \p OptSize bits without loss of information. If the operands
5830/// contain a constant, it should appear as the RHS operand. The signedness of
5831/// the operands is placed in \p IsSigned.
5832 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
5833 unsigned OptSize,
5834 bool &IsSigned) {
5835 OperandSignedness LHSSign;
5836
5837 // The LHS operand must be a demotable op
5838 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
5839 return false;
5840
5841 // We should have been able to determine the signedness from the LHS
5842 if (LHSSign == Unknown)
5843 return false;
5844
5845 IsSigned = (LHSSign == Signed);
5846
5847 // The RHS can be a demotable op or a constant
5848 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
5849 const APInt &Val = CI->getAPIntValue();
5850 if (LHSSign == Unsigned) {
5851 return Val.isIntN(OptSize);
5852 } else {
5853 return Val.isSignedIntN(OptSize);
5854 }
5855 } else {
5856 OperandSignedness RHSSign;
5857 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
5858 return false;
5859
5860 return LHSSign == RHSSign;
5861 }
5862}
5863
5864/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5865/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5866/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
5867/// amount.
5868 static SDValue TryMULWIDECombine(SDNode *N,
5869 TargetLowering::DAGCombinerInfo &DCI) {
5870 EVT MulType = N->getValueType(0);
5871 if (MulType != MVT::i32 && MulType != MVT::i64) {
5872 return SDValue();
5873 }
5874
5875 SDLoc DL(N);
5876 unsigned OptSize = MulType.getSizeInBits() >> 1;
5877 SDValue LHS = N->getOperand(0);
5878 SDValue RHS = N->getOperand(1);
5879
5880 // Canonicalize the multiply so the constant (if any) is on the right
5881 if (N->getOpcode() == ISD::MUL) {
5882 if (isa<ConstantSDNode>(LHS)) {
5883 std::swap(LHS, RHS);
5884 }
5885 }
5886
5887 // If we have a SHL, determine the actual multiply amount
5888 if (N->getOpcode() == ISD::SHL) {
5889 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
5890 if (!ShlRHS) {
5891 return SDValue();
5892 }
5893
5894 APInt ShiftAmt = ShlRHS->getAPIntValue();
5895 unsigned BitWidth = MulType.getSizeInBits();
5896 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
5897 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
5898 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
5899 } else {
5900 return SDValue();
5901 }
5902 }
5903
5904 bool Signed;
5905 // Verify that our operands are demotable
5906 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
5907 return SDValue();
5908 }
5909
5910 EVT DemotedVT;
5911 if (MulType == MVT::i32) {
5912 DemotedVT = MVT::i16;
5913 } else {
5914 DemotedVT = MVT::i32;
5915 }
5916
5917 // Truncate the operands to the correct size. Note that these are just for
5918 // type consistency and will (likely) be eliminated in later phases.
5919 SDValue TruncLHS =
5920 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
5921 SDValue TruncRHS =
5922 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
5923
5924 unsigned Opc;
5925 if (Signed) {
5926 Opc = NVPTXISD::MUL_WIDE_SIGNED;
5927 } else {
5928 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
5929 }
5930
5931 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
5932}
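// Example: a 64-bit multiply whose operands are sign-extended from i32 (or a
// 32-bit multiply fed by i16 extensions) is demoted to the narrower type and
// selected as a widening multiply (mul.wide.s32 / mul.wide.s16), still
// producing the full-width result.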
5933
5934static bool isConstOne(const SDValue &Operand) {
5935 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5936 return Const && Const->getZExtValue() == 1;
5937}
5938
5939 static SDValue matchMADConstOnePattern(SDValue Add) {
5940 if (Add->getOpcode() != ISD::ADD)
5941 return SDValue();
5942
5943 if (isConstOne(Add->getOperand(0)))
5944 return Add->getOperand(1);
5945
5946 if (isConstOne(Add->getOperand(1)))
5947 return Add->getOperand(0);
5948
5949 return SDValue();
5950}
5951
5952 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
5953 TargetLowering::DAGCombinerInfo &DCI) {
5954 
5955 if (SDValue Y = matchMADConstOnePattern(Add)) {
5956 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5957 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
5958 }
5959
5960 return SDValue();
5961}
5962
5963 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
5964 SDLoc DL,
5965 TargetLowering::DAGCombinerInfo &DCI) {
5966 if (Select->getOpcode() != ISD::SELECT)
5967 return SDValue();
5968
5969 SDValue Cond = Select->getOperand(0);
5970
5971 unsigned ConstOpNo;
5972 if (isConstOne(Select->getOperand(1)))
5973 ConstOpNo = 1;
5974 else if (isConstOne(Select->getOperand(2)))
5975 ConstOpNo = 2;
5976 else
5977 return SDValue();
5978
5979 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
5980
5981 // Do not combine if the resulting sequence is not obviously profitable.
5982 if (!matchMADConstOnePattern(Y))
5983 return SDValue();
5984
5985 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
5986
5987 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
5988 (ConstOpNo == 1) ? X : NewMul,
5989 (ConstOpNo == 1) ? NewMul : X);
5990}
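// Example: (mul x, (select cond, 1, y)) becomes (select cond, x, (mul x, y)),
// so the multiply is only performed on the path where it is actually needed.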
5991
5992static SDValue
5993 PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
5994 TargetLowering::DAGCombinerInfo &DCI) {
5995 
5996 EVT VT = N0.getValueType();
5997 if (VT.isVector())
5998 return SDValue();
5999
6000 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
6001 return SDValue();
6002
6003 SDLoc DL(N);
6004
6005 // (mul x, (add y, 1)) -> (add (mul x, y), x)
6006 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
6007 return Res;
6008 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
6009 return Res;
6010
6011 // (mul x, (select cond, 1, y)) -> (select cond, x, (mul x, y))
6012 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
6013 return Res;
6014 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
6015 return Res;
6016
6017 return SDValue();
6018}
6019
6020/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
6023 CodeGenOptLevel OptLevel) {
6024 if (OptLevel == CodeGenOptLevel::None)
6025 return SDValue();
6026
6027 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6028 return Ret;
6029
6030 SDValue N0 = N->getOperand(0);
6031 SDValue N1 = N->getOperand(1);
6032 return PerformMULCombineWithOperands(N, N0, N1, DCI);
6033}
6034
6035/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
6038 CodeGenOptLevel OptLevel) {
6039 if (OptLevel > CodeGenOptLevel::None) {
6040 // Try mul.wide combining at OptLevel > 0
6041 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6042 return Ret;
6043 }
6044
6045 return SDValue();
6046}
6047
6048 static SDValue PerformSETCCCombine(SDNode *N,
6049 TargetLowering::DAGCombinerInfo &DCI,
6050 unsigned int SmVersion) {
6051 EVT CCType = N->getValueType(0);
6052 SDValue A = N->getOperand(0);
6053 SDValue B = N->getOperand(1);
6054
6055 EVT AType = A.getValueType();
6056 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
6057 return SDValue();
6058
6059 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
6060 return SDValue();
6061
6062 SDLoc DL(N);
6063 // setp.f16x2 returns two scalar predicates, which we need to
6064 // convert back to v2i1. The returned result will be scalarized by
6065 // the legalizer, but the comparison will remain a single vector
6066 // instruction.
6067 SDValue CCNode = DCI.DAG.getNode(
6068 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
6070 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
6071 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
6072 CCNode.getValue(1));
6073}
6074
6075 static SDValue PerformEXTRACTCombine(SDNode *N,
6076 TargetLowering::DAGCombinerInfo &DCI) {
6077 SDValue Vector = N->getOperand(0);
6078 if (Vector->getOpcode() == ISD::FREEZE)
6079 Vector = Vector->getOperand(0);
6080 SDLoc DL(N);
6081 EVT VectorVT = Vector.getValueType();
6082 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
6083 IsPTXVectorType(VectorVT.getSimpleVT()))
6084 return SDValue(); // Native vector loads already combine nicely w/
6085 // extract_vector_elt.
6086 // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
6087 // since we already handle them OK.
6088 if (VectorVT.getVectorNumElements() == 1 ||
6089 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
6090 return SDValue();
6091
6092 // Don't mess with undef values as sra may be simplified to 0, not undef.
6093 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
6094 return SDValue();
6095
6096 uint64_t VectorBits = VectorVT.getSizeInBits();
6097 // We only handle the types we can extract in-register.
6098 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
6099 return SDValue();
6100
6101 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
6102 // Index == 0 is handled by generic DAG combiner.
6103 if (!Index || Index->getZExtValue() == 0)
6104 return SDValue();
6105
6106 MVT IVT = MVT::getIntegerVT(VectorBits);
6107 EVT EltVT = VectorVT.getVectorElementType();
6108 EVT EltIVT = EltVT.changeTypeToInteger();
6109 uint64_t EltBits = EltVT.getScalarSizeInBits();
6110
6111 SDValue Result = DCI.DAG.getNode(
6112 ISD::TRUNCATE, DL, EltIVT,
6113 DCI.DAG.getNode(
6114 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
6115 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
6116
6117 // If element has non-integer type, bitcast it back to the expected type.
6118 if (EltVT != EltIVT)
6119 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
6120 // Past the legalizer, we may need to extend i8 -> i16 to match the register type.
6121 if (EltVT != N->getValueType(0))
6122 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
6123
6124 return Result;
6125}
6126
6127 static SDValue PerformVSELECTCombine(SDNode *N,
6128 TargetLowering::DAGCombinerInfo &DCI) {
6129 SDValue VA = N->getOperand(1);
6130 EVT VectorVT = VA.getValueType();
6131 if (VectorVT != MVT::v4i8)
6132 return SDValue();
6133
6134 // We need to split the vselect into individual per-element operations. Because we
6135 // use BFE/BFI instructions for byte extraction/insertion, we end up with
6136 // 32-bit values, so we may as well do the comparison as i32 to avoid conversions
6137 // to/from i16 normally used for i8 values.
6138 SmallVector<SDValue, 4> E;
6139 SDLoc DL(N);
6140 SDValue VCond = N->getOperand(0);
6141 SDValue VB = N->getOperand(2);
6142 for (int I = 0; I < 4; ++I) {
6143 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6144 DCI.DAG.getConstant(I, DL, MVT::i32));
6145 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6146 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6147 DCI.DAG.getConstant(I, DL, MVT::i32)),
6148 DL, MVT::i32);
6149 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6150 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6151 DCI.DAG.getConstant(I, DL, MVT::i32)),
6152 DL, MVT::i32);
6153 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6154 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6155 }
6156 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6157}
6158
6159static SDValue
6160 PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
6161 auto VT = N->getValueType(0);
6162 if (!DCI.isAfterLegalizeDAG() ||
6163 // only process v2*16 types
6164 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6165 VT.getVectorNumElements() == 2))
6166 return SDValue();
6167
6168 auto Op0 = N->getOperand(0);
6169 auto Op1 = N->getOperand(1);
6170
6171 // Start out by assuming we want to take the lower 2 bytes of each i32
6172 // operand.
6173 uint64_t Op0Bytes = 0x10;
6174 uint64_t Op1Bytes = 0x54;
6175
6176 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6177 {&Op1, &Op1Bytes}};
6178
6179 // Check that each operand is an i16, truncated from an i32 operand. We'll
6180 // select individual bytes from those original operands. Optionally, fold in a
6181 // shift right of that original operand.
6182 for (auto &[Op, OpBytes] : OpData) {
6183 // Eat up any bitcast
6184 if (Op->getOpcode() == ISD::BITCAST)
6185 *Op = Op->getOperand(0);
6186
6187 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6188 Op->getOperand(0).getValueType() == MVT::i32))
6189 return SDValue();
6190
6191 // If the truncate has multiple uses, this optimization can increase
6192 // register pressure
6193 if (!Op->hasOneUse())
6194 return SDValue();
6195
6196 *Op = Op->getOperand(0);
6197
6198 // Optionally, fold in a shift-right of the original operand and let permute
6199 // pick the two higher bytes of the original value directly.
6200 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6201 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6202 // Shift the PRMT byte selector to pick upper bytes from each respective
6203 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6204 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6205 "PRMT selector values out of range");
6206 *OpBytes += 0x22;
6207 *Op = Op->getOperand(0);
6208 }
6209 }
6210 }
6211
6212 SDLoc DL(N);
6213 auto &DAG = DCI.DAG;
6214
6215 auto PRMT =
6216 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6217 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6218 return DAG.getBitcast(VT, PRMT);
6219}
6220
6221 static SDValue combineADDRSPACECAST(SDNode *N,
6222 TargetLowering::DAGCombinerInfo &DCI) {
6223 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6224
6225 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
6226 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6227
6228 // Fold asc[B -> A](asc[A -> B](x)) -> x
6229 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6230 return ASCN2->getOperand(0);
6231 }
6232
6233 return SDValue();
6234}
6235
6236// Given a constant selector value and a prmt mode, return the selector value
6237// normalized to the generic prmt mode. See the PTX ISA documentation for more
6238// details:
6239// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
6240static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6241 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6242
6243 if (Mode == NVPTX::PTXPrmtMode::NONE)
6244 return Selector;
6245
6246 const unsigned V = Selector.trunc(2).getZExtValue();
6247
6248 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6249 unsigned S3) {
6250 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6251 };
6252
6253 switch (Mode) {
6254 case NVPTX::PTXPrmtMode::F4E:
6255 return GetSelector(V, V + 1, V + 2, V + 3);
6256 case NVPTX::PTXPrmtMode::B4E:
6257 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6258 case NVPTX::PTXPrmtMode::RC8:
6259 return GetSelector(V, V, V, V);
6260 case NVPTX::PTXPrmtMode::ECL:
6261 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6262 case NVPTX::PTXPrmtMode::ECR:
6263 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6264 case NVPTX::PTXPrmtMode::RC16: {
6265 unsigned V1 = (V & 1) << 1;
6266 return GetSelector(V1, V1 + 1, V1, V1 + 1);
6267 }
6268 default:
6269 llvm_unreachable("Invalid PRMT mode");
6270 }
6271}
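// Worked example: in replicate-8 (RC8) mode with Selector = 2, V is 2 and the
// generic selector becomes 0x2222, i.e. every result byte selects source byte 2.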
6272
6273static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6274 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6275 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6276 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6277 APInt BitField = B.concat(A);
6278 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6279 APInt Result(32, 0);
6280 for (unsigned I : llvm::seq(4U)) {
6281 APInt Sel = SelectorVal.extractBits(4, I * 4);
6282 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6283 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6284 APInt Byte = BitField.extractBits(8, Idx * 8);
6285 if (Sign)
6286 Byte = Byte.ashr(8);
6287 Result.insertBits(Byte, I * 8);
6288 }
6289 return Result;
6290}
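// Worked example (generic mode): with A = 0x03020100, B = 0x07060504 and
// Selector = 0x7531, the selected byte indices are 1, 3, 5 and 7, so the
// constant-folded result is 0x07050301.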
6291
6292 static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
6293 CodeGenOptLevel OptLevel) {
6294 if (OptLevel == CodeGenOptLevel::None)
6295 return SDValue();
6296
6297 // Constant fold PRMT
6298 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6299 isa<ConstantSDNode>(N->getOperand(1)) &&
6300 isa<ConstantSDNode>(N->getOperand(2)))
6301 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6302 N->getConstantOperandAPInt(1),
6303 N->getConstantOperandAPInt(2),
6304 N->getConstantOperandVal(3)),
6305 SDLoc(N), N->getValueType(0));
6306 return SDValue();
6307}
6308
6309// During call lowering we wrap the return values in a ProxyReg node which
6310// depend on the chain value produced by the completed call. This ensures that
6311// the full call is emitted in cases where libcalls are used to legalize
6312// operations. To improve the functioning of other DAG combines we pull all
6313// operations we can through one of these nodes, ensuring that the ProxyReg
6314// directly wraps a load. That is:
6315//
6316// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6317//
6318 static SDValue sinkProxyReg(SDValue R, SDValue Chain,
6319 TargetLowering::DAGCombinerInfo &DCI) {
6320 switch (R.getOpcode()) {
6321 case ISD::TRUNCATE:
6322 case ISD::ANY_EXTEND:
6323 case ISD::SIGN_EXTEND:
6324 case ISD::ZERO_EXTEND:
6325 case ISD::BITCAST: {
6326 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6327 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6328 return SDValue();
6329 }
6330 case ISD::SHL:
6331 case ISD::SRL:
6332 case ISD::SRA:
6333 case ISD::OR: {
6334 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6335 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6336 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6337 return SDValue();
6338 }
6339 case ISD::Constant:
6340 return R;
6341 case ISD::LOAD:
6342 case NVPTXISD::LoadV2:
6343 case NVPTXISD::LoadV4: {
6344 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6345 {Chain, R});
6346 }
6347 case ISD::BUILD_VECTOR: {
6348 if (DCI.isBeforeLegalize())
6349 return SDValue();
6350
6351 SmallVector<SDValue> Ops;
6352 for (auto &Op : R->ops()) {
6353 SDValue V = sinkProxyReg(Op, Chain, DCI);
6354 if (!V)
6355 return SDValue();
6356 Ops.push_back(V);
6357 }
6358 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
6359 }
6360 case ISD::EXTRACT_VECTOR_ELT: {
6361 if (DCI.isBeforeLegalize())
6362 return SDValue();
6363
6364 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6365 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
6366 R.getValueType(), V, R.getOperand(1));
6367 return SDValue();
6368 }
6369 default:
6370 return SDValue();
6371 }
6372}
6373
6374 static SDValue combineProxyReg(SDNode *N,
6375 TargetLowering::DAGCombinerInfo &DCI) {
6376 
6377 SDValue Chain = N->getOperand(0);
6378 SDValue Reg = N->getOperand(1);
6379
6380 // If the ProxyReg is not wrapping a load, try to pull the operations through
6381 // the ProxyReg.
6382 if (Reg.getOpcode() != ISD::LOAD) {
6383 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
6384 return V;
6385 }
6386
6387 return SDValue();
6388}
6389
6390SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
6391 DAGCombinerInfo &DCI) const {
6392 CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
6393 switch (N->getOpcode()) {
6394 default:
6395 break;
6396 case ISD::ADD:
6397 return PerformADDCombine(N, DCI, OptLevel);
6398 case ISD::ADDRSPACECAST:
6399 return combineADDRSPACECAST(N, DCI);
6400 case ISD::SIGN_EXTEND:
6401 case ISD::ZERO_EXTEND:
6402 return combineMulWide(N, DCI, OptLevel);
6403 case ISD::BUILD_VECTOR:
6404 return PerformBUILD_VECTORCombine(N, DCI);
6405 case ISD::EXTRACT_VECTOR_ELT:
6406 return PerformEXTRACTCombine(N, DCI);
6407 case ISD::FADD:
6408 return PerformFADDCombine(N, DCI, OptLevel);
6409 case ISD::FMAXNUM:
6410 case ISD::FMINNUM:
6411 case ISD::FMAXIMUM:
6412 case ISD::FMINIMUM:
6413 case ISD::FMAXIMUMNUM:
6414 case ISD::FMINIMUMNUM:
6415 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
6416 STI.getSmVersion());
6417 case ISD::LOAD:
6418 case NVPTXISD::LoadV2:
6419 case NVPTXISD::LoadV4:
6420 return combineLOAD(N, DCI, STI);
6421 case ISD::MUL:
6422 return PerformMULCombine(N, DCI, OptLevel);
6423 case NVPTXISD::PRMT:
6424 return combinePRMT(N, DCI, OptLevel);
6425 case NVPTXISD::ProxyReg:
6426 return combineProxyReg(N, DCI);
6427 case ISD::SETCC:
6428 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
6429 case ISD::SHL:
6430 return PerformSHLCombine(N, DCI, OptLevel);
6431 case ISD::SREM:
6432 case ISD::UREM:
6433 return PerformREMCombine(N, DCI, OptLevel);
6434 case ISD::STORE:
6435 case NVPTXISD::StoreV2:
6436 case NVPTXISD::StoreV4:
6437 return combineSTORE(N, DCI, STI);
6438 case ISD::VSELECT:
6439 return PerformVSELECTCombine(N, DCI);
6440 }
6441 return SDValue();
6442}
6443
6444static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
6445                           SmallVectorImpl<SDValue> &Results) {
6446  // Handle bitcasting to v2i8 without hitting the default promotion
6447 // strategy which goes through stack memory.
6448 SDValue Op(Node, 0);
6449 EVT ToVT = Op->getValueType(0);
6450 if (ToVT != MVT::v2i8) {
6451 return;
6452 }
6453
6454 // Bitcast to i16 and unpack elements into a vector
6455 SDLoc DL(Node);
6456 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
6457 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
6458 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
6459 SDValue Vec1 =
6460 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6461 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
6462 Results.push_back(
6463 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
6464}
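// For example, bitcasting an i16 holding 0xAB01 to v2i8 yields element 0 =
// 0x01 (the truncated low byte) and element 1 = 0xAB (the byte shifted down
// from bit 8), entirely in registers and without the stack round-trip of the
// default promotion strategy.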
6465
6466static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
6467                                     SmallVectorImpl<SDValue> &Results) {
6468  SDValue Chain = N->getOperand(0);
6469 SDValue Intrin = N->getOperand(1);
6470 SDLoc DL(N);
6471
6472 // Get the intrinsic ID
6473 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
6474 switch (IntrinNo) {
6475 default:
6476 return;
6477 case Intrinsic::nvvm_ldu_global_i:
6478 case Intrinsic::nvvm_ldu_global_f:
6479 case Intrinsic::nvvm_ldu_global_p: {
6480 EVT ResVT = N->getValueType(0);
6481
6482 if (ResVT.isVector()) {
6483 // Vector LDG/LDU
6484
6485 unsigned NumElts = ResVT.getVectorNumElements();
6486 EVT EltVT = ResVT.getVectorElementType();
6487
6488 // Since LDU/LDG are target nodes, we cannot rely on DAG type
6489 // legalization.
6490 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
6491 // loaded type to i16 and propagate the "real" type as the memory type.
6492 bool NeedTrunc = false;
6493 if (EltVT.getSizeInBits() < 16) {
6494 EltVT = MVT::i16;
6495 NeedTrunc = true;
6496 }
6497
6498 unsigned Opcode = 0;
6499 SDVTList LdResVTs;
6500
6501 switch (NumElts) {
6502 default:
6503 return;
6504 case 2:
6505 Opcode = NVPTXISD::LDUV2;
6506 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
6507 break;
6508 case 4: {
6509 Opcode = NVPTXISD::LDUV4;
6510 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
6511 LdResVTs = DAG.getVTList(ListVTs);
6512 break;
6513 }
6514 }
6515
6516 SmallVector<SDValue, 8> OtherOps;
6517
6518 // Copy regular operands
6519
6520 OtherOps.push_back(Chain); // Chain
6521 // Skip operand 1 (intrinsic ID)
6522 // Others
6523 OtherOps.append(N->op_begin() + 2, N->op_end());
6524
6525      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6526
6527 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
6528 MemSD->getMemoryVT(),
6529 MemSD->getMemOperand());
6530
6531 SmallVector<SDValue, 4> ScalarRes;
6532
6533 for (unsigned i = 0; i < NumElts; ++i) {
6534 SDValue Res = NewLD.getValue(i);
6535 if (NeedTrunc)
6536 Res =
6537 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
6538 ScalarRes.push_back(Res);
6539 }
6540
6541 SDValue LoadChain = NewLD.getValue(NumElts);
6542
6543 SDValue BuildVec =
6544 DAG.getBuildVector(ResVT, DL, ScalarRes);
6545
6546 Results.push_back(BuildVec);
6547 Results.push_back(LoadChain);
6548 } else {
6549 // i8 LDG/LDU
6550 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
6551 "Custom handling of non-i8 ldu/ldg?");
6552
6553 // Just copy all operands as-is
6555
6556 // Force output to i16
6557 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
6558
6559      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
6560
6561 // We make sure the memory type is i8, which will be used during isel
6562 // to select the proper instruction.
6563 SDValue NewLD =
6565 MVT::i8, MemSD->getMemOperand());
6566
6567 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
6568 NewLD.getValue(0)));
6569 Results.push_back(NewLD.getValue(1));
6570 }
6571 return;
6572 }
6573
6574 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
6575 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
6576 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
6577 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
6578 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
6579 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
6580 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
6581 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
6582 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
6583 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
6584 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
6585 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
6586 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
6587 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
6588 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
6589 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
6590 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
6591 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
6592 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
6593 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
6594 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
6595 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
6596 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
6597 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
6598 if (auto Res = lowerTcgen05Ld(N, DAG)) {
6599 Results.push_back(Res->first);
6600 Results.push_back(Res->second);
6601 }
6602 return;
6603
6604 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
6605 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
6606 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
6607 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
6608 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
6609 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
6610 if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
6611 Results.push_back(Res->first);
6612 Results.push_back(Res->second);
6613 }
6614 return;
6615 }
6616}
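// As an illustration of the vector path above: an ldu.global intrinsic
// returning <4 x i8> is re-emitted as an LDUV4 memory-intrinsic node with
// four i16 results plus a chain (i8 is not a legal result type here), each
// result is truncated back to i8, and the scalars are reassembled into a
// <4 x i8> BUILD_VECTOR, while the memory type and memory operand are carried
// over unchanged from the original intrinsic node.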
6617
6618static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
6619                                   SmallVectorImpl<SDValue> &Results) {
6620  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
6621  // result so that it can pass legalization
6622 SDLoc DL(N);
6623 SDValue Chain = N->getOperand(0);
6624 SDValue Reg = N->getOperand(1);
6625 SDValue Glue = N->getOperand(2);
6626
6627 assert(Reg.getValueType() == MVT::i128 &&
6628 "Custom lowering for CopyFromReg with 128-bit reg only");
6629 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
6630 N->getValueType(2)};
6631 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
6632
6633 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
6634 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
6635 {NewValue.getValue(0), NewValue.getValue(1)});
6636
6637 Results.push_back(Pair);
6638 Results.push_back(NewValue.getValue(2));
6639 Results.push_back(NewValue.getValue(3));
6640}
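// In other words, a CopyFromReg that would produce (i128, chain, glue) is
// rebuilt to produce (i64, i64, chain, glue), and the two i64 halves are
// recombined with BUILD_PAIR so existing users still see a single i128 while
// register legalization only ever deals with 64-bit pieces.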
6641
6642static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
6643                            const TargetLowering &TLI,
6644                            SmallVectorImpl<SDValue> &Results) {
6645 SDValue Chain = N->getOperand(0);
6646 SDValue Reg = N->getOperand(1);
6647
6648 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
6649
6650 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
6651 SDValue NewProxy =
6652 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
6653 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
6654
6655 Results.push_back(Res);
6656}
6657
6658static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG,
6659                                 const NVPTXSubtarget &STI,
6660                                 SmallVectorImpl<SDValue> &Results) {
6661 assert(N->getValueType(0) == MVT::i128 &&
6662 "Custom lowering for atomic128 only supports i128");
6663
6664  AtomicSDNode *AN = cast<AtomicSDNode>(N);
6665  SDLoc dl(N);
6666
6667 if (!STI.hasAtomSwap128()) {
6670 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
6671 "requires target sm_90.",
6672 dl.getDebugLoc()));
6673
6674 Results.push_back(DAG.getUNDEF(MVT::i128));
6675 Results.push_back(AN->getOperand(0)); // Chain
6676 return;
6677 }
6678
6680 Ops.push_back(AN->getOperand(0)); // Chain
6681 Ops.push_back(AN->getOperand(1)); // Ptr
6682 for (const auto &Op : AN->ops().drop_front(2)) {
6683 // Low part
6684 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6685 DAG.getIntPtrConstant(0, dl)));
6686 // High part
6687 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
6688 DAG.getIntPtrConstant(1, dl)));
6689 }
6690 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
6693 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
6694 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
6695 AN->getMemOperand());
6696 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
6697 {Result.getValue(0), Result.getValue(1)}));
6698 Results.push_back(Result.getValue(2));
6699}
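// For instance, a 128-bit atomic swap or cmpxchg reaches this point with
// i128 value operands; each is split into its low and high i64 halves via
// EXTRACT_ELEMENT, the target memory-intrinsic node produces two i64 results
// plus a chain, and BUILD_PAIR reassembles them into the i128 expected by the
// rest of the DAG.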
6700
6701void NVPTXTargetLowering::ReplaceNodeResults(
6702    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
6703  switch (N->getOpcode()) {
6704 default:
6705 report_fatal_error("Unhandled custom legalization");
6706 case ISD::BITCAST:
6707 ReplaceBITCAST(N, DAG, Results);
6708 return;
6709 case ISD::LOAD:
6710 replaceLoadVector(N, DAG, Results, STI);
6711 return;
6712  case ISD::INTRINSIC_W_CHAIN:
6713    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
6714    return;
6715 case ISD::CopyFromReg:
6716    ReplaceCopyFromReg_128(N, DAG, Results);
6717    return;
6718 case NVPTXISD::ProxyReg:
6719 replaceProxyReg(N, DAG, *this, Results);
6720 return;
6721 case ISD::ATOMIC_CMP_SWAP:
6722 case ISD::ATOMIC_SWAP:
6723 replaceAtomicSwap128(N, DAG, STI, Results);
6724 return;
6725 }
6726}
6727
6728NVPTXTargetLowering::AtomicExpansionKind
6729NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
6730  Type *Ty = AI->getValOperand()->getType();
6731
6732  if (AI->isFloatingPointOperation()) {
6733    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
6734      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
6735          STI.getPTXVersion() >= 63)
6736        return AtomicExpansionKind::None;
6737      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
6738          STI.getPTXVersion() >= 78)
6739        return AtomicExpansionKind::None;
6740      if (Ty->isFloatTy())
6741        return AtomicExpansionKind::None;
6742      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
6743        return AtomicExpansionKind::None;
6744    }
6745    return AtomicExpansionKind::CmpXChg;
6746  }
6747
6748 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
6749 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
6750
6751 switch (AI->getOperation()) {
6752 default:
6755 if (BitWidth == 128)
6757 [[fallthrough]];
6761 switch (BitWidth) {
6762 case 8:
6763 case 16:
6765 case 32:
6767 case 64:
6768 if (STI.hasAtomBitwise64())
6771 case 128:
6773 default:
6774 llvm_unreachable("unsupported width encountered");
6775 }
6782 switch (BitWidth) {
6783 case 8:
6784 case 16:
6786 case 32:
6788 case 64:
6789 if (STI.hasAtomMinMax64())
6792 case 128:
6794 default:
6795 llvm_unreachable("unsupported width encountered");
6796 }
6799 switch (BitWidth) {
6800 case 32:
6802 case 8:
6803 case 16:
6804 case 64:
6805 case 128:
6807 default:
6808 llvm_unreachable("unsupported width encountered");
6809 }
6810 }
6811
6813}
6814
6815bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
6816    const Instruction *I) const {
6817 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6818 // When CAS bitwidth is not supported on the hardware, the CAS is emulated
6819 // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
6820 // the memory order using explicit fences around the retry loop.
6821 // The memory order of natively supported CAS operations can be enforced
6822 // by lowering to an atom.cas with the right memory synchronizing effect.
6823 // However, atom.cas only supports relaxed, acquire, release and acq_rel.
6824 // So we also use explicit fences for enforcing memory order for
6825  // seq_cst CAS with natively-supported bitwidths.
6826 return CI &&
6827 (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
6828 STI.getMinCmpXchgSizeInBits() ||
6829 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
6830}
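// For example (assuming the minimum natively supported cmpxchg width is at
// most 32 bits): a seq_cst i32 cmpxchg still gets explicit fences because of
// its ordering, a monotonic i32 cmpxchg gets none, and an i8 cmpxchg always
// gets them because it is emulated through a wider CAS retry loop.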
6831
6832AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
6833    const Instruction *I) const {
6834 auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
6835 bool BitwidthSupportedAndIsSeqCst =
6836 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
6837 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
6838 STI.getMinCmpXchgSizeInBits();
6839 return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
6841}
6842
6843Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
6844                                                   Instruction *Inst,
6845 AtomicOrdering Ord) const {
6846 if (!isa<AtomicCmpXchgInst>(Inst))
6847 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
6848
6849 // Specialize for cmpxchg
6850 // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
6851 SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
6852 if (isReleaseOrStronger(Ord))
6853 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
6854 ? Ord
6855                                   : AtomicOrdering::Release,
6856                               SSID);
6857
6858 return nullptr;
6859}
6860
6861Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
6862                                                    Instruction *Inst,
6863 AtomicOrdering Ord) const {
6864 // Specialize for cmpxchg
6865 if (!isa<AtomicCmpXchgInst>(Inst))
6866 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
6867
6868 auto *CI = cast<AtomicCmpXchgInst>(Inst);
6869 auto CASWidth =
6870 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
6871 SyncScope::ID SSID = CI->getSyncScopeID();
6872 // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
6873 if (isAcquireOrStronger(Ord) &&
6875 CASWidth < STI.getMinCmpXchgSizeInBits()))
6876 return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
6877
6878 return nullptr;
6879}
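// Net effect for a seq_cst cmpxchg: emitLeadingFence above places a seq_cst
// fence before it, and if its width is below the natively supported minimum
// an acquire fence is also placed after it, since the emulation retry loop
// only performs monotonic CAS operations.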
6880
6881// Rather than default to SINT when both UINT and SINT are custom, we only
6882// change the opcode when UINT is not legal and SINT is. UINT is preferred when
6883// both are custom since unsigned CVT instructions can lead to slightly better
6884// SASS code with fewer instructions.
6885unsigned NVPTXTargetLowering::getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
6886                                                        EVT ToVT) const {
6887 if (isOperationLegal(Op, ToVT))
6888 return Op;
6889 switch (Op) {
6890 case ISD::FP_TO_UINT:
6891    if (isOperationLegal(ISD::FP_TO_SINT, ToVT))
6892      return ISD::FP_TO_SINT;
6893 break;
6894  case ISD::STRICT_FP_TO_UINT:
6895    if (isOperationLegal(ISD::STRICT_FP_TO_SINT, ToVT))
6896      return ISD::STRICT_FP_TO_SINT;
6897    break;
6898 case ISD::VP_FP_TO_UINT:
6899 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
6900 return ISD::VP_FP_TO_SINT;
6901 break;
6902 default:
6903 break;
6904 }
6905 return Op;
6906}
6907
6908// Pin NVPTXTargetObjectFile's vtables to this file.
6910
6915
6916static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
6917                                    const SelectionDAG &DAG, unsigned Depth) {
6918 SDValue A = Op.getOperand(0);
6919 SDValue B = Op.getOperand(1);
6920 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6921 unsigned Mode = Op.getConstantOperandVal(3);
6922
6923 if (!Selector)
6924 return;
6925
6926 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
6927 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
6928
6929 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6930 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
6931 "PRMT must have i32 operands");
6932 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
6933 KnownBits BitField = BKnown.concat(AKnown);
6934
6935 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
6936 for (unsigned I : llvm::seq(4)) {
6937 APInt Sel = SelectorVal.extractBits(4, I * 4);
6938 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6939 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6940 KnownBits Byte = BitField.extractBits(8, Idx * 8);
6941 if (Sign)
6942 Byte = KnownBits::ashr(Byte, 8);
6943 Known.insertBits(Byte, I * 8);
6944 }
6945}
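// PRMT selector example, using the byte numbering from the comment above:
// with A = {a3,a2,a1,a0} and B = {b7,b6,b5,b4}, selector 0x5410 picks bytes
// a0, a1, b4, b5 (lowest nibble first), i.e. it packs the low halves of A and
// B into one i32, while selector 0x3210 simply reproduces A. A set high bit
// in a selector nibble instead replicates that byte's sign bit across the
// whole result byte.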
6946
6947static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
6948  auto *LD = cast<MemSDNode>(Op);
6949
6950 // We can't do anything without knowing the sign bit.
6951 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
6952 if (ExtType == ISD::SEXTLOAD)
6953 return;
6954
6955 // ExtLoading to vector types is weird and may not work well with known bits.
6956 auto DestVT = LD->getValueType(0);
6957 if (DestVT.isVector())
6958 return;
6959
6960 assert(Known.getBitWidth() == DestVT.getSizeInBits());
6961 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
6962 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
6963}
6964
6965void NVPTXTargetLowering::computeKnownBitsForTargetNode(
6966    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
6967 const SelectionDAG &DAG, unsigned Depth) const {
6968 Known.resetAll();
6969
6970 switch (Op.getOpcode()) {
6971 case NVPTXISD::PRMT:
6972 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
6973 break;
6974 case NVPTXISD::LoadV2:
6975 case NVPTXISD::LoadV4:
6976 case NVPTXISD::LoadV8:
6977    computeKnownBitsForLoadV(Op, Known);
6978    break;
6979 default:
6980 break;
6981 }
6982}
6983
6984static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
6985 const APInt &DemandedBits) {
6986 APInt DemandedLHS = APInt(32, 0);
6987 APInt DemandedRHS = APInt(32, 0);
6988
6989 for (unsigned I : llvm::seq(4)) {
6990 if (DemandedBits.extractBits(8, I * 8).isZero())
6991 continue;
6992
6993 APInt Sel = SelectorVal.extractBits(4, I * 4);
6994 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6995 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6996
6997 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
6998 unsigned ByteStart = (Idx % 4) * 8;
6999 if (Sign)
7000 Src.setBit(ByteStart + 7);
7001 else
7002 Src.setBits(ByteStart, ByteStart + 8);
7003 }
7004
7005 return {DemandedLHS, DemandedRHS};
7006}
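// For instance, if only the low result byte is demanded and its selector
// nibble is 0x6 (byte 2 of the high operand, no sign replication), only bits
// [16, 24) of DemandedRHS get set; if that nibble had its sign bit set, only
// bit 23 (the sign bit of that source byte) would be demanded instead.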
7007
7008// Replace undef with 0 as this is easier for other optimizations such as
7009// known bits.
7010static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG) {
7011  if (!Op)
7012 return SDValue();
7013 if (Op.isUndef())
7014 return DAG.getConstant(0, SDLoc(), MVT::i32);
7015 return Op;
7016}
7017
7018static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
7019                                           const APInt &DemandedBits,
7020 SelectionDAG &DAG,
7021 const TargetLowering &TLI,
7022 unsigned Depth) {
7023 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
7024 SDValue Op0 = PRMT.getOperand(0);
7025 SDValue Op1 = PRMT.getOperand(1);
7026 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
7027 if (!SelectorConst)
7028 return SDValue();
7029
7030 unsigned Mode = PRMT.getConstantOperandVal(3);
7031 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
7032
7033 // Try to simplify the PRMT to one of the inputs if the used bytes are all
7034 // from the same input in the correct order.
7035 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
7036 const unsigned SelBits = (4 - LeadingBytes) * 4;
7037 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
7038 return Op0;
7039 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
7040 return Op1;
7041
7042 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
7043
7044 // Attempt to avoid multi-use ops if we don't need anything from them.
7045 SDValue DemandedOp0 =
7046 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
7047 SDValue DemandedOp1 =
7048 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
7049
7050 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
7051 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
7052 if ((DemandedOp0 && DemandedOp0 != Op0) ||
7053 (DemandedOp1 && DemandedOp1 != Op1)) {
7054 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
7055 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
7056 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
7057 }
7058
7059 return SDValue();
7060}
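// Example of the identity check above: if only the low 16 bits of the PRMT
// are demanded, LeadingBytes is 2 and only the low 8 selector bits matter, so
// a selector whose low byte is 0x10 makes the node equivalent to Op0 and it
// is returned directly; a low byte of 0x54 folds to Op1 in the same way.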
7061
7062bool NVPTXTargetLowering::SimplifyDemandedBitsForTargetNode(
7063    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
7064 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
7065 Known.resetAll();
7066
7067 switch (Op.getOpcode()) {
7068 case NVPTXISD::PRMT:
7069    if (SDValue Result = simplifyDemandedBitsForPRMT(Op, DemandedBits, TLO.DAG,
7070                                                     *this, Depth)) {
7071 TLO.CombineTo(Op, Result);
7072 return true;
7073 }
7074 break;
7075 default:
7076 break;
7077 }
7078
7079 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
7080 return false;
7081}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define MAKE_CASE(V)
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static NVPTXISD::NodeType getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG)
static std::optional< NVPTXISD::NodeType > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:436
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1131
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:433
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1238
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & back() const
back - Get the last element.
Definition ArrayRef.h:152
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:202
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:138
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:187
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
This class represents a function call, abstracting a target machine's calling convention.
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:521
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
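For instance (an illustrative type pair only):
// Truncating stores of f32 values into f16 memory are not handled natively
// in this sketch, so legalization expands them.
setTruncStoreAction(MVT::f32, MVT::f16, Expand);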
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
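A sketch of marking several condition codes as unsupported for one type; the particular codes and type are illustrative:
// Unordered f32 comparisons get expanded into supported ones in this example.
setCondCodeAction({ISD::SETUGT, ISD::SETULT, ISD::SETUEQ}, MVT::f32, Expand);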
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
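A sketch of registering interest in a few generic nodes (chosen arbitrarily here) so the DAG combiner offers them to the target:
// Generic ADD, FADD and EXTRACT_VECTOR_ELT nodes will be forwarded to the
// target's PerformDAGCombine() override.
setTargetDAGCombine({ISD::ADD, ISD::FADD, ISD::EXTRACT_VECTOR_ELT});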
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
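An illustrative sketch (the types are chosen for the example):
// Sign- and zero-extending loads from i1 memory into i32 registers are
// promoted to a wider memory type in this sketch.
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i1, Promote);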
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3155
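A small self-contained usage sketch (the function name is made up for the example):
#include "llvm/ADT/APInt.h"
// 3^4 in a 32-bit APInt; the result is 81 (computed modulo 2^32).
llvm::APInt threeToTheFourth() {
  return llvm::APIntOps::pow(llvm::APInt(32, 3), 4);
}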
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ SSUBO
Same as [SU]ADDO, but for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:379
@ SMULO
Same as [SU]ADDO, but for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CALL
This node represents a PTX call instruction.
@ TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ UNPACK_VECTOR
This node is the inverse of NVPTX::BUILD_VECTOR.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y
@ TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT
@ DeclareScalarParam
These nodes represent a parameter declaration.
@ CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ BUILD_VECTOR
This node is similar to ISD::BUILD_VECTOR except that the output may be implicitly bitcast to a scala...
@ TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT
@ TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1
@ TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:252
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
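A brief sketch of lockstep iteration with this helper (function and parameter names are invented):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
// Dot product of two ranges; iteration stops at the shorter one.
int dotProduct(llvm::ArrayRef<int> Values, llvm::ArrayRef<int> Weights) {
  int Sum = 0;
  for (auto [V, W] : llvm::zip(Values, Weights))
    Sum += V * W;
  return Sum;
}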
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
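A hedged sketch of typical usage (function and variable names invented); the helper splits an aggregate IR type into the EVTs and byte offsets of its scalar leaves:
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/Support/TypeSize.h"
void splitIRType(const llvm::TargetLowering &TLI, const llvm::DataLayout &DL,
                 llvm::Type *Ty) {
  llvm::SmallVector<llvm::EVT, 16> ValueVTs;
  llvm::SmallVector<llvm::TypeSize, 16> Offsets;
  llvm::ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, &Offsets);
  // ValueVTs[i] is the i-th scalar/vector piece; Offsets[i] is its byte offset.
}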
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
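A small sketch of index-plus-value iteration (function name invented):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
// Index of the first zero element, or -1 if there is none.
int firstZeroIndex(llvm::ArrayRef<int> Xs) {
  for (auto [Idx, X] : llvm::enumerate(Xs))
    if (X == 0)
      return static_cast<int>(Idx);
  return -1;
}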
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
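A few concrete values, for illustration:
#include "llvm/Support/MathExtras.h"
#include <cassert>
void powerOf2CeilExamples() {
  assert(llvm::PowerOf2Ceil(1) == 1);   // already a power of two
  assert(llvm::PowerOf2Ceil(5) == 8);   // rounded up
  assert(llvm::PowerOf2Ceil(64) == 64); // unchanged
}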
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1968
unsigned promoteScalarArgumentSize(unsigned size)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
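A small worked example (values chosen arbitrarily):
#include "llvm/Support/Alignment.h"
#include <cassert>
void alignToExamples() {
  assert(llvm::alignTo(10, llvm::Align(8)) == 16); // rounded up to a multiple of 8
  assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already a multiple of 8
}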
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
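A short illustration: given a base alignment and a byte offset, the result is the alignment guaranteed at base + offset.
#include "llvm/Support/Alignment.h"
#include <cassert>
void commonAlignmentExamples() {
  assert(llvm::commonAlignment(llvm::Align(16), 8) == 8);   // offset 8 from a 16-byte base
  assert(llvm::commonAlignment(llvm::Align(16), 32) == 16); // offset preserves full alignment
}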
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
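A tiny sketch (function name invented):
#include "llvm/ADT/Sequence.h"
// Sums 0 + 1 + 2 + 3; the end bound is exclusive.
int sumBelowFour() {
  int Sum = 0;
  for (int I : llvm::seq(0, 4))
    Sum += I;
  return Sum;
}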
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:233
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:219
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...