LLVM 23.0.0git
NVPTXISelLowering.cpp
Go to the documentation of this file.
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
19#include "NVPTXSubtarget.h"
20#include "NVPTXTargetMachine.h"
22#include "NVPTXUtilities.h"
23#include "llvm/ADT/APFloat.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/STLExtras.h"
27#include "llvm/ADT/StringRef.h"
40#include "llvm/IR/Argument.h"
41#include "llvm/IR/Attributes.h"
42#include "llvm/IR/Constants.h"
43#include "llvm/IR/DataLayout.h"
46#include "llvm/IR/FPEnv.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalValue.h"
49#include "llvm/IR/IRBuilder.h"
50#include "llvm/IR/Instruction.h"
52#include "llvm/IR/IntrinsicsNVPTX.h"
53#include "llvm/IR/Module.h"
54#include "llvm/IR/Type.h"
55#include "llvm/IR/Value.h"
67#include <algorithm>
68#include <cassert>
69#include <cmath>
70#include <cstdint>
71#include <iterator>
72#include <optional>
73#include <string>
74#include <tuple>
75#include <utility>
76#include <vector>
77
78#define DEBUG_TYPE "nvptx-lower"
79
80using namespace llvm;
81
83 "nvptx-sched4reg",
84 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
85
87 "nvptx-fma-level", cl::Hidden,
88 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
89 " 1: do it 2: do it aggressively"),
90 cl::init(2));
91
93 "nvptx-prec-divf32", cl::Hidden,
95 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
97 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
98 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
100 "Use IEEE Compliant F32 div.rnd if available (default)"),
102 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
104
106 "nvptx-prec-sqrtf32", cl::Hidden,
107 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
108 cl::init(true));
109
110/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
111/// does NOT use lg2.approx for log2, so this is disabled by default.
113 "nvptx-approx-log2f32",
114 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
115 cl::init(false));
116
118 "nvptx-force-min-byval-param-align", cl::Hidden,
119 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
120 " params of device functions."),
121 cl::init(false));
122
125 const SDNode &N) const {
126 // If nvptx-prec-divf32=N is used on the command-line, always honor it
127 if (UsePrecDivF32.getNumOccurrences() > 0)
128 return UsePrecDivF32;
129
130 const SDNodeFlags Flags = N.getFlags();
131 if (Flags.hasApproximateFuncs())
133
135}
136
138 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
139 if (UsePrecSqrtF32.getNumOccurrences() > 0)
140 return UsePrecSqrtF32;
141
142 if (N) {
143 const SDNodeFlags Flags = N->getFlags();
144 if (Flags.hasApproximateFuncs())
145 return false;
146 }
147
148 return true;
149}
150
155
156static bool IsPTXVectorType(MVT VT) {
157 switch (VT.SimpleTy) {
158 default:
159 return false;
160 case MVT::v2i1:
161 case MVT::v4i1:
162 case MVT::v2i8:
163 case MVT::v4i8:
164 case MVT::v8i8: // <2 x i8x4>
165 case MVT::v16i8: // <4 x i8x4>
166 case MVT::v2i16:
167 case MVT::v4i16:
168 case MVT::v8i16: // <4 x i16x2>
169 case MVT::v2i32:
170 case MVT::v4i32:
171 case MVT::v2i64:
172 case MVT::v2f16:
173 case MVT::v4f16:
174 case MVT::v8f16: // <4 x f16x2>
175 case MVT::v2bf16:
176 case MVT::v4bf16:
177 case MVT::v8bf16: // <4 x bf16x2>
178 case MVT::v2f32:
179 case MVT::v4f32:
180 case MVT::v2f64:
181 case MVT::v4i64:
182 case MVT::v4f64:
183 case MVT::v8i32:
184 case MVT::v8f32:
185 case MVT::v16f16: // <8 x f16x2>
186 case MVT::v16bf16: // <8 x bf16x2>
187 case MVT::v16i16: // <8 x i16x2>
188 case MVT::v32i8: // <8 x i8x4>
189 return true;
190 }
191}
192
193// When legalizing vector loads/stores, this function is called, which does two
194// things:
195// 1. Determines whether the vector is something we want to custom lower,
196// std::nullopt is returned if we do not want to custom lower it.
197// 2. If we do want to handle it, returns two parameters:
198// - unsigned int NumElts - The number of elements in the final vector
199// - EVT EltVT - The type of the elements in the final vector
200static std::optional<std::pair<unsigned int, MVT>>
202 unsigned AddressSpace) {
203 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
204
205 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
206 VectorEVT.getSizeInBits() == 256)
207 return {{4, MVT::i64}};
208
209 if (!VectorEVT.isSimple())
210 return std::nullopt;
211 const MVT VectorVT = VectorEVT.getSimpleVT();
212
213 if (!VectorVT.isVector()) {
214 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
215 return {{2, MVT::i64}};
216 return std::nullopt;
217 }
218
219 const MVT EltVT = VectorVT.getVectorElementType();
220 const unsigned NumElts = VectorVT.getVectorNumElements();
221
222 // The size of the PTX virtual register that holds a packed type.
223 unsigned PackRegSize;
224
225 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
226 // legal. We can (and should) split that into 2 stores of <2 x double> here
227 // but I'm leaving that as a TODO for now.
228 switch (VectorVT.SimpleTy) {
229 default:
230 return std::nullopt;
231
232 case MVT::v4i64:
233 case MVT::v4f64:
234 // This is a "native" vector type iff the address space is global and the
235 // target supports 256-bit loads/stores
236 if (!CanLowerTo256Bit)
237 return std::nullopt;
238 [[fallthrough]];
239 case MVT::v2i8:
240 case MVT::v2i64:
241 case MVT::v2f64:
242 // This is a "native" vector type
243 return std::pair(NumElts, EltVT);
244
245 case MVT::v16f16: // <8 x f16x2>
246 case MVT::v16bf16: // <8 x bf16x2>
247 case MVT::v16i16: // <8 x i16x2>
248 case MVT::v32i8: // <8 x i8x4>
249 // This can be upsized into a "native" vector type iff the address space is
250 // global and the target supports 256-bit loads/stores.
251 if (!CanLowerTo256Bit)
252 return std::nullopt;
253 [[fallthrough]];
254 case MVT::v2i16: // <1 x i16x2>
255 case MVT::v2f16: // <1 x f16x2>
256 case MVT::v2bf16: // <1 x bf16x2>
257 case MVT::v4i8: // <1 x i8x4>
258 case MVT::v4i16: // <2 x i16x2>
259 case MVT::v4f16: // <2 x f16x2>
260 case MVT::v4bf16: // <2 x bf16x2>
261 case MVT::v8i8: // <2 x i8x4>
262 case MVT::v8f16: // <4 x f16x2>
263 case MVT::v8bf16: // <4 x bf16x2>
264 case MVT::v8i16: // <4 x i16x2>
265 case MVT::v16i8: // <4 x i8x4>
266 PackRegSize = 32;
267 break;
268
269 case MVT::v8f32: // <4 x f32x2>
270 case MVT::v8i32: // <4 x i32x2>
271 // This is a "native" vector type iff the address space is global and the
272 // target supports 256-bit loads/stores
273 if (!CanLowerTo256Bit)
274 return std::nullopt;
275 [[fallthrough]];
276 case MVT::v2f32: // <1 x f32x2>
277 case MVT::v4f32: // <2 x f32x2>
278 case MVT::v2i32: // <1 x i32x2>
279 case MVT::v4i32: // <2 x i32x2>
280 if (!STI.hasF32x2Instructions())
281 return std::pair(NumElts, EltVT);
282 PackRegSize = 64;
283 break;
284 }
285
286 // If we reach here, then we can pack 2 or more elements into a single 32-bit
287 // or 64-bit PTX register and treat the vector as a new vector containing
288 // packed elements.
289
290 // Number of elements to pack in one word.
291 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
292
293 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
294}
295
296/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
297/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
298/// the types as required by the calling convention (with special handling for
299/// i8s).
300/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
301/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
302/// LowerCall, and LowerReturn.
303static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
304 LLVMContext &Ctx, CallingConv::ID CallConv,
305 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
307 uint64_t StartingOffset = 0) {
308 SmallVector<EVT, 16> TempVTs;
309 SmallVector<uint64_t, 16> TempOffsets;
310 ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets,
311 StartingOffset);
312
313 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
314 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
315 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
316
317 // Since we actually can load/store b8, we need to ensure that we'll use
318 // the original sized type for any i8s or i8 vectors.
319 if (VT.getScalarType() == MVT::i8) {
320 if (RegisterVT == MVT::i16)
321 RegisterVT = MVT::i8;
322 else if (RegisterVT == MVT::v2i16)
323 RegisterVT = MVT::v2i8;
324 else
325 assert(RegisterVT == MVT::v4i8 &&
326 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
327 }
328
329 // TODO: This is horribly incorrect for cases where the vector elements are
330 // not a multiple of bytes (ex i1) and legal or i8. However, this problem
331 // has existed for as long as NVPTX has and no one has complained, so we'll
332 // leave it for now.
333 for (unsigned I : seq(NumRegs)) {
334 ValueVTs.push_back(RegisterVT);
335 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
336 }
337 }
338}
339
340// We return an EVT that can hold N VTs
341// If the VT is a vector, the resulting EVT is a flat vector with the same
342// element type as VT's element type.
343static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
344 if (N == 1)
345 return VT;
346
347 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
348 VT.getVectorNumElements() * N)
349 : EVT::getVectorVT(C, VT, N);
350}
351
353 const SDLoc &dl, SelectionDAG &DAG) {
354 if (V.getValueType() == VT) {
355 assert(I == 0 && "Index must be 0 for scalar value");
356 return V;
357 }
358
359 if (!VT.isVector())
360 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
361 DAG.getVectorIdxConstant(I, dl));
362
363 return DAG.getNode(
364 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
366}
367
368template <typename T>
369static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
370 SelectionDAG &DAG, T GetElement) {
371 if (N == 1)
372 return GetElement(0);
373
375 for (const unsigned I : llvm::seq(N)) {
376 SDValue Val = GetElement(I);
377 if (Val.getValueType().isVector())
378 DAG.ExtractVectorElements(Val, Values);
379 else
380 Values.push_back(Val);
381 }
382
383 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
384 Values.size());
385 return DAG.getBuildVector(VT, dl, Values);
386}
387
388/// PromoteScalarIntegerPTX
389/// Used to make sure the arguments/returns are suitable for passing
390/// and promote them to a larger size if they're not.
391///
392/// The promoted type is placed in \p PromoteVT if the function returns true.
394 if (VT.isScalarInteger()) {
395 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
396 default:
398 "Promotion is not suitable for scalars of size larger than 64-bits");
399 case 1:
400 return MVT::i1;
401 case 2:
402 case 4:
403 case 8:
404 return MVT::i8;
405 case 16:
406 return MVT::i16;
407 case 32:
408 return MVT::i32;
409 case 64:
410 return MVT::i64;
411 }
412 }
413 return VT;
414}
415
416// Check whether we can merge loads/stores of some of the pieces of a
417// flattened function parameter or return value into a single vector
418// load/store.
419//
420// The flattened parameter is represented as a list of EVTs and
421// offsets, and the whole structure is aligned to ParamAlignment. This
422// function determines whether we can load/store pieces of the
423// parameter starting at index Idx using a single vectorized op of
424// size AccessSize. If so, it returns the number of param pieces
425// covered by the vector op. Otherwise, it returns 1.
426template <typename T>
428 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
429 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
430
431 // Can't vectorize if param alignment is not sufficient.
432 if (ParamAlignment < AccessSize)
433 return 1;
434 // Can't vectorize if offset is not aligned.
435 if (Offsets[Idx] & (AccessSize - 1))
436 return 1;
437
438 EVT EltVT = ValueVTs[Idx];
439 unsigned EltSize = EltVT.getStoreSize();
440
441 // Element is too large to vectorize.
442 if (EltSize >= AccessSize)
443 return 1;
444
445 unsigned NumElts = AccessSize / EltSize;
446 // Can't vectorize if AccessSize is not a multiple of EltSize.
447 if (AccessSize != EltSize * NumElts)
448 return 1;
449
450 // We don't have enough elements to vectorize.
451 if (Idx + NumElts > ValueVTs.size())
452 return 1;
453
454 // PTX ISA can only deal with 2- and 4-element vector ops.
455 if (NumElts != 4 && NumElts != 2)
456 return 1;
457
458 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
459 // Types do not match.
460 if (ValueVTs[j] != EltVT)
461 return 1;
462
463 // Elements are not contiguous.
464 if (Offsets[j] - Offsets[j - 1] != EltSize)
465 return 1;
466 }
467 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
468 return NumElts;
469}
470
471// Computes whether and how we can vectorize the loads/stores of a
472// flattened function parameter or return value.
473//
474// The flattened parameter is represented as the list of ValueVTs and
475// Offsets, and is aligned to ParamAlignment bytes. We return a vector
476// of the same size as ValueVTs indicating how each piece should be
477// loaded/stored (i.e. as a scalar, or as part of a vector
478// load/store).
479template <typename T>
482 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
483 bool IsVAArg = false) {
484 // Set vector size to match ValueVTs and mark all elements as
485 // scalars by default.
486
487 if (IsVAArg)
488 return SmallVector<unsigned>(ValueVTs.size(), 1);
489
490 SmallVector<unsigned, 16> VectorInfo;
491
492 const auto GetNumElts = [&](unsigned I) -> unsigned {
493 for (const unsigned AccessSize : {16, 8, 4, 2}) {
494 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
495 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
496 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
497 "Unexpected vectorization size");
498 if (NumElts != 1)
499 return NumElts;
500 }
501 return 1;
502 };
503
504 // Check what we can vectorize using 128/64/32/16-bit accesses.
505 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
506 const unsigned NumElts = GetNumElts(I);
507 VectorInfo.push_back(NumElts);
508 I += NumElts;
509 }
510 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
511 ValueVTs.size());
512 return VectorInfo;
513}
514
515// NVPTXTargetLowering Constructor.
517 const NVPTXSubtarget &STI)
518 : TargetLowering(TM, STI), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
519 // Always lower memset, memcpy, and memmove intrinsics to load/store
520 // instructions, rather
521 // than generating calls to memset, memcpy or memmove.
525
528
529 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
530 // condition branches.
531 setJumpIsExpensive(true);
532
533 // Wide divides are _very_ slow. Try to reduce the width of the divide if
534 // possible.
535 addBypassSlowDiv(64, 32);
536
537 // By default, use the Source scheduling
538 if (sched4reg)
540 else
542
543 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
544 LegalizeAction NoF16Action) {
545 bool IsOpSupported = STI.allowFP16Math();
546 switch (Op) {
547 // Several FP16 instructions are available on sm_80 only.
548 case ISD::FMINNUM:
549 case ISD::FMAXNUM:
552 case ISD::FMAXIMUM:
553 case ISD::FMINIMUM:
554 case ISD::FMAXIMUMNUM:
555 case ISD::FMINIMUMNUM:
556 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
557 break;
558 case ISD::FEXP2:
559 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
560 break;
561 }
562 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
563 };
564
565 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
566 LegalizeAction NoBF16Action) {
567 bool IsOpSupported = STI.hasNativeBF16Support(Op);
569 Op, VT, IsOpSupported ? Action : NoBF16Action);
570 };
571
572 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
573 LegalizeAction NoI16x2Action) {
574 bool IsOpSupported = false;
575 // instructions are available on sm_90 only
576 switch (Op) {
577 case ISD::ADD:
578 case ISD::SMAX:
579 case ISD::SMIN:
580 case ISD::UMIN:
581 case ISD::UMAX:
582 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
583 break;
584 }
585 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
586 };
587
588 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
589 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
590 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
591 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
592 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
593 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
594 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
595 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
596 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
597 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
598 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
599 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
600
601 if (STI.hasF32x2Instructions()) {
602 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
603 addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
604 }
605
606 // Conversion to/from FP16/FP16x2 is always legal.
611
613 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
615
616 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
617 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
618
619 // Conversion to/from BFP16/BFP16x2 is always legal.
624
625 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
626 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
627 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
628 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
629
630 // Conversion to/from i16/i16x2 is always legal.
635
640
641 // No support for these operations with v2f32/v2i32
642 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
643 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
644
647 MVT::v2i32, Expand);
648
649 // Need custom lowering in case the index is dynamic.
650 if (STI.hasF32x2Instructions())
651 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
652 Custom);
653
654 // Custom conversions to/from v2i8.
656
657 // Only logical ops can be done on v4i8/v2i32 directly, others must be done
658 // elementwise.
675 {MVT::v4i8, MVT::v2i32}, Expand);
676
677 // Operations not directly supported by NVPTX.
678 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
679 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
680 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
683 }
684
685 // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
686 setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
687
688 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
689 // For others we will expand to a SHL/SRA pair.
695 setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand);
696
703
706
708 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
709 Expand);
710
711 if (STI.hasHWROT32()) {
714 Custom);
715 }
716
717 setOperationAction(ISD::BR_JT, MVT::Other, STI.hasBrx() ? Legal : Expand);
719
720 // We want to legalize constant related memmove and memcpy
721 // intrinsics.
723
724 // FP extload/truncstore is not legal in PTX. We need to expand all these.
725 for (auto FloatVTs :
727 for (MVT ValVT : FloatVTs) {
728 for (MVT MemVT : FloatVTs) {
729 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
730 setTruncStoreAction(ValVT, MemVT, Expand);
731 }
732 }
733 }
734
735 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
736 // how they'll be lowered in ISel anyway, and by doing this a little earlier
737 // we allow for more DAG combine opportunities.
738 for (auto IntVTs :
740 for (MVT ValVT : IntVTs)
741 for (MVT MemVT : IntVTs)
742 if (isTypeLegal(ValVT))
743 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
744
745 // PTX does not support load / store predicate registers
747 for (MVT VT : MVT::integer_valuetypes()) {
749 Promote);
750 setTruncStoreAction(VT, MVT::i1, Expand);
751 }
752
753 // Disable generations of extload/truncstore for v2i32/v2i16/v2i8. The generic
754 // expansion for these nodes when they are unaligned is incorrect if the
755 // type is a vector.
756 //
757 // TODO: Fix the generic expansion for these nodes found in
758 // TargetLowering::expandUnalignedLoad/Store.
760 MVT::v2i8, Expand);
762 {MVT::v2i8, MVT::v2i16}, Expand);
763 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
764 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
765 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
766
767 // Register custom handling for illegal type loads/stores. We'll try to custom
768 // lower almost all illegal types and logic in the lowering will discard cases
769 // we can't handle.
770 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::i256, MVT::f128},
771 Custom);
773 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
775 Custom);
776
777 // Custom legalization for LDU intrinsics.
778 // TODO: The logic to lower these is not very robust and we should rewrite it.
779 // Perhaps LDU should not be represented as an intrinsic at all.
782 if (IsPTXVectorType(VT))
784
788 MVT::i1, Expand);
789
790 // This is legal in NVPTX
795
796 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
798
799 // TRAP can be lowered to PTX trap
800 setOperationAction(ISD::TRAP, MVT::Other, Legal);
801 // DEBUGTRAP can be lowered to PTX brkpt
803
804 // Support varargs.
809
811 {MVT::i16, MVT::i32, MVT::i64}, Legal);
812
814 Promote);
817
818 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
819 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
820 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
821 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
822 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
823 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
824 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
825
826 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
827 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
828 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
829 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
830 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
831 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
832
833 // Other arithmetic and logic ops are unsupported.
837 {MVT::v2i16, MVT::v2i32}, Expand);
838
839 // v2i32 is not supported for any arithmetic operations
844 MVT::v2i32, Expand);
845
850 if (STI.getPTXVersion() >= 43) {
855 }
856
858 setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
861
862 // PTX does not directly support SELP of i1, so promote to i32 first
864
865 // PTX cannot multiply two i64s in a single instruction.
868
869 // We have some custom DAG combine patterns for these nodes
871 ISD::AND,
873 ISD::FADD,
880 ISD::MUL,
882 ISD::SHL,
883 ISD::SREM,
884 ISD::UREM,
888 ISD::LOAD,
893
894 // If the vector operands require register coalescing, scalarize instead
895 if (STI.hasF32x2Instructions())
897
898 // setcc for f16x2 and bf16x2 needs special handling to prevent
899 // legalizer's attempt to scalarize it due to v2i1 not being legal.
900 if (STI.allowFP16Math() || STI.hasBF16Math())
902
903 // Vector reduction operations. These may be turned into shuffle or tree
904 // reductions depending on what instructions are available for each type.
906 MVT EltVT = VT.getVectorElementType();
907 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
910 VT, Custom);
911 }
912 }
913
914 // Promote fp16 arithmetic if fp16 hardware isn't available or the
915 // user passed --nvptx-no-fp16-math. The flag is useful because,
916 // although sm_53+ GPUs have some sort of FP16 support in
917 // hardware, only sm_53 and sm_60 have full implementation. Others
918 // only have token amount of hardware and are likely to run faster
919 // by using fp32 units instead.
920 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
921 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
922 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
923 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
924 // bf16 must be promoted to f32.
925 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
926 if (getOperationAction(Op, MVT::bf16) == Promote)
927 AddPromotedToType(Op, MVT::bf16, MVT::f32);
928 setOperationAction(Op, MVT::v2f32,
929 STI.hasF32x2Instructions() ? Legal : Expand);
930 }
931
932 // On SM80, we select add/mul/sub as fma to avoid promotion to float
933 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
934 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
935 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
937 }
938 }
939 }
940
941 // f16/f16x2 neg was introduced in PTX 60, SM_53.
942 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
943 STI.getPTXVersion() >= 60 &&
944 STI.allowFP16Math();
945 for (const auto &VT : {MVT::f16, MVT::v2f16})
947 IsFP16FP16x2NegAvailable ? Legal : Expand);
948
949 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
950 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
951 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
952 // (would be) Library functions.
953
954 // These map to conversion instructions for scalar FP types.
955 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
957 setOperationAction(Op, MVT::f16, Legal);
958 setOperationAction(Op, MVT::f32, Legal);
959 setOperationAction(Op, MVT::f64, Legal);
960 setOperationAction(Op, MVT::v2f16, Expand);
961 setOperationAction(Op, MVT::v2bf16, Expand);
962 setOperationAction(Op, MVT::v2f32, Expand);
963 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
964 if (getOperationAction(Op, MVT::bf16) == Promote)
965 AddPromotedToType(Op, MVT::bf16, MVT::f32);
966 }
967
968 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
970 }
971 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
972 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
975 }
976 }
977
978 // Expand v2f32 = fp_extend
980 // Expand v2[b]f16 = fp_round v2f32
981 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
982
983 // sm_80 only has conversions between f32 and bf16. Custom lower all other
984 // bf16 conversions.
985 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
986 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
989 VT, Custom);
990 }
993 MVT::bf16, Custom);
994 }
995
1002 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
1003
1004 // 'Expand' implements FCOPYSIGN without calling an external library.
1011
1012 // These map to corresponding instructions for f32/f64. f16 must be
1013 // promoted to f32. v2f16 is expanded to f16, which is then promoted
1014 // to f32.
1015 for (const auto &Op :
1017 setOperationAction(Op, MVT::f16, Promote);
1018 setOperationAction(Op, MVT::f32, Legal);
1019 // only div/rem/sqrt are legal for f64
1020 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
1021 setOperationAction(Op, MVT::f64, Legal);
1022 }
1023 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
1024 setOperationAction(Op, MVT::bf16, Promote);
1025 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1026 }
1027 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
1028
1029 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
1030 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
1031 if (STI.getPTXVersion() >= 65) {
1032 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
1033 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
1034 } else {
1036 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
1037 }
1038 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
1039 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
1040 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
1041 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
1042
1043 for (const auto &Op :
1045 setOperationAction(Op, MVT::f32, Legal);
1046 setOperationAction(Op, MVT::f64, Legal);
1047 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1048 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1049 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1050 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1051 if (getOperationAction(Op, MVT::bf16) == Promote)
1052 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1053 setOperationAction(Op, MVT::v2f32, Expand);
1054 }
1055 bool SupportsF32MinMaxNaN =
1056 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1057 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1058 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1059 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1060 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1061 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1062 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1063 setOperationAction(Op, MVT::v2f32, Expand);
1064 }
1065
1066 // Custom lowering for inline asm with 128-bit operands
1069
1070 // FEXP2 support:
1071 // - f32
1072 // - f16/f16x2 (sm_75+, PTX 7.0+)
1073 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1074 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1076 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1077 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1078 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1079 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1080 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1081
1082 // FLOG2 supports f32 only
1083 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1084 if (UseApproxLog2F32) {
1086 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1087 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1088 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1089 Expand);
1090 }
1091
1092 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1093
1094 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1095
1096 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1097 // type, we need to custom lower it.
1099 Custom);
1100
1101 // Now deduce the information based on the above mentioned
1102 // actions
1103 computeRegisterProperties(STI.getRegisterInfo());
1104
1105 // PTX support for 16-bit CAS is emulated. Only use 32+
1106 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1107 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1109
1110 // Custom lowering for tcgen05.ld vector operands
1112 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1113 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::v2f32,
1114 MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32,
1115 MVT::v64f32, MVT::v128f32},
1116 Custom);
1117
1118 // Custom lowering for tcgen05.st vector operands
1120 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1121 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other},
1122 Custom);
1123
1124 // Enable custom lowering for the following:
1125 // * MVT::i128 - clusterlaunchcontrol
1126 // * MVT::i32 - prmt
1127 // * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics
1128 // * MVT::Other - internal.addrspace.wrap
1130 {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
1131
1132 // Custom lowering for bswap
1133 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},
1134 Custom);
1135}
1136
// NOTE(review): the signature and the final fallback return of this function
// are not visible in this listing (the embedded line numbers jump 1136->1139
// and 1141->1143), so only the visible condition is described here.
// Non-scalable vectors of i1 whose element count is not 1 are legalized by
// splitting (TypeSplitVector).
1139 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1140 VT.getScalarType() == MVT::i1)
1141 return TypeSplitVector;
1143}
1144
// Emits an approximate square root or reciprocal square root of `Operand`
// using the NVVM approx intrinsics selected below.
// NOTE(review): the first signature line and the guard that precedes the first
// `return SDValue()` are missing from this listing (embedded line numbers 1145
// and 1149-1150), so the exact bail-out condition is not shown here.
//  - ExtraSteps: ReciprocalEstimate::Unspecified is normalized to 0.
//  - Reciprocal: when true an rsqrt is requested instead of a sqrt.
// An empty SDValue is returned when no estimate is produced.
1146 int Enabled, int &ExtraSteps,
1147 bool &UseOneConst,
1148 bool Reciprocal) const {
1151 return SDValue();
1152
1153 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1154 ExtraSteps = 0;
1155
1156 SDLoc DL(Operand);
1157 EVT VT = Operand.getValueType();
1158 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1159
// Wraps Operand in an INTRINSIC_WO_CHAIN node for the given intrinsic ID.
1160 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1161 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1162 DAG.getConstant(IID, DL, MVT::i32), Operand);
1163 };
1164
1165 // The sqrt and rsqrt refinement processes assume we always start out with an
1166 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1167 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1168 // any refinement, we must return a regular sqrt.
1169 if (Reciprocal || ExtraSteps > 0) {
1170 if (VT == MVT::f32)
1171 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1172 : Intrinsic::nvvm_rsqrt_approx_f);
1173 else if (VT == MVT::f64)
1174 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1175 else
// Only f32 and f64 have an rsqrt estimate here.
1176 return SDValue();
1177 } else {
1178 if (VT == MVT::f32)
1179 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1180 : Intrinsic::nvvm_sqrt_approx_f);
1181 else {
// NOTE(review): this branch is taken for every non-f32 type; presumably only
// f64 reaches it - confirm against callers.
1182 // There's no sqrt.approx.f64 instruction, so we emit
1183 // reciprocal(rsqrt(x)). This is faster than
1184 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1185 // x * rsqrt(x).)
1186 return DAG.getNode(
1188 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1189 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1190 }
1191 }
1192}
1193
// Builds the PTX `.callprototype` declaration text for one call site and
// returns it as a string of the form
//   prototype_<UniqueCallSite> : .callprototype (<ret>) _ (<params>)[ .noreturn];
// NOTE(review): the first signature line and the declaration of the `Outs`
// parameter are not visible in this listing (embedded line numbers 1194 and
// 1196 are skipped).
//  - RetTy: return type; void yields an empty "()" return list.
//  - Args / Outs: IR-level arguments and their lowered output pieces. Outs is
//    consumed in order, grouped by OrigArgIndex.
//  - FirstVAArg: when set, only that many leading arguments are listed
//    individually and a trailing unsized `.b8 _[]` array param is appended.
//  - CB: call site, used for alignment queries and the noreturn check.
1195 const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
1197 std::optional<unsigned> FirstVAArg, const CallBase &CB,
1198 unsigned UniqueCallSite) const {
1199 auto PtrVT = getPointerTy(DL);
1200
1201 std::string Prototype;
1202 raw_string_ostream O(Prototype);
1203 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1204
// Return-value part of the prototype.
1205 if (RetTy->isVoidTy()) {
1206 O << "()";
1207 } else {
1208 O << "(";
1209 if (shouldPassAsArray(RetTy)) {
1210 const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
1211 O << ".param .align " << RetAlign.value() << " .b8 _["
1212 << DL.getTypeAllocSize(RetTy) << "]";
1213 } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
1214 unsigned size = 0;
1215 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1216 size = ITy->getBitWidth();
1217 } else {
1218 assert(RetTy->isFloatingPointTy() &&
1219 "Floating point type expected here");
1220 size = RetTy->getPrimitiveSizeInBits();
1221 }
1222 // PTX ABI requires all scalar return values to be at least 32
1223 // bits in size. fp16 normally uses .b16 as its storage type in
1224 // PTX, so its size must be adjusted here, too.
// NOTE(review): the statement that performs this adjustment (embedded line
// 1225) is missing from this listing.
1226
1227 O << ".param .b" << size << " _";
1228 } else if (isa<PointerType>(RetTy)) {
1229 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1230 } else {
1231 llvm_unreachable("Unknown return type");
1232 }
1233 O << ") ";
1234 }
1235 O << "_ (";
1236
// Parameter list: one entry per fixed argument, separated by ", ".
1237 bool first = true;
1238
1239 const unsigned NumArgs = FirstVAArg.value_or(Args.size());
1240 auto AllOuts = ArrayRef(Outs);
1241 for (const unsigned I : llvm::seq(NumArgs)) {
// Peel off the leading run of Outs entries that belong to argument I.
1242 const auto ArgOuts =
1243 AllOuts.take_while([I](auto O) { return O.OrigArgIndex == I; });
1244 AllOuts = AllOuts.drop_front(ArgOuts.size());
1245
1246 Type *Ty = Args[I].Ty;
1247 if (!first) {
1248 O << ", ";
1249 }
1250 first = false;
1251
1252 if (ArgOuts[0].Flags.isByVal()) {
1253 // Indirect calls need strict ABI alignment so we disable optimizations by
1254 // not providing a function to optimize.
1255 Type *ETy = Args[I].IndirectType;
1256 Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1257 Align ParamByValAlign =
1258 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
1259
1260 O << ".param .align " << ParamByValAlign.value() << " .b8 _["
1261 << ArgOuts[0].Flags.getByValSize() << "]";
1262 } else {
1263 if (shouldPassAsArray(Ty)) {
1264 Align ParamAlign =
1265 getArgumentAlignment(&CB, Ty, I + AttributeList::FirstArgIndex, DL);
1266 O << ".param .align " << ParamAlign.value() << " .b8 _["
1267 << DL.getTypeAllocSize(Ty) << "]";
1268 continue;
1269 }
1270 // i8 types in IR will be i16 types in SDAG
1271 assert((getValueType(DL, Ty) == ArgOuts[0].VT ||
1272 (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) &&
1273 "type mismatch between callee prototype and arguments");
1274 // scalar type
1275 unsigned sz = 0;
1276 if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
1277 sz = promoteScalarArgumentSize(ITy->getBitWidth());
1278 } else if (isa<PointerType>(Ty)) {
1279 sz = PtrVT.getSizeInBits();
1280 } else {
1281 sz = Ty->getPrimitiveSizeInBits();
1282 }
1283 O << ".param .b" << sz << " _";
1284 }
1285 }
1286
// Variadic tail: a single unsized byte array, preceded by "," only when at
// least one fixed parameter was printed.
1287 if (FirstVAArg)
1288 O << (first ? "" : ",") << " .param .align "
1289 << STI.getMaxRequiredAlignment() << " .b8 _[]";
1290 O << ")";
1291 if (shouldEmitPTXNoReturn(&CB, *nvTM))
1292 O << " .noreturn";
1293 O << ";";
1294
1295 return Prototype;
1296}
1297
// Returns the alignment for slot Idx of function F: the value reported by
// getAlign(*F, Idx) when present, otherwise the result of
// getFunctionParamOptimizedAlign(F, Ty, DL). The fallback is passed as an
// argument to value_or(), so it is computed in either case.
// F is dereferenced unconditionally.
// NOTE(review): the first signature line is not visible in this listing.
1299 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
1300 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
1301}
1302
1303Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1304 unsigned Idx,
1305 const DataLayout &DL) const {
1306 if (!CB) {
1307 // CallSite is zero, fallback to ABI type alignment
1308 return DL.getABITypeAlign(Ty);
1309 }
1310
1311 const Function *DirectCallee = CB->getCalledFunction();
1312
1313 if (!DirectCallee) {
1314 // We don't have a direct function symbol, but that may be because of
1315 // constant cast instructions in the call.
1316
1317 // With bitcast'd call targets, the instruction will be the call
1318 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1319 // Check if we have call alignment metadata
1320 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1321 return StackAlign.value();
1322 }
1323 DirectCallee = getMaybeBitcastedCallee(CB);
1324 }
1325
1326 // Check for function alignment information if we found that the
1327 // ultimate target is a Function
1328 if (DirectCallee)
1329 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1330
1331 // Call is indirect, fall back to the ABI type alignment
1332 return DL.getABITypeAlign(Ty);
1333}
1334
// Returns true when Func's global is a Function whose FunctionType differs
// from the FunctionType of call site CB. Returns false when Func is null or
// its global is not a Function.
// NOTE(review): the first signature line is not visible in this listing. CB is
// dereferenced without a null check once Func resolves to a Function -
// presumably callers guarantee a valid CB in that case; confirm.
1336 const GlobalAddressSDNode *Func) {
1337 if (!Func)
1338 return false;
1339 if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1340 return CB->getFunctionType() != CalleeFunc->getFunctionType();
1341 return false;
1342}
1343
// Tries to refine the address space of Ptr and returns a matching
// MachinePointerInfo; a default-constructed MachinePointerInfo is returned
// when nothing could be refined.
// NOTE(review): the first signature line and two lines of the FrameIndex
// branch (embedded lines 1344, 1350, 1352) are not visible in this listing.
// Ptr is reassigned below, so presumably it is taken by reference - confirm.
1345 const DataLayout &DL,
1346 const TargetLowering &TL) {
// Frame indices: cast the pointer using the local address space pointer type.
1347 if (Ptr->getOpcode() == ISD::FrameIndex) {
1348 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1349 Ptr = DAG.getAddrSpaceCast(SDLoc(), Ty, Ptr, ADDRESS_SPACE_GENERIC,
1351
1353 }
1354
1355 // Peel off an addrspacecast to generic and load directly from the specific
1356 // address space.
1357 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1358 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1359 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1360 Ptr = ASC->getOperand(0);
1361 return MachinePointerInfo(ASC->getSrcAddressSpace());
1362 }
1363 }
1364
1365 return MachinePointerInfo();
1366}
1367
// Picks the integer extension opcode matching the argument flags:
// SIGN_EXTEND if Flags.isSExt(), ZERO_EXTEND if Flags.isZExt(), otherwise
// ANY_EXTEND. The sign-extension check takes precedence.
// NOTE(review): the signature line is not visible in this listing.
1369 if (Flags.isSExt())
1370 return ISD::SIGN_EXTEND;
1371 if (Flags.isZExt())
1372 return ISD::ZERO_EXTEND;
1373 return ISD::ANY_EXTEND;
1374}
1375
// Converts V to ExpectedVT when their bit widths differ: widening uses the
// extension opcode chosen by getExtOpcode(Flags), narrowing uses
// ISD::TRUNCATE, and a value of equal width is returned unchanged. A type
// mismatch is only permitted when both types are integers (asserted).
// NOTE(review): the first signature line is not visible in this listing.
1377 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1378 SDLoc dl) {
1379 const EVT ActualVT = V.getValueType();
1380 assert((ActualVT == ExpectedVT ||
1381 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1382 "Non-integer argument type size mismatch");
1383 if (ExpectedVT.bitsGT(ActualVT))
1384 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1385 if (ExpectedVT.bitsLT(ActualVT))
1386 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1387
1388 return V;
1389}
1390
// Lowers an outgoing call described by CLI into NVPTX DAG nodes, in this
// order: CALLSEQ_START, one param declaration plus stores per argument, the
// return-value declaration, an optional CallPrototype (indirect calls only),
// the NVPTXISD::CALL node, loads of the results from "retval0", and
// CALLSEQ_END. Each result is wrapped in a ProxyReg node, converted to the
// type expected by Ins, and appended to InVals. The CALLSEQ_END value is
// returned and CLI.IsTailCall is cleared.
// NOTE(review): the first signature line and several statements are not
// visible in this listing (the embedded line numbers skip, e.g. 1391, 1395,
// 1510, 1544, 1562, 1570-1571, 1586, 1634, 1721-1722, 1751).
1392 SmallVectorImpl<SDValue> &InVals) const {
1393
1394 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1396 "Support for variadic functions (unsized array parameter) introduced "
1397 "in PTX ISA version 6.0 and requires target sm_30.");
1398
1399 SelectionDAG &DAG = CLI.DAG;
1400 SDLoc dl = CLI.DL;
1401 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1402 SDValue Callee = CLI.Callee;
1403 ArgListTy &Args = CLI.getArgs();
1404 Type *RetTy = CLI.RetTy;
1405 const CallBase *CB = CLI.CB;
1406 const DataLayout &DL = DAG.getDataLayout();
1407 LLVMContext &Ctx = *DAG.getContext();
1408
1409 const auto GetI32 = [&](const unsigned I) {
1410 return DAG.getConstant(I, dl, MVT::i32);
1411 };
1412
1413 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1414 const SDValue CallChain = CLI.Chain;
1415 const SDValue StartChain =
1416 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1417 SDValue DeclareGlue = StartChain.getValue(1);
1418
// Everything that must precede the call node; joined by a token factor below.
1419 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1420
// Both declare helpers thread DeclareGlue from one declaration to the next.
1421 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1422 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1423 // loaded/stored using i16, so it's handled here as well.
1424 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1425 SDValue Declare =
1426 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1427 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1428 CallPrereqs.push_back(Declare);
1429 DeclareGlue = Declare.getValue(1);
1430 return Declare;
1431 };
1432
1433 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1434 unsigned Size) {
1435 SDValue Declare = DAG.getNode(
1436 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1437 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1438 CallPrereqs.push_back(Declare);
1439 DeclareGlue = Declare.getValue(1);
1440 return Declare;
1441 };
1442
1443 // Variadic arguments.
1444 //
1445 // Normally, for each argument, we declare a param scalar or a param
1446 // byte array in the .param space, and store the argument value to that
1447 // param scalar or array starting at offset 0.
1448 //
1449 // In the case of the first variadic argument, we declare a vararg byte array
1450 // with size 0. The exact size of this array isn't known at this point, so
1451 // it'll be patched later. All the variadic arguments will be stored to this
1452 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1453 // initially set to 0, so it can be used for non-variadic arguments (which use
1454 // 0 offset) to simplify the code.
1455 //
1456 // After all varargs are processed, 'VAOffset' holds the size of the
1457 // vararg byte array.
1458 assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) &&
1459 "Non-VarArg function with extra arguments");
1460
1461 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1462 unsigned VAOffset = 0; // current offset in the param array
1463
1464 const SDValue VADeclareParam =
1465 CLI.Args.size() > FirstVAArg
1466 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1467 Align(STI.getMaxRequiredAlignment()), 0)
1468 : SDValue();
1469
1470 // Args.size() and Outs.size() need not match.
1471 // Outs.size() will be larger
1472 // * if there is an aggregate argument with multiple fields (each field
1473 // showing up separately in Outs)
1474 // * if there is a vector argument with more than typical vector-length
1475 // elements (generally if more than 4) where each vector element is
1476 // individually present in Outs.
1477 // So a different index should be used for indexing into Outs/OutVals.
1478 // See similar issue in LowerFormalArguments.
1479 auto AllOuts = ArrayRef(CLI.Outs);
1480 auto AllOutVals = ArrayRef(CLI.OutVals);
1481 assert(AllOuts.size() == AllOutVals.size() &&
1482 "Outs and OutVals must be the same size");
1483 // Declare the .params or .regs needed to pass values
1484 // to the function
1485 for (const auto E : llvm::enumerate(Args)) {
1486 const auto ArgI = E.index();
1487 const auto Arg = E.value();
// Peel off the Outs/OutVals entries that belong to this argument.
1488 const auto ArgOuts =
1489 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1490 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1491 AllOuts = AllOuts.drop_front(ArgOuts.size());
1492 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1493
1494 const bool IsVAArg = (ArgI >= FirstVAArg);
1495 const bool IsByVal = Arg.IsByVal;
1496
// All variadic arguments share the param symbol of the first variadic one.
1497 const SDValue ParamSymbol =
1498 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1499
1500 assert((!IsByVal || Arg.IndirectType) &&
1501 "byval arg must have indirect type");
1502 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1503
1504 const Align ArgAlign = [&]() {
1505 if (IsByVal) {
1506 // The ByValAlign in ArgOuts[0].Flags is always set at this point,
1507 // so we don't need to worry whether it's naturally aligned or not.
1508 // See TargetLowering::LowerCallTo().
1509 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1511 InitialAlign, DL);
1512 }
1513 return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
1514 }();
1515
1516 const unsigned TySize = DL.getTypeAllocSize(ETy);
1517 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1518 "type size mismatch");
1519
// Declaration node that the stores for this argument are chained to.
1520 const SDValue ArgDeclare = [&]() {
1521 if (IsVAArg)
1522 return VADeclareParam;
1523
1524 if (IsByVal || shouldPassAsArray(Arg.Ty))
1525 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1526
1527 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1528 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1529 "Only int and float types are supported as non-array arguments");
1530
1531 return MakeDeclareScalarParam(ParamSymbol, TySize);
1532 }();
1533
1534 if (IsByVal) {
// byval: copy the pointee piecewise from the source pointer into the param.
1535 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1536 SDValue SrcPtr = ArgOutVals[0];
1537 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1538 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1539
1540 if (IsVAArg)
1541 VAOffset = alignTo(VAOffset, ArgAlign);
1542
1543 SmallVector<EVT, 4> ValueVTs, MemVTs;
1545 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1546
1547 unsigned J = 0;
1548 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1549 for (const unsigned NumElts : VI) {
1550 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1551 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1552 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1553 SDValue SrcLoad =
1554 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1555
1556 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1557 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1558 SDValue ParamAddr =
1559 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1560 SDValue StoreParam =
1561 DAG.getStore(ArgDeclare, dl, SrcLoad, ParamAddr,
1563 CallPrereqs.push_back(StoreParam);
1564
1565 J += NumElts;
1566 }
1567 if (IsVAArg)
1568 VAOffset += TySize;
1569 } else {
1572 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1573 VAOffset);
1574 assert(VTs.size() == Offsets.size() && "Size mismatch");
1575 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1576
1577 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1578 // than 32-bits are sign extended or zero extended, depending on
1579 // whether they are signed or unsigned types. This case applies
1580 // only to scalar parameters and not to aggregate values.
1581 const bool ExtendIntegerParam =
1582 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1583
// Returns output value I converted to the type that is actually stored.
1584 const auto GetStoredValue = [&](const unsigned I) {
1585 SDValue StVal = ArgOutVals[I];
1587 StVal.getValueType() &&
1588 "OutVal type should always be legal");
1589
1590 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1591 const EVT StoreVT =
1592 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1593
1594 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1595 };
1596
1597 unsigned J = 0;
1598 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1599 for (const unsigned NumElts : VI) {
1600 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1601
1602 unsigned Offset;
1603 if (IsVAArg) {
1604 // TODO: We may need to support vector types that can be passed
1605 // as scalars in variadic arguments.
1606 assert(NumElts == 1 &&
1607 "Vectorization should be disabled for vaargs.");
1608
1609 // Align each part of the variadic argument to their type.
1610 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1611 Offset = VAOffset;
1612
1613 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1614 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1615 } else {
1616 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1617 Offset = Offsets[J];
1618 }
1619
1620 SDValue Ptr =
1621 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1622
1623 const MaybeAlign CurrentAlign = ExtendIntegerParam
1624 ? MaybeAlign(std::nullopt)
1625 : commonAlignment(ArgAlign, Offset);
1626
1627 SDValue Val =
1628 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1629 return GetStoredValue(J + K);
1630 });
1631
1632 SDValue StoreParam =
1633 DAG.getStore(ArgDeclare, dl, Val, Ptr,
1635 CallPrereqs.push_back(StoreParam);
1636
1637 J += NumElts;
1638 }
1639 }
1640 }
1641
1642 // Handle Result
1643 if (!Ins.empty()) {
1644 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1645 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1646 if (shouldPassAsArray(RetTy)) {
1647 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1648 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1649 } else {
1650 MakeDeclareScalarParam(RetSymbol, ResultSize);
1651 }
1652 }
1653
1654 // Set the size of the vararg param byte array if the callee is a variadic
1655 // function and the variadic part is not empty.
1656 if (VADeclareParam) {
// Operand 3 is the size passed to MakeDeclareArrayParam; replace it with the
// final VAOffset and keep the other operands.
1657 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1658 VADeclareParam.getOperand(1),
1659 VADeclareParam.getOperand(2), GetI32(VAOffset),
1660 VADeclareParam.getOperand(4)};
1661 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1662 VADeclareParam->getVTList(), DeclareParamOps);
1663 }
1664
1665 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1666 // If the type of the callsite does not match that of the function, convert
1667 // the callsite to an indirect call.
1668 const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func);
1669
1670 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1671 // between them we must rely on the call site value which is valid for
1672 // indirect calls but is always null for libcalls.
1673 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1674
1675 if (isa<ExternalSymbolSDNode>(Callee)) {
1676 Function* CalleeFunc = nullptr;
1677
1678 // Try to find the callee in the current module.
1679 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1680 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1681
1682 // Set the "libcall callee" attribute to indicate that the function
1683 // must always have a declaration.
1684 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1685 }
1686
1687 if (IsIndirectCall) {
1688 // This is indirect function call case : PTX requires a prototype of the
1689 // form
1690 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1691 // to be emitted, and the label has to be used as the last arg of call
1692 // instruction.
1693 // The prototype is embedded in a string and put as the operand for a
1694 // CallPrototype SDNode which will print out to the value of the string.
1695 const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1696 std::string Proto =
1697 getPrototype(DL, RetTy, Args, CLI.Outs,
1698 HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
1699 UniqueCallSite);
1700 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1701 const SDValue PrototypeDeclare = DAG.getNode(
1702 NVPTXISD::CallPrototype, dl, MVT::Other,
1703 {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)});
1704 CallPrereqs.push_back(PrototypeDeclare);
1705 }
1706
1707 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
// All variadic arguments count as one extra parameter (the vararg array).
1708 const unsigned NumArgs =
1709 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1710 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1711 /// NumParams, Callee, Proto)
1712 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1713 const SDValue Call = DAG.getNode(
1714 NVPTXISD::CALL, dl, MVT::Other,
1715 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1716 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1717
1718 SmallVector<SDValue, 16> LoadChains{Call};
1719 SmallVector<SDValue, 16> ProxyRegOps;
1720 if (!Ins.empty()) {
1723 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1724 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1725
1726 const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
1727 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1728
1729 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1730 // 32-bits are sign extended or zero extended, depending on whether
1731 // they are signed or unsigned types.
1732 const bool ExtendIntegerRetVal =
1733 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1734
1735 unsigned I = 0;
1736 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1737 for (const unsigned NumElts : VI) {
1738 const MaybeAlign CurrentAlign =
1739 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1740 : commonAlignment(RetAlign, Offsets[I]);
1741
1742 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1743 const EVT LoadVT =
1744 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1745 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1746 SDValue Ptr =
1747 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1748
1749 SDValue R =
1750 DAG.getLoad(VecVT, dl, Call, Ptr,
1752
1753 LoadChains.push_back(R.getValue(1));
1754 for (const unsigned J : llvm::seq(NumElts))
1755 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1756 I += NumElts;
1757 }
1758 }
1759
1760 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1761 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1762 UniqueCallSite + 1, SDValue(), dl);
1763
1764 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1765 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1766 // dangling.
1767 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1768 SDValue Proxy =
1769 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1770 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1771 InVals.push_back(Ret);
1772 }
1773
1774 // Set IsTailCall to false for now, until we figure out how to express
1775 // tail call optimization in PTX
1776 CLI.IsTailCall = false;
1777 return CallEnd;
1778}
1779
// Custom lowering of a dynamic stack allocation. When the subtarget is below
// PTX ISA 7.3 or sm_52, a diagnostic is reported and a zero constant merged
// with the incoming chain is returned. Otherwise an
// NVPTXISD::DYNAMIC_STACKALLOC node is built with the size zero-extended or
// truncated to the local-address-space pointer type, and the merged
// {address-space-cast value, chain} pair is returned.
// NOTE(review): the first signature line, the start of the diagnostic call,
// the statement under `if (Align == 0)` and the operands of the address-space
// cast are not visible in this listing (embedded lines 1780, 1786, 1804,
// 1815).
1781 SelectionDAG &DAG) const {
1782
1783 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1784 const Function &Fn = DAG.getMachineFunction().getFunction();
1785
1787 Fn,
1788 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1789 "requires target sm_52.",
1790 SDLoc(Op).getDebugLoc()));
1791 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1792 Op.getOperand(0)};
1793 return DAG.getMergeValues(Ops, SDLoc());
1794 }
1795
// Operands read here: 0 = chain, 1 = size, 2 = constant alignment.
1796 SDLoc DL(Op.getNode());
1797 SDValue Chain = Op.getOperand(0);
1798 SDValue Size = Op.getOperand(1);
1799 uint64_t Align = Op.getConstantOperandVal(2);
1800
1801 // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1802 // the default stack alignment should be used.
1803 if (Align == 0)
1805
1806 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
1807 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1808
1809 SDValue Alloc =
1810 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1811 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1812 DAG.getTargetConstant(Align, DL, MVT::i32)});
1813
1814 SDValue ASC = DAG.getAddrSpaceCast(
1816
// Result 1 of the alloc node is its output chain.
1817 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1818}
1819
// Custom lowering of a stack restore. When the subtarget is below PTX ISA 7.3
// or sm_52, a diagnostic is reported and the incoming chain (operand 0) is
// returned unchanged. Otherwise the pointer operand (operand 1) is
// address-space cast using the local pointer type and passed to an
// NVPTXISD::STACKRESTORE node, which is returned.
// NOTE(review): the first signature line, the start of the diagnostic call and
// the last operand line of the address-space cast are not visible in this
// listing (embedded lines 1820, 1826, 1838).
1821 SelectionDAG &DAG) const {
1822 SDLoc DL(Op.getNode());
1823 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1824 const Function &Fn = DAG.getMachineFunction().getFunction();
1825
1827 Fn,
1828 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1829 ">= sm_52.",
1830 DL.getDebugLoc()));
1831 return Op.getOperand(0);
1832 }
1833
1834 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1835 SDValue Chain = Op.getOperand(0);
1836 SDValue Ptr = Op.getOperand(1);
1837 SDValue ASC = DAG.getAddrSpaceCast(DL, LocalVT, Ptr, ADDRESS_SPACE_GENERIC,
1839 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1840}
1841
// Custom lowering of a stack save. When the subtarget is below PTX ISA 7.3 or
// sm_52, a diagnostic is reported and a zero constant merged with the incoming
// chain is returned. Otherwise an NVPTXISD::STACKSAVE node producing a
// local-address-space pointer is built, cast from the local to the generic
// address space in the node's result type, and returned merged with the
// STACKSAVE output chain.
// NOTE(review): the first signature line and the start of the diagnostic call
// are not visible in this listing (embedded lines 1842, 1848).
1843 SelectionDAG &DAG) const {
1844 SDLoc DL(Op.getNode());
1845 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1846 const Function &Fn = DAG.getMachineFunction().getFunction();
1847
1849 Fn,
1850 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1851 "sm_52.",
1852 DL.getDebugLoc()));
1853 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1854 return DAG.getMergeValues(Ops, DL);
1855 }
1856
1857 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1858 SDValue Chain = Op.getOperand(0);
1859 SDValue SS =
1860 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1861 SDValue ASC = DAG.getAddrSpaceCast(
1862 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
// Result 1 of the STACKSAVE node is its output chain.
1863 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1864}
1865
1866// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1867// (see LegalizeDAG.cpp). This is slow and uses local memory.
1868// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
1869SDValue
1870NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1871 SDNode *Node = Op.getNode();
1872 SDLoc dl(Node);
1874 unsigned NumOperands = Node->getNumOperands();
1875 for (unsigned i = 0; i < NumOperands; ++i) {
1876 SDValue SubOp = Node->getOperand(i);
1877 EVT VVT = SubOp.getNode()->getValueType(0);
1878 EVT EltVT = VVT.getVectorElementType();
1879 unsigned NumSubElem = VVT.getVectorNumElements();
1880 for (unsigned j = 0; j < NumSubElem; ++j) {
1881 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1882 DAG.getIntPtrConstant(j, dl)));
1883 }
1884 }
1885 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1886}
1887
// Builds an NVPTXISD::PRMT node of type i32 from operands A and B, an SDValue
// selector and the permute mode (default NVPTX::PTXPrmtMode::NONE, passed as
// an i32 constant). A, B and Selector must all be i32 (asserted).
// NOTE(review): the first signature line is not visible in this listing.
1889 SelectionDAG &DAG,
1890 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1891 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1892 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1893 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1894 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1895}
1896
// Convenience overload of getPRMT: wraps the selector in an i32 constant and
// forwards to the overload that takes the selector as an SDValue.
// NOTE(review): the first signature line is not visible in this listing; the
// selector is presumably an integer immediate - confirm against the header.
1898 SelectionDAG &DAG,
1899 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1900 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1901}
1902
1903/// Reduces the elements using the scalar operations provided. The operations
1904/// are sorted descending in number of inputs they take. The flags on the
1905/// original reduction operation will be propagated to each scalar operation.
1906/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
1907/// used in ExpandReductions and SelectionDAG.
///
/// Each entry of \p Ops is an (opcode, number of inputs) pair. Returns the
/// single value left once every level has been reduced.
/// NOTE(review): the first signature line is not visible in this listing.
/// Elements is presumably non-empty (the result dereferences Level.begin())
/// and every NumInputs presumably at least 2 - confirm against callers.
1909 const SmallVector<SDValue> &Elements, EVT EltTy,
1910 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1911 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1912 // Build the reduction tree at each level, starting with all the elements.
1913 SmallVector<SDValue> Level = Elements;
1914
// Index of the operator currently in use; only ever advances.
1915 unsigned OpIdx = 0;
1916 while (Level.size() > 1) {
1917 // Try to reduce this level using the current operator.
1918 const auto [Op, NumInputs] = Ops[OpIdx];
1919
1920 // Build the next level by partially reducing all elements.
1921 SmallVector<SDValue> ReducedLevel;
1922 unsigned I = 0, E = Level.size();
1923 for (; I + NumInputs <= E; I += NumInputs) {
1924 // Reduce elements in groups of [NumInputs], as much as possible.
1925 ReducedLevel.push_back(DAG.getNode(
1926 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1927 }
1928
1929 if (I < E) {
1930 // Handle leftover elements.
1931
1932 if (ReducedLevel.empty()) {
1933 // We didn't reduce anything at this level. We need to pick a smaller
1934 // operator.
1935 ++OpIdx;
1936 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1937 continue;
1938 }
1939
1940 // We reduced some things but there's still more left, meaning the
1941 // operator's number of inputs doesn't evenly divide this level size. Move
1942 // these elements to the next level.
1943 for (; I < E; ++I)
1944 ReducedLevel.push_back(Level[I]);
1945 }
1946
1947 // Process the next level.
1948 Level = ReducedLevel;
1949 }
1950
1951 return *Level.begin();
1952}
1953
1954// Get scalar reduction opcode
1955static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
1956 switch (ReductionOpcode) {
1958 return ISD::FMAXNUM;
1960 return ISD::FMINNUM;
1962 return ISD::FMAXIMUM;
1964 return ISD::FMINIMUM;
1965 default:
1966 llvm_unreachable("unhandled reduction opcode");
1967 }
1968}
1969
1970/// Get 3-input scalar reduction opcode
1971static std::optional<unsigned>
1972getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
1973 switch (ReductionOpcode) {
1975 return NVPTXISD::FMAXNUM3;
1977 return NVPTXISD::FMINNUM3;
1979 return NVPTXISD::FMAXIMUM3;
1981 return NVPTXISD::FMINIMUM3;
1982 default:
1983 return std::nullopt;
1984 }
1985}
1986
1987/// Lower reductions to either a sequence of operations or a tree if
1988/// reassociations are allowed. This method will use larger operations like
1989/// max3/min3 when the target supports them.
1990SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
1991 SelectionDAG &DAG) const {
1992 SDLoc DL(Op);
1993 const SDNodeFlags Flags = Op->getFlags();
1994 SDValue Vector = Op.getOperand(0);
1995
1996 const unsigned Opcode = Op->getOpcode();
1997 const EVT EltTy = Vector.getValueType().getVectorElementType();
1998
1999 // Whether we can use 3-input min/max when expanding the reduction.
2000 const bool CanUseMinMax3 =
2001 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
2002 STI.getPTXVersion() >= 88 &&
2003 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
2004 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
2005
2006 // A list of SDNode opcodes with equivalent semantics, sorted descending by
2007 // number of inputs they take.
2008 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
2009
2010 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
2011 CanUseMinMax3 && Opcode3Elem)
2012 ScalarOps.push_back({*Opcode3Elem, 3});
2013 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
2014
2016 DAG.ExtractVectorElements(Vector, Elements);
2017
2018 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
2019}
2020
2021SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
2022 // Handle bitcasting from v2i8 without hitting the default promotion
2023 // strategy which goes through stack memory.
2024 EVT FromVT = Op->getOperand(0)->getValueType(0);
2025 if (FromVT != MVT::v2i8) {
2026 return Op;
2027 }
2028
2029 // Pack vector elements into i16 and bitcast to final type
2030 SDLoc DL(Op);
2031 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2032 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
2033 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
2034 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
2035 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
2036 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
2037 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
2038 SDValue AsInt = DAG.getNode(
2039 ISD::OR, DL, MVT::i16,
2040 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
2041 EVT ToVT = Op->getValueType(0);
2042 return DAG.getBitcast(ToVT, AsInt);
2043}
2044
2045// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2046// would get lowered as two constant loads and vector-packing move.
2047// Instead we want just a constant move:
2048// mov.b32 %r2, 0x40003C00
2049SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2050 SelectionDAG &DAG) const {
2051 EVT VT = Op->getValueType(0);
2052 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
2053 return Op;
2054 SDLoc DL(Op);
2055
2056 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2057 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2058 isa<ConstantFPSDNode>(Operand);
2059 })) {
2060 if (VT != MVT::v4i8)
2061 return Op;
2062 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
2063 // to optimize calculation of constant parts.
2064 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
2065 uint64_t SelectionValue) -> SDValue {
2066 SDValue L = Left;
2067 SDValue R = Right;
2068 if (Cast) {
2069 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
2070 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
2071 }
2072 return getPRMT(L, R, SelectionValue, DL, DAG);
2073 };
2074 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
2075 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
2076 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
2077 return DAG.getBitcast(VT, PRMT3210);
2078 }
2079
2080 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
2081 auto GetOperand = [](SDValue Op, int N) -> APInt {
2082 const SDValue &Operand = Op->getOperand(N);
2083 EVT VT = Op->getValueType(0);
2084 if (Operand->isUndef())
2085 return APInt(32, 0);
2086 APInt Value;
2087 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2088 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2089 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2090 Value = Operand->getAsAPIntVal();
2091 else
2092 llvm_unreachable("Unsupported type");
2093 // i8 values are carried around as i16, so we need to zero out upper bits,
2094 // so they do not get in the way of combining individual byte values
2095 if (VT == MVT::v4i8)
2096 Value = Value.trunc(8);
2097 return Value.zext(32);
2098 };
2099
2100 // Construct a 32-bit constant by shifting into place smaller values
2101 // (elements of the vector type VT).
2102 // For example, if VT has 2 elements, then N == 2:
2103 // ShiftAmount = 32 / N = 16
2104 // Value |= Op0 (b16) << 0
2105 // Value |= Op1 (b16) << 16
2106 // If N == 4:
2107 // ShiftAmount = 32 / N = 8
2108 // Value |= Op0 (b8) << 0
2109 // Value |= Op1 (b8) << 8
2110 // Value |= Op2 (b8) << 16
2111 // Value |= Op3 (b8) << 24
2112 // ...etc
2113 APInt Value(32, 0);
2114 const unsigned NumElements = VT.getVectorNumElements();
2115 assert(32 % NumElements == 0 && "must evenly divide bit length");
2116 const unsigned ShiftAmount = 32 / NumElements;
2117 for (unsigned ElementNo : seq(NumElements))
2118 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
2119 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
2120 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
2121}
2122
2123SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2124 SelectionDAG &DAG) const {
2125 SDValue Index = Op->getOperand(1);
2126 SDValue Vector = Op->getOperand(0);
2127 SDLoc DL(Op);
2128 EVT VectorVT = Vector.getValueType();
2129
2130 if (VectorVT == MVT::v4i8) {
2131 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2132 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2133 DAG.getConstant(0x7770, DL, MVT::i32));
2134 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2135 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2136 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2137 SDNodeFlags Flags;
2138 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2139 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2140 Ext->setFlags(Flags);
2141 return Ext;
2142 }
2143
2144 // Constant index will be matched by tablegen.
2145 if (isa<ConstantSDNode>(Index.getNode()))
2146 return Op;
2147
2148 // Extract individual elements and select one of them.
2149 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2150 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2151 EVT EltVT = VectorVT.getVectorElementType();
2152
2153 SDLoc dl(Op.getNode());
2154 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2155 DAG.getIntPtrConstant(0, dl));
2156 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2157 DAG.getIntPtrConstant(1, dl));
2158 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2160}
2161
2162SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2163 SelectionDAG &DAG) const {
2164 SDValue Vector = Op->getOperand(0);
2165 EVT VectorVT = Vector.getValueType();
2166
2167 if (VectorVT != MVT::v4i8)
2168 return Op;
2169 SDLoc DL(Op);
2170 SDValue Value = Op->getOperand(1);
2171 if (Value->isUndef())
2172 return Vector;
2173
2174 SDValue Index = Op->getOperand(2);
2175
2176 SDValue BFI =
2177 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2178 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2179 DAG.getNode(ISD::MUL, DL, MVT::i32,
2180 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2181 DAG.getConstant(8, DL, MVT::i32)),
2182 DAG.getConstant(8, DL, MVT::i32)});
2183 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2184}
2185
2186SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2187 SelectionDAG &DAG) const {
2188 SDValue V1 = Op.getOperand(0);
2189 EVT VectorVT = V1.getValueType();
2190 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2191 return Op;
2192
2193 // Lower shuffle to PRMT instruction.
2194 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2195 SDValue V2 = Op.getOperand(1);
2196 uint32_t Selector = 0;
2197 for (auto I : llvm::enumerate(SVN->getMask())) {
2198 if (I.value() != -1) // -1 is a placeholder for undef.
2199 Selector |= (I.value() << (I.index() * 4));
2200 }
2201
2202 SDLoc DL(Op);
2203 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2204 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2205 return DAG.getBitcast(Op.getValueType(), PRMT);
2206}
2207/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2208/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2209/// amount, or
2210/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2211/// amount.
2212SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2213 SelectionDAG &DAG) const {
2214 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2215 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2216
2217 EVT VT = Op.getValueType();
2218 unsigned VTBits = VT.getSizeInBits();
2219 SDLoc dl(Op);
2220 SDValue ShOpLo = Op.getOperand(0);
2221 SDValue ShOpHi = Op.getOperand(1);
2222 SDValue ShAmt = Op.getOperand(2);
2223 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2224
2225 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2226 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2227 // {dHi, dLo} = {aHi, aLo} >> Amt
2228 // dHi = aHi >> Amt
2229 // dLo = shf.r.clamp aLo, aHi, Amt
2230
2231 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2232 SDValue Lo =
2233 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2234
2235 SDValue Ops[2] = { Lo, Hi };
2236 return DAG.getMergeValues(Ops, dl);
2237 }
2238 else {
2239 // {dHi, dLo} = {aHi, aLo} >> Amt
2240 // - if (Amt>=size) then
2241 // dLo = aHi >> (Amt-size)
2242 // dHi = aHi >> Amt (this is either all 0 or all 1)
2243 // else
2244 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2245 // dHi = aHi >> Amt
2246
2247 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2248 DAG.getConstant(VTBits, dl, MVT::i32),
2249 ShAmt);
2250 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2251 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2252 DAG.getConstant(VTBits, dl, MVT::i32));
2253 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2254 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2255 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2256
2257 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2258 DAG.getConstant(VTBits, dl, MVT::i32),
2259 ISD::SETGE);
2260 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2261 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2262
2263 SDValue Ops[2] = { Lo, Hi };
2264 return DAG.getMergeValues(Ops, dl);
2265 }
2266}
2267
2268/// LowerShiftLeftParts - Lower SHL_PARTS, which
2269/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2270/// amount, or
2271/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2272/// amount.
2273SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2274 SelectionDAG &DAG) const {
2275 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2276 assert(Op.getOpcode() == ISD::SHL_PARTS);
2277
2278 EVT VT = Op.getValueType();
2279 unsigned VTBits = VT.getSizeInBits();
2280 SDLoc dl(Op);
2281 SDValue ShOpLo = Op.getOperand(0);
2282 SDValue ShOpHi = Op.getOperand(1);
2283 SDValue ShAmt = Op.getOperand(2);
2284
2285 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2286 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2287 // {dHi, dLo} = {aHi, aLo} << Amt
2288 // dHi = shf.l.clamp aLo, aHi, Amt
2289 // dLo = aLo << Amt
2290
2291 SDValue Hi =
2292 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2293 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2294
2295 SDValue Ops[2] = { Lo, Hi };
2296 return DAG.getMergeValues(Ops, dl);
2297 }
2298 else {
2299 // {dHi, dLo} = {aHi, aLo} << Amt
2300 // - if (Amt>=size) then
2301 // dLo = aLo << Amt (all 0)
2302 // dLo = aLo << (Amt-size)
2303 // else
2304 // dLo = aLo << Amt
2305 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2306
2307 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2308 DAG.getConstant(VTBits, dl, MVT::i32),
2309 ShAmt);
2310 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2311 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2312 DAG.getConstant(VTBits, dl, MVT::i32));
2313 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2314 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2315 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2316
2317 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2318 DAG.getConstant(VTBits, dl, MVT::i32),
2319 ISD::SETGE);
2320 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2321 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2322
2323 SDValue Ops[2] = { Lo, Hi };
2324 return DAG.getMergeValues(Ops, dl);
2325 }
2326}
2327
2328/// If the types match, convert the generic copysign to the NVPTXISD version,
2329/// otherwise bail ensuring that mismatched cases are properly expaned.
2330SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2331 SelectionDAG &DAG) const {
2332 EVT VT = Op.getValueType();
2333 SDLoc DL(Op);
2334
2335 SDValue In1 = Op.getOperand(0);
2336 SDValue In2 = Op.getOperand(1);
2337 EVT SrcVT = In2.getValueType();
2338
2339 if (!SrcVT.bitsEq(VT))
2340 return SDValue();
2341
2342 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2343}
2344
2345SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2346 EVT VT = Op.getValueType();
2347
2348 if (VT == MVT::f32)
2349 return LowerFROUND32(Op, DAG);
2350
2351 if (VT == MVT::f64)
2352 return LowerFROUND64(Op, DAG);
2353
2354 llvm_unreachable("unhandled type");
2355}
2356
2357// This is the the rounding method used in CUDA libdevice in C like code:
2358// float roundf(float A)
2359// {
2360// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2361// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2362// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2363// }
2364SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2365 SelectionDAG &DAG) const {
2366 SDLoc SL(Op);
2367 SDValue A = Op.getOperand(0);
2368 EVT VT = Op.getValueType();
2369
2370 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2371
2372 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2373 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2374 const unsigned SignBitMask = 0x80000000;
2375 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2376 DAG.getConstant(SignBitMask, SL, MVT::i32));
2377 const unsigned PointFiveInBits = 0x3F000000;
2378 SDValue PointFiveWithSignRaw =
2379 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2380 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2381 SDValue PointFiveWithSign =
2382 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2383 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2384 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2385
2386 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2387 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2388 SDValue IsLarge =
2389 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2390 ISD::SETOGT);
2391 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2392
2393 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2394 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2395 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2396 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2397 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2398}
2399
2400// The implementation of round(double) is similar to that of round(float) in
2401// that they both separate the value range into three regions and use a method
2402// specific to the region to round the values. However, round(double) first
2403// calculates the round of the absolute value and then adds the sign back while
2404// round(float) directly rounds the value with sign.
2405SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2406 SelectionDAG &DAG) const {
2407 SDLoc SL(Op);
2408 SDValue A = Op.getOperand(0);
2409 EVT VT = Op.getValueType();
2410
2411 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2412
2413 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2414 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2415 DAG.getConstantFP(0.5, SL, VT));
2416 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2417
2418 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2419 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2420 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2421 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2422 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2423 DAG.getConstantFP(0, SL, VT),
2424 RoundedA);
2425
2426 // Add sign to rounded_A
2427 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2428 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2429
2430 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2431 SDValue IsLarge =
2432 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2433 ISD::SETOGT);
2434 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2435}
2436
2438 EVT VT = N->getValueType(0);
2439 EVT NVT = MVT::f32;
2440 if (VT.isVector()) {
2441 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2442 }
2443 SDLoc DL(N);
2444 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2445 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2446 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2447 return DAG.getFPExtendOrRound(Res, DL, VT);
2448}
2449
2450SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2451 SelectionDAG &DAG) const {
2452 if (useF32FTZ(DAG.getMachineFunction())) {
2453 return PromoteBinOpToF32(Op.getNode(), DAG);
2454 }
2455 return Op;
2456}
2457
2458SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2459 SelectionDAG &DAG) const {
2460 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2461
2462 if (Op.getValueType() == MVT::bf16) {
2463 SDLoc Loc(Op);
2464 return DAG.getNode(
2465 ISD::FP_ROUND, Loc, MVT::bf16,
2466 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2467 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2468 }
2469
2470 // Everything else is considered legal.
2471 return Op;
2472}
2473
2474SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2475 SelectionDAG &DAG) const {
2476 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2477
2478 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2479 SDLoc Loc(Op);
2480 return DAG.getNode(
2481 Op.getOpcode(), Loc, Op.getValueType(),
2482 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2483 }
2484
2485 // Everything else is considered legal.
2486 return Op;
2487}
2488
2489SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2490 SelectionDAG &DAG) const {
2491 EVT NarrowVT = Op.getValueType();
2492 SDValue Wide = Op.getOperand(0);
2493 EVT WideVT = Wide.getValueType();
2494 if (NarrowVT.getScalarType() == MVT::bf16) {
2495 const TargetLowering *TLI = STI.getTargetLowering();
2496 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2497 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2498 }
2499 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2500 // This combination was the first to support f32 -> bf16.
2501 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2502 if (WideVT.getScalarType() == MVT::f32) {
2503 return Op;
2504 }
2505 if (WideVT.getScalarType() == MVT::f64) {
2506 SDLoc Loc(Op);
2507 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2508 // the hardware f32 -> bf16 instruction.
2510 WideVT.changeElementType(*DAG.getContext(), MVT::f32), Wide, Loc,
2511 DAG);
2512 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2513 }
2514 }
2515 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2516 }
2517 }
2518
2519 // Everything else is considered legal.
2520 return Op;
2521}
2522
2523SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2524 SelectionDAG &DAG) const {
2525 SDValue Narrow = Op.getOperand(0);
2526 EVT NarrowVT = Narrow.getValueType();
2527 EVT WideVT = Op.getValueType();
2528 if (NarrowVT.getScalarType() == MVT::bf16) {
2529 if (WideVT.getScalarType() == MVT::f32 &&
2530 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2531 SDLoc Loc(Op);
2532 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2533 }
2534 if (WideVT.getScalarType() == MVT::f64 &&
2535 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2536 EVT F32 = NarrowVT.changeElementType(*DAG.getContext(), MVT::f32);
2537 SDLoc Loc(Op);
2538 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2539 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2540 } else {
2541 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2542 }
2543 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2544 }
2545 }
2546
2547 // Everything else is considered legal.
2548 return Op;
2549}
2550
// NOTE(review): the line that opens this definition (its name and parameter
// list) is not present in this listing; the body uses `Op` and `DAG`.
//
// Scalarizes a v2i16 operation. For each element index, that element is
// extracted from every operand of Op, the same opcode is applied to the
// scalars, and the per-element results are reassembled with BUILD_VECTOR.
// Any other result type is returned unchanged.
2552 SDLoc DL(Op);
2553 if (Op.getValueType() != MVT::v2i16)
2554 return Op;
2555 EVT EltVT = Op.getValueType().getVectorElementType();
2556 SmallVector<SDValue> VecElements;
2557 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
// Collect element I of every operand.
2558 SmallVector<SDValue> ScalarArgs;
2559 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2560 [&](const SDUse &O) {
2561 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2562 O.get(), DAG.getIntPtrConstant(I, DL));
2563 });
// Apply the original opcode to the scalar operands.
2564 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2565 }
2566 SDValue V =
2567 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2568 return V;
2569}
2570
2572 SDNode *N = Op.getNode();
2573 SDLoc DL(N);
2575
2576 // split the vector argument
2577 for (size_t I = 0; I < N->getNumOperands(); I++) {
2578 SDValue Val = N->getOperand(I);
2579 EVT ValVT = Val.getValueType();
2580 if (ValVT.isVector()) {
2581 EVT EltVT = ValVT.getVectorElementType();
2582 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2583 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2584 DAG.getIntPtrConstant(J, DL)));
2585 } else
2586 Ops.push_back(Val);
2587 }
2588
2590 SDValue Tcgen05StNode =
2591 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2592 MemSD->getMemoryVT(), MemSD->getMemOperand());
2593
2594 return Tcgen05StNode;
2595}
2596
// NOTE(review): the line that opens this definition (its name and parameter
// list) is not present in this listing; the body uses `Op` and `DAG`.
//
// Byte-swap lowering built on getPRMT, dispatched on the result type. Each
// case feeds the source (as i32) and a zero constant to getPRMT with a fixed
// selector. Types other than i16, i32, v2i16 and i64 are unreachable.
2598 SDLoc DL(Op);
2599 SDValue Src = Op.getOperand(0);
2600 EVT VT = Op.getValueType();
2601
2602 switch (VT.getSimpleVT().SimpleTy) {
// i16: any-extend to i32, permute with selector 0x7701, truncate back.
2603 case MVT::i16: {
2604 SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
2605 SDValue Swapped =
2606 getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
2607 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
2608 }
// i32: a single permute with selector 0x0123.
2609 case MVT::i32: {
2610 return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
2611 }
// v2i16: bitcast to i32, permute with selector 0x2301, bitcast back.
2612 case MVT::v2i16: {
2613 SDValue Converted = DAG.getBitcast(MVT::i32, Src);
2614 SDValue Swapped =
2615 getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
2616 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
2617 }
// i64: unpack into two i32 values, permute each with selector 0x0123, then
// rebuild the i64 with the two permuted values in exchanged order.
2618 case MVT::i64: {
2619 SDValue UnpackSrc =
2620 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
2621 SDValue SwappedLow =
2622 getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123,
2623 DL, DAG);
2624 SDValue SwappedHigh =
2625 getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123,
2626 DL, DAG);
2627 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
2628 {SwappedHigh, SwappedLow});
2629 }
2630 default:
2631 llvm_unreachable("unsupported type for bswap");
2632 }
2633}
2634
2635static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2636 switch (IID) {
2637 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2638 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1;
2639 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2640 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2;
2641 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2642 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2643 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2644 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2645 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2646 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2647 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2648 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2649 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2650 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2651 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2652 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2653 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2654 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2655 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2656 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2657 case Intrinsic::
2658 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2659 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2660 case Intrinsic::
2661 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2662 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2663 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2664 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1;
2665 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2666 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2;
2667 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2668 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2669 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2670 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2671 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2672 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2673 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2674 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2675 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2676 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2677 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2678 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2679 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2680 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2681 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2682 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2683 case Intrinsic::
2684 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2685 return NVPTXISD::
2686 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2687 case Intrinsic::
2688 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2689 return NVPTXISD::
2690 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2691 };
2692 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2693}
2694
// NOTE(review): the line that opens this definition, the declaration of
// `Ops`, and the declaration of `MemSD` are not present in this listing.
//
// Rebuilds a tcgen05.mma disable_output_lane intrinsic node as a memory
// intrinsic node whose opcode comes from getTcgen05MMADisableOutputLane(IID).
// The intrinsic-ID operand (operand 1) is dropped, vector operands are
// flattened into their elements, and scalar operands are passed through.
2696 SDNode *N = Op.getNode();
2697 SDLoc DL(N);
2698 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2699
2701 // split the vector argument
2702 for (size_t I = 0; I < N->getNumOperands(); I++) {
2703 if (I == 1)
2704 continue; // skip IID
2705 SDValue Val = N->getOperand(I);
2706 EVT ValVT = Val.getValueType();
2707 if (ValVT.isVector()) {
2708 EVT EltVT = ValVT.getVectorElementType();
2709 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2710 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2711 DAG.getIntPtrConstant(J, DL)));
2712 } else
2713 Ops.push_back(Val);
2714 }
2715
// The new node reuses the original value types, memory VT and memory operand.
2717 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2718 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2719 MemSD->getMemoryVT(), MemSD->getMemOperand());
2720
2721 return Tcgen05MMANode;
2722}
2723
2724// Lower vector return type of tcgen05.ld intrinsics
//
// Returns an empty optional when result 0 of N is not a vector (already
// legalized). Otherwise returns {vector, chain}: a new node is built with one
// i32 result per vector element plus the chain type, and the scalar results
// are reassembled with BUILD_VECTOR.
//
// HasOffset selects whether operand 3 is an offset followed by the pack flag
// in operand 4, or operand 3 is the pack flag itself.
//
// NOTE(review): the declaration of `MemSD` and the first line of the call
// that creates `NewNode` are not present in this listing.
2725static std::optional<std::pair<SDValue, SDValue>>
2726lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
2727 SDLoc DL(N);
2728 EVT ResVT = N->getValueType(0);
2729 if (!ResVT.isVector())
2730 return {}; // already legalized.
2731
2732 const unsigned NumElts = ResVT.getVectorNumElements();
2733
2734 // Create the return type of the instructions
2735 SmallVector<EVT, 5> ListVTs;
2736 for (unsigned i = 0; i < NumElts; ++i)
2737 ListVTs.push_back(MVT::i32);
2738
2739 ListVTs.push_back(N->getValueType(1)); // Chain
2740
2741 SDVTList ResVTs = DAG.getVTList(ListVTs);
2742
// Operands 0-2 of the original node are always forwarded.
2743 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
2744 N->getOperand(2)};
2745
2746 if (HasOffset) {
2747 Ops.push_back(N->getOperand(3)); // offset
2748 Ops.push_back(N->getOperand(4)); // Pack flag
2749 } else
2750 Ops.push_back(N->getOperand(3)); // Pack flag
2751
2753 SDValue NewNode =
2755 MemSD->getMemoryVT(), MemSD->getMemOperand());
2756
2757 // split the vector result
2758 SmallVector<SDValue, 4> ScalarRes;
2759 for (unsigned i = 0; i < NumElts; ++i) {
2760 SDValue Res = NewNode.getValue(i);
2761 ScalarRes.push_back(Res);
2762 }
2763
// The chain is the result that follows the NumElts scalar results.
2764 SDValue Chain = NewNode.getValue(NumElts);
2765 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
2766 return {{BuildVector, Chain}};
2767}
2768
// NOTE(review): the first line of this definition's signature, the
// declaration of `M`, and the opening of the diagnostic call are not present
// in this listing. The body uses `Op`, `DAG` and `Val`.
//
// Builds the message "Intrinsic <name> with value <Val> is not supported on
// the given target." for the intrinsic whose ID is operand 1 of the node,
// then returns operand 0 of Op.
2770 unsigned Val) {
2771 SDNode *N = Op.getNode();
2772 SDLoc DL(N);
2773
2774 const Function &Fn = DAG.getMachineFunction().getFunction();
2775
// Address space defaults to 0 unless N is a MemIntrinsicSDNode; it is used
// to form the pointer type passed to Intrinsic::getName.
2776 unsigned AS = 0;
2777 if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N))
2778 AS = MemN->getAddressSpace();
2779 Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
2781
2783 Fn,
2784 "Intrinsic " +
2785 Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
2786 " with value " + Twine(Val) +
2787 " is not supported on the given target.",
2788 DL.getDebugLoc()));
2789 return Op.getOperand(0);
2790}
2791
// NOTE(review): the line that opens this definition and the first line of
// the `if` condition below are not present in this listing, so the exact
// check applied to `Val` cannot be seen here.
//
// Reads the immediate elemtype argument (operand 3). When the check on it
// succeeds, the result of reportInvalidTensormapReplaceUsage is returned;
// otherwise Op is returned unchanged.
2793 SDNode *N = Op.getNode();
2794 SDLoc DL(N);
2795
2796 // immediate argument representing elemtype
2797 unsigned Val = N->getConstantOperandVal(3);
2798
2800 Val))
2801 return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
2802
2803 return Op;
2804}
2805
// NOTE(review): the line that opens this definition and the first line of
// the `if` condition below are not present in this listing, so the exact
// check applied to `Val` cannot be seen here.
//
// Reads the immediate swizzle-mode argument (operand 3). When the check on it
// succeeds, the result of reportInvalidTensormapReplaceUsage is returned;
// otherwise Op is returned unchanged.
2807 SDNode *N = Op.getNode();
2808 SDLoc DL(N);
2809
2810 // immediate argument representing swizzle mode
2811 unsigned Val = N->getConstantOperandVal(3);
2812
2814 Val))
2815 return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
2816
2817 return Op;
2818}
2819
2821 SDNode *N = Op.getNode();
2822 SDValue Intrin = N->getOperand(1);
2823
2824 // Get the intrinsic ID
2825 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2826 switch (IntrinNo) {
2827 default:
2828 break;
2829 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2830 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2831 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2832 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2833 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2834 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2835 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2836 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2837 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2838 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2839 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2840 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2841 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2842 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2843 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2844 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2845 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2846 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2847 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2848 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2849 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2850 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2851 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2852 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2853 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2854 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2855 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2856 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2857 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2858 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2859 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2860 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2861 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2862 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2863 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2864 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2865 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2866 return lowerTcgen05St(Op, DAG);
2867 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2868 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2869 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2870 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2871 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2872 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2873 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2874 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2875 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2876 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2877 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2878 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2879 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2880 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2881 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2882 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2883 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2884 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2885 case Intrinsic::
2886 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2887 case Intrinsic::
2888 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2889 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2890 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2891 case Intrinsic::
2892 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2893 case Intrinsic::
2894 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2896 case Intrinsic::nvvm_tensormap_replace_elemtype:
2897 return lowerTensormapReplaceElemtype(Op, DAG);
2898 case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
2900 }
2901 return Op;
2902}
2903
2905 SelectionDAG &DAG) {
2906
2907 SDNode *N = Op.getNode();
2908 if (N->getOperand(1).getValueType() != MVT::i128) {
2909 // return, if the operand is already lowered
2910 return SDValue();
2911 }
2912
2913 unsigned IID =
2914 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2915 auto Opcode = [&]() {
2916 switch (IID) {
2917 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2918 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED;
2919 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2920 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X;
2921 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2922 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y;
2923 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2924 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z;
2925 default:
2926 llvm_unreachable("unsupported/unhandled intrinsic");
2927 }
2928 }();
2929
2930 SDLoc DL(N);
2931 SDValue TryCancelResponse = N->getOperand(1);
2932 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2933 SDValue TryCancelResponse0 =
2934 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2935 DAG.getIntPtrConstant(0, DL));
2936 SDValue TryCancelResponse1 =
2937 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2938 DAG.getIntPtrConstant(1, DL));
2939
2940 return DAG.getNode(Opcode, DL, N->getVTList(),
2941 {TryCancelResponse0, TryCancelResponse1});
2942}
2943
2945 SDNode *N = Op.getNode();
2946 SDLoc DL(N);
2947 SDValue F32Vec = N->getOperand(1);
2948 SDValue RBits = N->getOperand(2);
2949
2950 unsigned IntrinsicID = N->getConstantOperandVal(0);
2951
2952 // Extract the 4 float elements from the vector
2954 for (unsigned i = 0; i < 4; ++i)
2955 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec,
2956 DAG.getIntPtrConstant(i, DL)));
2957
2959
2960 auto [OpCode, RetTy, CvtModeFlag] =
2961 [&]() -> std::tuple<unsigned, MVT::SimpleValueType, uint32_t> {
2962 switch (IntrinsicID) {
2963 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
2964 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8,
2965 CvtMode::RS | CvtMode::RELU_FLAG};
2966 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
2967 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2968 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
2969 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8,
2970 CvtMode::RS | CvtMode::RELU_FLAG};
2971 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
2972 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2973 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
2974 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8,
2975 CvtMode::RS | CvtMode::RELU_FLAG};
2976 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
2977 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2978 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
2979 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8,
2980 CvtMode::RS | CvtMode::RELU_FLAG};
2981 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
2982 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2983 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
2984 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16,
2985 CvtMode::RS | CvtMode::RELU_FLAG};
2986 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
2987 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, CvtMode::RS};
2988 default:
2989 llvm_unreachable("unsupported/unhandled intrinsic");
2990 }
2991 }();
2992
2993 Ops.push_back(RBits);
2994 Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32));
2995
2996 return DAG.getNode(OpCode, DL, RetTy, Ops);
2997}
2998
3000 const unsigned Mode = [&]() {
3001 switch (Op->getConstantOperandVal(0)) {
3002 case Intrinsic::nvvm_prmt:
3004 case Intrinsic::nvvm_prmt_b4e:
3006 case Intrinsic::nvvm_prmt_ecl:
3008 case Intrinsic::nvvm_prmt_ecr:
3010 case Intrinsic::nvvm_prmt_f4e:
3012 case Intrinsic::nvvm_prmt_rc16:
3014 case Intrinsic::nvvm_prmt_rc8:
3016 default:
3017 llvm_unreachable("unsupported/unhandled intrinsic");
3018 }
3019 }();
3020 SDLoc DL(Op);
3021 SDValue A = Op->getOperand(1);
3022 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
3023 : DAG.getConstant(0, DL, MVT::i32);
3024 SDValue Selector = (Op->op_end() - 1)->get();
3025 return getPRMT(A, B, Selector, DL, DAG, Mode);
3026}
3027
// Token-pastes SHAPE, NUM and TYPE into the intrinsic enumerator name
// Intrinsic::nvvm_tcgen05_ld_red_<SHAPE>_x<NUM>_<TYPE>.
3028#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE) \
3029 Intrinsic::nvvm_tcgen05_ld_red_##SHAPE##_x##NUM##_##TYPE
3030
// Token-pastes SHAPE, NUM and TYPE into the target opcode name
// NVPTXISD::TCGEN05_LD_RED_<SHAPE>_X<NUM>_<TYPE>.
3031#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE) \
3032 NVPTXISD::TCGEN05_LD_RED_##SHAPE##_X##NUM##_##TYPE
3033
// Translates a tcgen05.ld.red intrinsic ID into the NVPTXISD::TCGEN05_LD_RED_*
// opcode with the same shape, vector count and element type.
//
// Handled IDs: shapes 32x32b and 16x32bx2, counts x2/x4/x8/x16/x32/x64/x128,
// element types f32 and i32. Any other ID reaches llvm_unreachable.
3034static unsigned getTcgen05LdRedID(Intrinsic::ID IID) {
3035 switch (IID) {
     // f32 variants
3036 case TCGEN05_LD_RED_INTR(32x32b, 2, f32):
3037 return TCGEN05_LD_RED_INST(32x32b, 2, F32);
3038 case TCGEN05_LD_RED_INTR(32x32b, 4, f32):
3039 return TCGEN05_LD_RED_INST(32x32b, 4, F32);
3040 case TCGEN05_LD_RED_INTR(32x32b, 8, f32):
3041 return TCGEN05_LD_RED_INST(32x32b, 8, F32);
3042 case TCGEN05_LD_RED_INTR(32x32b, 16, f32):
3043 return TCGEN05_LD_RED_INST(32x32b, 16, F32);
3044 case TCGEN05_LD_RED_INTR(32x32b, 32, f32):
3045 return TCGEN05_LD_RED_INST(32x32b, 32, F32);
3046 case TCGEN05_LD_RED_INTR(32x32b, 64, f32):
3047 return TCGEN05_LD_RED_INST(32x32b, 64, F32);
3048 case TCGEN05_LD_RED_INTR(32x32b, 128, f32):
3049 return TCGEN05_LD_RED_INST(32x32b, 128, F32);
3050 case TCGEN05_LD_RED_INTR(16x32bx2, 2, f32):
3051 return TCGEN05_LD_RED_INST(16x32bx2, 2, F32);
3052 case TCGEN05_LD_RED_INTR(16x32bx2, 4, f32):
3053 return TCGEN05_LD_RED_INST(16x32bx2, 4, F32);
3054 case TCGEN05_LD_RED_INTR(16x32bx2, 8, f32):
3055 return TCGEN05_LD_RED_INST(16x32bx2, 8, F32);
3056 case TCGEN05_LD_RED_INTR(16x32bx2, 16, f32):
3057 return TCGEN05_LD_RED_INST(16x32bx2, 16, F32);
3058 case TCGEN05_LD_RED_INTR(16x32bx2, 32, f32):
3059 return TCGEN05_LD_RED_INST(16x32bx2, 32, F32);
3060 case TCGEN05_LD_RED_INTR(16x32bx2, 64, f32):
3061 return TCGEN05_LD_RED_INST(16x32bx2, 64, F32);
3062 case TCGEN05_LD_RED_INTR(16x32bx2, 128, f32):
3063 return TCGEN05_LD_RED_INST(16x32bx2, 128, F32);
     // i32 variants
3064 case TCGEN05_LD_RED_INTR(32x32b, 2, i32):
3065 return TCGEN05_LD_RED_INST(32x32b, 2, I32);
3066 case TCGEN05_LD_RED_INTR(32x32b, 4, i32):
3067 return TCGEN05_LD_RED_INST(32x32b, 4, I32);
3068 case TCGEN05_LD_RED_INTR(32x32b, 8, i32):
3069 return TCGEN05_LD_RED_INST(32x32b, 8, I32);
3070 case TCGEN05_LD_RED_INTR(32x32b, 16, i32):
3071 return TCGEN05_LD_RED_INST(32x32b, 16, I32);
3072 case TCGEN05_LD_RED_INTR(32x32b, 32, i32):
3073 return TCGEN05_LD_RED_INST(32x32b, 32, I32);
3074 case TCGEN05_LD_RED_INTR(32x32b, 64, i32):
3075 return TCGEN05_LD_RED_INST(32x32b, 64, I32);
3076 case TCGEN05_LD_RED_INTR(32x32b, 128, i32):
3077 return TCGEN05_LD_RED_INST(32x32b, 128, I32);
3078 case TCGEN05_LD_RED_INTR(16x32bx2, 2, i32):
3079 return TCGEN05_LD_RED_INST(16x32bx2, 2, I32);
3080 case TCGEN05_LD_RED_INTR(16x32bx2, 4, i32):
3081 return TCGEN05_LD_RED_INST(16x32bx2, 4, I32);
3082 case TCGEN05_LD_RED_INTR(16x32bx2, 8, i32):
3083 return TCGEN05_LD_RED_INST(16x32bx2, 8, I32);
3084 case TCGEN05_LD_RED_INTR(16x32bx2, 16, i32):
3085 return TCGEN05_LD_RED_INST(16x32bx2, 16, I32);
3086 case TCGEN05_LD_RED_INTR(16x32bx2, 32, i32):
3087 return TCGEN05_LD_RED_INST(16x32bx2, 32, I32);
3088 case TCGEN05_LD_RED_INTR(16x32bx2, 64, i32):
3089 return TCGEN05_LD_RED_INST(16x32bx2, 64, I32);
3090 case TCGEN05_LD_RED_INTR(16x32bx2, 128, i32):
3091 return TCGEN05_LD_RED_INST(16x32bx2, 128, I32);
3092 default:
3093 llvm_unreachable("Invalid tcgen05.ld.red intrinsic ID");
3094 }
3095}
3096
3097// Lower vector return type of tcgen05.ld intrinsics
3098static std::optional<std::tuple<SDValue, SDValue, SDValue>>
3100 SDLoc DL(N);
3101 EVT ResVT = N->getValueType(0);
3102 if (!ResVT.isVector())
3103 return {}; // already legalized.
3104
3105 const unsigned NumElts = ResVT.getVectorNumElements();
3106
3107 // Create the return type of the instructions
3108 // +1 represents the reduction value
3109 SmallVector<EVT, 132> ListVTs{
3110 NumElts + 1,
3111 ResVT.getVectorElementType().isFloatingPoint() ? MVT::f32 : MVT::i32};
3112
3113 ListVTs.push_back(MVT::Other); // Chain
3114
3115 SDVTList ResVTs = DAG.getVTList(ListVTs);
3116
3117 // Prepare the Operands
3118 SmallVector<SDValue, 8> Ops{N->getOperand(0)}; // Chain
3119
3120 // skip IID at index 1
3121 for (unsigned i = 2; i < N->getNumOperands(); i++)
3122 Ops.push_back(N->getOperand(i));
3123
3124 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
3126 SDValue NewNode =
3127 DAG.getMemIntrinsicNode(getTcgen05LdRedID(IID), DL, ResVTs, Ops,
3128 MemSD->getMemoryVT(), MemSD->getMemOperand());
3129
3130 // Split vector result
3131 SmallVector<SDValue, 132> ScalarRes;
3132 for (unsigned i = 0; i < NumElts; ++i) {
3133 SDValue Res = NewNode.getValue(i);
3134 ScalarRes.push_back(Res);
3135 }
3136
3137 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
3138 SDValue RedResult = NewNode.getValue(NumElts);
3139 SDValue Chain = NewNode.getValue(NumElts + 1);
3140 return {{BuildVector, RedResult, Chain}};
3141}
3142
3144 switch (Op->getConstantOperandVal(1)) {
3145 default:
3146 return Op;
3147
3148 // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
3149 // lower them through LowerOperation() instead of ReplaceNodeResults().
3150 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
3151 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
3152 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
3153 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
3154 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3155 return SDValue();
3156
3157 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
3158 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
3159 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3160 return SDValue();
3161
3162 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
3163 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
3164 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32:
3165 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32:
3166 if (auto Res = lowerTcgen05LdRed(Op.getNode(), DAG))
3167 return DAG.getMergeValues(
3168 {std::get<0>(*Res), std::get<1>(*Res), std::get<2>(*Res)}, SDLoc(Op));
3169 return SDValue();
3170 }
3171}
3172
3174 switch (Op->getConstantOperandVal(0)) {
3175 default:
3176 return Op;
3177 case Intrinsic::nvvm_prmt:
3178 case Intrinsic::nvvm_prmt_b4e:
3179 case Intrinsic::nvvm_prmt_ecl:
3180 case Intrinsic::nvvm_prmt_ecr:
3181 case Intrinsic::nvvm_prmt_f4e:
3182 case Intrinsic::nvvm_prmt_rc16:
3183 case Intrinsic::nvvm_prmt_rc8:
3184 return lowerPrmtIntrinsic(Op, DAG);
3185 case Intrinsic::nvvm_internal_addrspace_wrap:
3186 return Op.getOperand(1);
3187 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
3188 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
3189 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
3190 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
3192 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
3193 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
3194 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
3195 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
3196 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
3197 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
3198 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
3199 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
3200 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
3201 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
3202 return lowerCvtRSIntrinsics(Op, DAG);
3203 }
3204}
3205
3206// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
3207// Lower these into a node returning the correct type which is zero-extended
3208// back to the correct size.
3210 SDValue V = Op->getOperand(0);
3211 assert(V.getValueType() == MVT::i64 &&
3212 "Unexpected CTLZ/CTPOP type to legalize");
3213
3214 SDLoc DL(Op);
3215 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
3216 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
3217}
3218
3220 unsigned Opcode, SelectionDAG &DAG) {
3221 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
3222
3223 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
3224 if (!AmtConst)
3225 return SDValue();
3226 const auto Amt = AmtConst->getZExtValue() & 63;
3227
3228 SDValue UnpackA =
3229 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
3230 SDValue UnpackB =
3231 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
3232
3233 // Arch is little-endian: 0 = low bits, 1 = high bits
3234 SDValue ALo = UnpackA.getValue(0);
3235 SDValue AHi = UnpackA.getValue(1);
3236 SDValue BLo = UnpackB.getValue(0);
3237 SDValue BHi = UnpackB.getValue(1);
3238
3239 // The bitfield consists of { AHi : ALo : BHi : BLo }
3240 //
3241 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
3242 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
3243 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
3244 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
3245 //
3246 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
3247 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
3248 // on the direction. Amt = 32 can be implemented by a packing and unpacking
3249 // move to select and arrange the 32bit values. For simplicity, these cases
3250 // are not handled here explicitly and instead we rely on DAGCombiner to
3251 // remove the no-op funnel shifts we insert.
3252 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
3253 ? std::make_tuple(AHi, ALo, BHi)
3254 : std::make_tuple(ALo, BHi, BLo);
3255
3256 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
3257 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
3258 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
3259
3260 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
3261}
3262
3264 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
3265 SDLoc(Op), Op->getOpcode(), DAG);
3266}
3267
3269 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
3270 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
3271 SDLoc(Op), Opcode, DAG);
3272}
3273
3275 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
3276 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
3277 // the semantics of LLVM's frem.
3278 SDLoc DL(Op);
3279 SDValue X = Op->getOperand(0);
3280 SDValue Y = Op->getOperand(1);
3281 EVT Ty = Op.getValueType();
3282 SDNodeFlags Flags = Op->getFlags();
3283
3284 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
3285 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
3286 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
3288 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
3290
3291 if (Flags.hasNoInfs())
3292 return Sub;
3293
3294 // If Y is infinite, return X
3295 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
3296 SDValue Inf =
3297 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
3298 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
3299 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
3300}
3301
3303 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
3304
3305 SDValue Cond = Op->getOperand(0);
3306 SDValue TrueVal = Op->getOperand(1);
3307 SDValue FalseVal = Op->getOperand(2);
3308 SDLoc DL(Op);
3309
3310 // If both operands are truncated, we push the select through the truncates.
3311 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
3312 FalseVal.getOpcode() == ISD::TRUNCATE) {
3313 TrueVal = TrueVal.getOperand(0);
3314 FalseVal = FalseVal.getOperand(0);
3315
3316 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3317 ? TrueVal.getValueType()
3318 : FalseVal.getValueType();
3319 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3320 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3321 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3322 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3323 }
3324
3325 // Otherwise, expand the select into a series of logical operations. These
3326 // often can be folded into other operations either by us or ptxas.
3327 TrueVal = DAG.getFreeze(TrueVal);
3328 FalseVal = DAG.getFreeze(FalseVal);
3329 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3330 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3331 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3332 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3333 return Or;
3334}
3335
3337 SDNode *N = Op.getNode();
3338
3339 SDValue Chain = N->getOperand(0);
3340 SDValue Val = N->getOperand(1);
3341 SDValue BasePtr = N->getOperand(2);
3342 SDValue Offset = N->getOperand(3);
3343 SDValue Mask = N->getOperand(4);
3344
3345 SDLoc DL(N);
3346 EVT ValVT = Val.getValueType();
3347 MemSDNode *MemSD = cast<MemSDNode>(N);
3348 assert(ValVT.isVector() && "Masked vector store must have vector type");
3349 assert(MemSD->getAlign() >= DAG.getEVTAlign(ValVT) &&
3350 "Unexpected alignment for masked store");
3351
3352 unsigned Opcode = 0;
3353 switch (ValVT.getSimpleVT().SimpleTy) {
3354 default:
3355 llvm_unreachable("Unexpected masked vector store type");
3356 case MVT::v4i64:
3357 case MVT::v4f64: {
3358 Opcode = NVPTXISD::StoreV4;
3359 break;
3360 }
3361 case MVT::v8i32:
3362 case MVT::v8f32: {
3363 Opcode = NVPTXISD::StoreV8;
3364 break;
3365 }
3366 }
3367
3369
3370 // Construct the new SDNode. First operand is the chain.
3371 Ops.push_back(Chain);
3372
3373 // The next N operands are the values to store. Encode the mask into the
3374 // values using the sentinel register 0 to represent a masked-off element.
3375 assert(Mask.getValueType().isVector() &&
3376 Mask.getValueType().getVectorElementType() == MVT::i1 &&
3377 "Mask must be a vector of i1");
3378 assert(Mask.getOpcode() == ISD::BUILD_VECTOR &&
3379 "Mask expected to be a BUILD_VECTOR");
3380 assert(Mask.getValueType().getVectorNumElements() ==
3381 ValVT.getVectorNumElements() &&
3382 "Mask size must be the same as the vector size");
3383 for (auto [I, Op] : enumerate(Mask->ops())) {
3384 // Mask elements must be constants.
3385 if (Op.getNode()->getAsZExtVal() == 0) {
3386 // Append a sentinel register 0 to the Ops vector to represent a masked
3387 // off element, this will be handled in tablegen
3389 ValVT.getVectorElementType()));
3390 } else {
3391 // Extract the element from the vector to store
3392 SDValue ExtVal =
3394 Val, DAG.getIntPtrConstant(I, DL));
3395 Ops.push_back(ExtVal);
3396 }
3397 }
3398
3399 // Next, the pointer operand.
3400 Ops.push_back(BasePtr);
3401
3402 // Finally, the offset operand. We expect this to always be undef, and it will
3403 // be ignored in lowering, but to mirror the handling of the other vector
3404 // store instructions we include it in the new SDNode.
3405 assert(Offset.getOpcode() == ISD::UNDEF &&
3406 "Offset operand expected to be undef");
3407 Ops.push_back(Offset);
3408
3409 SDValue NewSt =
3410 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3411 MemSD->getMemoryVT(), MemSD->getMemOperand());
3412
3413 return NewSt;
3414}
3415
3416SDValue
3418 switch (Op.getOpcode()) {
3419 case ISD::RETURNADDR:
3420 return SDValue();
3421 case ISD::FRAMEADDR:
3422 return SDValue();
3423 case ISD::ADDRSPACECAST:
3424 return LowerADDRSPACECAST(Op, DAG);
3426 return lowerIntrinsicWChain(Op, DAG);
3428 return lowerIntrinsicWOChain(Op, DAG);
3430 return lowerIntrinsicVoid(Op, DAG);
3431 case ISD::BUILD_VECTOR:
3432 return LowerBUILD_VECTOR(Op, DAG);
3433 case ISD::BITCAST:
3434 return LowerBITCAST(Op, DAG);
3436 return Op;
3438 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3440 return LowerINSERT_VECTOR_ELT(Op, DAG);
3442 return LowerVECTOR_SHUFFLE(Op, DAG);
3444 return LowerCONCAT_VECTORS(Op, DAG);
3449 return LowerVECREDUCE(Op, DAG);
3450 case ISD::STORE:
3451 return LowerSTORE(Op, DAG);
3452 case ISD::MSTORE: {
3453 assert(STI.has256BitVectorLoadStore(
3454 cast<MemSDNode>(Op.getNode())->getAddressSpace()) &&
3455 "Masked store vector not supported on subtarget.");
3456 return lowerMSTORE(Op, DAG);
3457 }
3458 case ISD::LOAD:
3459 return LowerLOAD(Op, DAG);
3460 case ISD::MLOAD:
3461 return LowerMLOAD(Op, DAG);
3462 case ISD::SHL_PARTS:
3463 return LowerShiftLeftParts(Op, DAG);
3464 case ISD::SRA_PARTS:
3465 case ISD::SRL_PARTS:
3466 return LowerShiftRightParts(Op, DAG);
3467 case ISD::SELECT:
3468 return lowerSELECT(Op, DAG);
3469 case ISD::FROUND:
3470 return LowerFROUND(Op, DAG);
3471 case ISD::FCOPYSIGN:
3472 return LowerFCOPYSIGN(Op, DAG);
3473 case ISD::SINT_TO_FP:
3474 case ISD::UINT_TO_FP:
3475 return LowerINT_TO_FP(Op, DAG);
3476 case ISD::FP_TO_SINT:
3477 case ISD::FP_TO_UINT:
3478 return LowerFP_TO_INT(Op, DAG);
3479 case ISD::FP_ROUND:
3480 return LowerFP_ROUND(Op, DAG);
3481 case ISD::FP_EXTEND:
3482 return LowerFP_EXTEND(Op, DAG);
3483 case ISD::VAARG:
3484 return LowerVAARG(Op, DAG);
3485 case ISD::VASTART:
3486 return LowerVASTART(Op, DAG);
3487 case ISD::FSHL:
3488 case ISD::FSHR:
3489 return lowerFSH(Op, DAG);
3490 case ISD::ROTL:
3491 case ISD::ROTR:
3492 return lowerROT(Op, DAG);
3493 case ISD::ABS:
3494 case ISD::SMIN:
3495 case ISD::SMAX:
3496 case ISD::UMIN:
3497 case ISD::UMAX:
3498 case ISD::ADD:
3499 case ISD::SUB:
3500 case ISD::MUL:
3501 case ISD::SHL:
3502 case ISD::SREM:
3503 case ISD::UREM:
3504 return LowerVectorArith(Op, DAG);
3506 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3507 case ISD::STACKRESTORE:
3508 return LowerSTACKRESTORE(Op, DAG);
3509 case ISD::STACKSAVE:
3510 return LowerSTACKSAVE(Op, DAG);
3511 case ISD::CopyToReg:
3512 return LowerCopyToReg_128(Op, DAG);
3513 case ISD::FADD:
3514 case ISD::FSUB:
3515 case ISD::FMUL:
3516 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3517 return PromoteBinOpIfF32FTZ(Op, DAG);
3518 case ISD::CTPOP:
3519 case ISD::CTLZ:
3520 return lowerCTLZCTPOP(Op, DAG);
3521 case ISD::FREM:
3522 return lowerFREM(Op, DAG);
3523 case ISD::BSWAP:
3524 return lowerBSWAP(Op, DAG);
3525 default:
3526 llvm_unreachable("Custom lowering not defined for operation");
3527 }
3528}
3529
3530// This will prevent AsmPrinter from trying to print the jump tables itself.
3534
// Custom lowering for ADDRSPACECAST.
//
// A cast with the generic address space on either side is returned unchanged.
// When neither side is generic, the shared / shared-cluster pair is rewritten
// as two casts routed through the generic space (source -> generic ->
// destination); every other combination produces UNDEF of the result type.
3535SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3536 SelectionDAG &DAG) const {
3538 unsigned SrcAS = N->getSrcAddressSpace();
3539 unsigned DestAS = N->getDestAddressSpace();
3540 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3541 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3542 // Shared and SharedCluster can be converted to each other through generic
3543 // space
3544 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3547 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
3548 SDLoc DL(Op.getNode());
3549 const MVT GenerictVT =
     // First hop: source address space -> generic.
3551 SDValue GenericConversion = DAG.getAddrSpaceCast(
3552 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
     // Second hop: generic -> destination address space, in the original
     // result type.
3553 SDValue SharedClusterConversion =
3554 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3555 ADDRESS_SPACE_GENERIC, DestAS);
3556 return SharedClusterConversion;
3557 }
3558
     // Any other cast between two non-generic address spaces yields UNDEF.
3559 return DAG.getUNDEF(Op.getValueType());
3560 }
3561
     // At least one side is generic: keep the node as is.
3562 return Op;
3563}
3564
3565// This function is almost a copy of SelectionDAG::expandVAArg().
3566// The only diff is that this one produces loads from local address space.
//
// Operand use below: 0 is the chain, 1 the va_list pointer, 2 the IR source
// value describing the va_list memory, and 3 the requested alignment as a
// constant. Returns the load of the argument value.
3567SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3568 const TargetLowering *TLI = STI.getTargetLowering();
3569 SDLoc DL(Op);
3570
3571 SDNode *Node = Op.getNode();
3572 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3573 EVT VT = Node->getValueType(0);
3574 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3575 SDValue Tmp1 = Node->getOperand(0);
3576 SDValue Tmp2 = Node->getOperand(1);
3577 const MaybeAlign MA(Node->getConstantOperandVal(3));
3578
     // Read the current va_list cursor; it is loaded as a pointer-typed value.
3579 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3580 Tmp1, Tmp2, MachinePointerInfo(V));
3581 SDValue VAList = VAListLoad;
3582
     // If the argument asks for more than the minimum stack-argument alignment,
     // round the cursor up: add (align - 1), then AND with -align.
3583 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3584 VAList = DAG.getNode(
3585 ISD::ADD, DL, VAList.getValueType(), VAList,
3586 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3587
3588 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3589 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3590 VAList.getValueType()));
3591 }
3592
3593 // Increment the pointer, VAList, to the next vaarg
3594 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3596 DL, VAList.getValueType()));
3597
3598 // Store the incremented VAList to the legalized pointer
     // The store is chained on the output chain of the cursor load above.
3599 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3600 MachinePointerInfo(V));
3601
3602 const Value *SrcV = Constant::getNullValue(
3604
3605 // Load the actual argument out of the pointer VAList
     // Uses the (possibly realigned) pre-increment cursor as the address and the
     // store above as its chain.
3606 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3607}
3608
// Custom lowering for VASTART.
//
// Emits a single store, chained on operand 0, that writes the vararg parameter
// symbol (requested from getParamSymbol with index -1) through the pointer in
// operand 1. Operand 2 is the IR source value used to build the store's
// MachinePointerInfo. Returns the store node.
3609SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3610 const TargetLowering *TLI = STI.getTargetLowering();
3611 SDLoc DL(Op);
3612 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3613
3614 // Store the address of unsized array <function>_vararg[] in the ap object.
3615 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3616
3617 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3618 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3619 MachinePointerInfo(SV));
3620}
3621
3622static std::pair<MemSDNode *, uint32_t>
3624 const NVPTXSubtarget &STI) {
3625 SDValue Chain = N->getOperand(0);
3626 SDValue BasePtr = N->getOperand(1);
3627 SDValue Mask = N->getOperand(3);
3628 [[maybe_unused]] SDValue Passthru = N->getOperand(4);
3629
3630 SDLoc DL(N);
3631 EVT ResVT = N->getValueType(0);
3632 assert(ResVT.isVector() && "Masked vector load must have vector type");
3633 // While we only expect poison passthru vectors as an input to the backend,
3634 // when the legalization framework splits a poison vector in half, it creates
3635 // two undef vectors, so we can technically expect those too.
3636 assert((Passthru.getOpcode() == ISD::POISON ||
3637 Passthru.getOpcode() == ISD::UNDEF) &&
3638 "Passthru operand expected to be poison or undef");
3639
3640 // Extract the mask and convert it to a uint32_t representing the used bytes
3641 // of the entire vector load
3642 uint32_t UsedBytesMask = 0;
3643 uint32_t ElementSizeInBits = ResVT.getVectorElementType().getSizeInBits();
3644 assert(ElementSizeInBits % 8 == 0 && "Unexpected element size");
3645 uint32_t ElementSizeInBytes = ElementSizeInBits / 8;
3646 uint32_t ElementMask = (1u << ElementSizeInBytes) - 1u;
3647
3648 for (SDValue Op : reverse(Mask->ops())) {
3649 // We technically only want to do this shift for every
3650 // iteration *but* the first, but in the first iteration UsedBytesMask is 0,
3651 // so this shift is a no-op.
3652 UsedBytesMask <<= ElementSizeInBytes;
3653
3654 // Mask elements must be constants.
3655 if (Op->getAsZExtVal() != 0)
3656 UsedBytesMask |= ElementMask;
3657 }
3658
3659 assert(UsedBytesMask != 0 && UsedBytesMask != UINT32_MAX &&
3660 "Unexpected masked load with elements masked all on or all off");
3661
3662 // Create a new load sd node to be handled normally by ReplaceLoadVector.
3663 MemSDNode *NewLD = cast<MemSDNode>(
3664 DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand()).getNode());
3665
3666 // If our subtarget does not support the used bytes mask pragma, "drop" the
3667 // mask by setting it to UINT32_MAX
3668 if (!STI.hasUsedBytesMaskPragma())
3669 UsedBytesMask = UINT32_MAX;
3670
3671 return {NewLD, UsedBytesMask};
3672}
3673
3674/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
///
/// On success the result is a {value, chain} pair: the loaded elements rebuilt
/// into a vector and bitcast back to the load's result type, plus the chain
/// output of the new LoadV2/LoadV4/LoadV8 node. std::nullopt is returned when
/// the load is extending (result type != memory type), has no vector lowering
/// shape, is under-aligned, or splits into an element count other than 2, 4
/// or 8.
///
/// NOTE(review): listing lines 3676-3677 (function name and parameter list),
/// 3708 and 3751 are absent from this copy of the file. LD, DAG and STI are
/// presumably parameters declared on the missing lines -- confirm against the
/// real source before relying on this description.
3675static std::optional<std::pair<SDValue, SDValue>>
3678 const EVT ResVT = LD->getValueType(0);
3679 const EVT MemVT = LD->getMemoryVT();
3680
3681 // If we're doing sign/zero extension as part of the load, avoid lowering to
3682 // a LoadV node. TODO: consider relaxing this restriction.
3683 if (ResVT != MemVT)
3684 return std::nullopt;
3685
     // Ask how this vector type should be split: NumElts pieces of type EltVT.
     // An empty optional means no split is available and we give up.
3686 const auto NumEltsAndEltVT =
3687 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3688 if (!NumEltsAndEltVT)
3689 return std::nullopt;
3690 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3691
3692 Align Alignment = LD->getAlign();
3693 const auto &TD = DAG.getDataLayout();
3694 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3695 if (Alignment < PrefAlign) {
3696 // This load is not sufficiently aligned, so bail out and let this vector
3697 // load be scalarized. Note that we may still be able to emit smaller
3698 // vector loads. For example, if we are loading a <4 x float> with an
3699 // alignment of 8, this check will fail but the legalizer will try again
3700 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3701 return std::nullopt;
3702 }
3703
3704 // If we have a masked load, convert it to a normal load now
     // UsedBytesMask stays empty for ordinary loads; it is materialised as
     // UINT32_MAX further down via value_or().
     // NOTE(review): the right-hand side of the std::tie assignment (listing
     // line 3708) is missing from this copy.
3705 std::optional<uint32_t> UsedBytesMask = std::nullopt;
3706 if (LD->getOpcode() == ISD::MLOAD)
3707 std::tie(LD, UsedBytesMask) =
3709
3710 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3711 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3712 // loaded type to i16 and propagate the "real" type as the memory type.
3713 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3714
     // Pick the target opcode from the element count; any other count is not
     // lowered here.
3715 unsigned Opcode;
3716 switch (NumElts) {
3717 default:
3718 return std::nullopt;
3719 case 2:
3720 Opcode = NVPTXISD::LoadV2;
3721 break;
3722 case 4:
3723 Opcode = NVPTXISD::LoadV4;
3724 break;
3725 case 8:
3726 Opcode = NVPTXISD::LoadV8;
3727 break;
3728 }
     // Result list: one LoadEltVT per loaded element, followed by MVT::Other
     // for the chain.
3729 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3730 ListVTs.push_back(MVT::Other);
3731 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3732
3733 SDLoc DL(LD);
3734
3735 // Copy regular operands
3736 SmallVector<SDValue, 8> OtherOps(LD->ops());
3737
     // Extra operand: the used-bytes mask, or UINT32_MAX when none was
     // produced above.
3738 OtherOps.push_back(
3739 DAG.getConstant(UsedBytesMask.value_or(UINT32_MAX), DL, MVT::i32));
3740
3741 // The select routine does not have access to the LoadSDNode instance, so
3742 // pass along the extension information
3743 OtherOps.push_back(
3744 DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));
3745
3746 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3747 LD->getMemOperand());
3748
     // Flatten the per-piece results into individual scalars.
3749 SmallVector<SDValue> ScalarRes;
3750 if (EltVT.isVector()) {
3752 assert(NumElts * EltVT.getVectorNumElements() ==
3753 ResVT.getVectorNumElements());
3754 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3755 // into individual elements.
3756 for (const unsigned I : llvm::seq(NumElts)) {
3757 SDValue SubVector = NewLD.getValue(I);
3758 DAG.ExtractVectorElements(SubVector, ScalarRes);
3759 }
3760 } else {
     // Scalar pieces: undo the widening to LoadEltVT (see above) with a
     // truncate when the element type was widened.
3761 for (const unsigned I : llvm::seq(NumElts)) {
3762 SDValue Res = NewLD.getValue(I);
3763 if (LoadEltVT != EltVT)
3764 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3765 ScalarRes.push_back(Res);
3766 }
3767 }
3768
     // The chain is the result that follows the NumElts element results.
3769 SDValue LoadChain = NewLD.getValue(NumElts);
3770
     // Reassemble the scalars into one vector and bitcast it back to the
     // original result type.
3771 const MVT BuildVecVT =
3772 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3773 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3774 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3775
3776 return {{LoadValue, LoadChain}};
3777}
3778
// Wrapper around replaceLoadVector() that reports its outcome through Results.
// NOTE(review): the lines carrying this function's name and leading parameters
// (listing lines 3779-3780) are missing from this copy; N, DAG and Results are
// presumably declared there.
3781 const NVPTXSubtarget &STI) {
     // On success append the lowered value followed by its chain; on failure
     // Results is left untouched.
3782 if (auto Res = replaceLoadVector(N, DAG, STI))
3783 Results.append({Res->first, Res->second});
3784}
3785
// Wrapper around replaceLoadVector() that returns a single SDValue.
// NOTE(review): the line carrying this function's name and leading parameters
// (listing line 3786) is missing from this copy; N and DAG are presumably
// declared there.
3787 const NVPTXSubtarget &STI) {
     // On success bundle {value, chain} into one MERGE_VALUES node; otherwise
     // return an empty SDValue.
3788 if (auto Res = replaceLoadVector(N, DAG, STI))
3789 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3790 return SDValue();
3791}
3792
3793// v = ld i1* addr
3794// =>
3795// v1 = ld i8* addr (-> i16)
3796// v = trunc i16 to i1
//
// NOTE(review): the signature line (listing line 3797) is missing from this
// copy; LD and DAG are presumably its parameters (LowerLOAD calls
// lowerLOADi1(LD, DAG)).
3798 SDLoc dl(LD);
3799 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3800 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
     // Re-issue the load as a zero-extending i8 -> i16 load that reuses the
     // original chain, pointer, pointer info, alignment and memory flags.
3801 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3802 LD->getBasePtr(), LD->getPointerInfo(),
3803 MVT::i8, LD->getAlign(),
3804 LD->getMemOperand()->getFlags());
3805 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3806 // The legalizer (the caller) is expecting two values from the legalized
3807 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3808 // in LegalizeDAG.cpp which also uses MergeValues.
     // NOTE(review): the chain handed back is LD's incoming chain, not the
     // output chain of newLD -- confirm this is intended.
3809 return DAG.getMergeValues({result, LD->getChain()}, dl);
3810}
3811
3812SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3813 LoadSDNode *LD = cast<LoadSDNode>(Op);
3814
3815 if (Op.getValueType() == MVT::i1)
3816 return lowerLOADi1(LD, DAG);
3817
3818 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3819 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3820 // we allow for more DAG combine opportunities.
3821 if (LD->getExtensionType() == ISD::EXTLOAD) {
3822 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3823 "Unexpected fpext-load");
3824 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3825 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3826 LD->getMemOperand());
3827 }
3828
3829 llvm_unreachable("Unexpected custom lowering for load");
3830}
3831
// Custom lowering for masked loads. Packed vector result types are rewritten
// into an NVPTXISD::MLoad memory-intrinsic node that carries the used-bytes
// mask and the extension type as two extra trailing operands. For every other
// type an empty SDValue is returned.
3832SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
3833 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
3834 // masked loads of these types and have to handle them here.
3835 // v2f32 also needs to be handled here if the subtarget has f32x2
3836 // instructions, making it legal.
3837 //
3838 // Note: misaligned masked loads should never reach this point
3839 // because the override of isLegalMaskedLoad in NVPTXTargetTransformInfo.cpp
3840 // will validate alignment. Therefore, we do not need to special case handle
3841 // them here.
3842 EVT VT = Op.getValueType();
3843 if (NVPTX::isPackedVectorTy(VT)) {
     // NOTE(review): the line that declares Result and names the callee
     // (listing line 3844) is missing from this copy. From the uses below,
     // Result holds a MemSDNode* at index 0 and a uint32_t mask at index 1.
3845 cast<MemSDNode>(Op.getNode()), DAG, STI);
3846 MemSDNode *LD = std::get<0>(Result);
3847 uint32_t UsedBytesMask = std::get<1>(Result);
3848
3849 SDLoc DL(LD);
3850
3851 // Copy regular operands
3852 SmallVector<SDValue, 8> OtherOps(LD->ops());
3853
     // Extra operand: the used-bytes mask as an i32 constant.
3854 OtherOps.push_back(DAG.getConstant(UsedBytesMask, DL, MVT::i32));
3855
3856 // We currently are not lowering extending loads, but pass the extension
3857 // type anyway as later handling expects it.
3858 OtherOps.push_back(
3859 DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));
     // The new node reuses LD's result types, memory type and memory operand.
3860 SDValue NewLD =
3861 DAG.getMemIntrinsicNode(NVPTXISD::MLoad, DL, LD->getVTList(), OtherOps,
3862 LD->getMemoryVT(), LD->getMemOperand());
3863 return NewLD;
3864 }
3865 return SDValue();
3866}
3867
// Lower a vector store into an NVPTXISD::StoreV2/StoreV4/StoreV8 node whose
// operands are: chain, the NumElts split values, then the store's remaining
// operands. An empty SDValue is returned when the store truncates, has no
// vector lowering shape, is under-aligned, or splits into an element count
// other than 2, 4 or 8.
//
// NOTE(review): listing lines 3868 (function name and leading parameters),
// 3915 (presumably the declaration of Ops) and 3922 are missing from this
// copy; Op and DAG are presumably parameters (LowerSTORE calls
// lowerSTOREVector(Op, DAG, STI)).
3869 const NVPTXSubtarget &STI) {
3870 MemSDNode *N = cast<MemSDNode>(Op.getNode());
     // Operand 1 of the store node is the value being stored (operand 0 is
     // the chain, see below).
3871 SDValue Val = N->getOperand(1);
3872 SDLoc DL(N);
3873 const EVT ValVT = Val.getValueType();
3874 const EVT MemVT = N->getMemoryVT();
3875
3876 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3877 // TODO: consider relaxing this restriction.
3878 if (ValVT != MemVT)
3879 return SDValue();
3880
     // Ask how this vector type should be split: NumElts pieces of type EltVT.
3881 const auto NumEltsAndEltVT =
3882 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3883 if (!NumEltsAndEltVT)
3884 return SDValue();
3885 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3886
3887 const DataLayout &TD = DAG.getDataLayout();
3888
3889 Align Alignment = N->getAlign();
3890 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3891 if (Alignment < PrefAlign) {
3892 // This store is not sufficiently aligned, so bail out and let this vector
3893 // store be scalarized. Note that we may still be able to emit smaller
3894 // vector stores. For example, if we are storing a <4 x float> with an
3895 // alignment of 8, this check will fail but the legalizer will try again
3896 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3897 return SDValue();
3898 }
3899
     // Pick the target opcode from the element count; any other count is not
     // lowered here.
3900 unsigned Opcode;
3901 switch (NumElts) {
3902 default:
3903 return SDValue();
3904 case 2:
3905 Opcode = NVPTXISD::StoreV2;
3906 break;
3907 case 4:
3908 Opcode = NVPTXISD::StoreV4;
3909 break;
3910 case 8:
3911 Opcode = NVPTXISD::StoreV8;
3912 break;
3913 }
3914
3916
3917 // First is the chain
3918 Ops.push_back(N->getOperand(0));
3919
3920 // Then the split values
3921 if (EltVT.isVector()) {
3923 assert(NumElts * EltVT.getVectorNumElements() ==
3924 ValVT.getVectorNumElements());
3925 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3926 // stored as b32s
3927 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3928 for (const unsigned I : llvm::seq(NumElts)) {
     // Sub-vector I covers source elements
     // [I * NumEltsPerSubVector, (I + 1) * NumEltsPerSubVector).
3929 SmallVector<SDValue, 4> SubVectorElts;
3930 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3931 NumEltsPerSubVector);
3932 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3933 }
3934 } else {
     // Scalar pieces: view the value as <NumElts x EltVT> and extract each
     // element in turn.
3935 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3936 for (const unsigned I : llvm::seq(NumElts)) {
3937 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3938 DAG.getIntPtrConstant(I, DL));
3939
3940 // Since StoreV2 is a target node, we cannot rely on DAG type
3941 // legalization. Therefore, we must ensure the type is legal. For i1 and
3942 // i8, we set the stored type to i16 and propagate the "real" type as the
3943 // memory type.
3944 if (EltVT.getSizeInBits() < 16)
3945 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3946 Ops.push_back(ExtVal);
3947 }
3948 }
3949
3950 // Then any remaining arguments
     // (everything after the chain and the stored value, i.e. operand 2 on).
3951 Ops.append(N->op_begin() + 2, N->op_end());
3952
     // The new node produces only a chain and keeps the original memory type
     // and memory operand.
3953 SDValue NewSt =
3954 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3955 N->getMemoryVT(), N->getMemOperand());
3956
3957 // return DCI.CombineTo(N, NewSt, true);
3958 return NewSt;
3959}
3960
3961SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3962 StoreSDNode *Store = cast<StoreSDNode>(Op);
3963 EVT VT = Store->getMemoryVT();
3964
3965 if (VT == MVT::i1)
3966 return LowerSTOREi1(Op, DAG);
3967
3968 // Lower store of any other vector type, including v2f32 as we want to break
3969 // it apart since this is not a widely-supported type.
3970 return lowerSTOREVector(Op, DAG, STI);
3971}
3972
3973// st i1 v, addr
3974// =>
3975// v1 = zxt v to i16
3976// st.u8 i16, addr
3977SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3978 SDNode *Node = Op.getNode();
3979 SDLoc dl(Node);
3980 StoreSDNode *ST = cast<StoreSDNode>(Node);
3981 SDValue Tmp1 = ST->getChain();
3982 SDValue Tmp2 = ST->getBasePtr();
3983 SDValue Tmp3 = ST->getValue();
3984 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3985 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3986 SDValue Result =
3987 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3988 ST->getAlign(), ST->getMemOperand()->getFlags());
3989 return Result;
3990}
3991
// Rewrite a CopyToReg carrying a 128-bit value into a CopyToReg whose value
// is passed as two i64 operands (low half first, then high half). The result
// types of the original node are reused unchanged.
3992SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3993 SelectionDAG &DAG) const {
3994 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3995 // operand so that it can pass the legalization.
3996
     // NOTE(review): the assert inspects operand 1 (labelled "Dst Reg" below)
     // while the value that gets split is operand 2 -- confirm that checking
     // the register operand's type is what is intended.
3997 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3998 "Custom lowering for 128-bit CopyToReg only");
3999
4000 SDNode *Node = Op.getNode();
4001 SDLoc DL(Node);
4002
     // View the 128-bit value as <2 x i64>; element 0 becomes Lo and element 1
     // becomes Hi.
4003 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
4004 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
4005 DAG.getIntPtrConstant(0, DL));
4006 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
4007 DAG.getIntPtrConstant(1, DL));
4008
     // NOTE(review): the declaration of NewOps (listing line 4009) is missing
     // from this copy; it must hold at least five entries when a glue operand
     // is present, since index 4 is written below.
4010 SmallVector<EVT, 3> ResultsType(Node->values());
4011
4012 NewOps[0] = Op->getOperand(0); // Chain
4013 NewOps[1] = Op->getOperand(1); // Dst Reg
4014 NewOps[2] = Lo; // Lower 64-bit
4015 NewOps[3] = Hi; // Higher 64-bit
     // A fourth input operand, if present, is forwarded as the last operand.
4016 if (Op.getNumOperands() == 4)
4017 NewOps[4] = Op->getOperand(3); // Glue if exists
4018
4019 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
4020}
4021
4022unsigned NVPTXTargetLowering::getNumRegisters(
4023 LLVMContext &Context, EVT VT,
4024 std::optional<MVT> RegisterVT = std::nullopt) const {
4025 if (VT == MVT::i128 && RegisterVT == MVT::i128)
4026 return 1;
4027 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
4028}
4029
4030bool NVPTXTargetLowering::splitValueIntoRegisterParts(
4031 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4032 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4033 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
4034 Parts[0] = Val;
4035 return true;
4036 }
4037 return false;
4038}
4039
4040// This creates target external symbol for a function parameter.
4041// Name of the symbol is composed from its index and the function name.
4042// Negative index corresponds to special parameter (unsized array) used for
4043// passing variable arguments.
//
// I is the parameter index and T the value type given to the symbol node.
4044SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
4045 EVT T) const {
     // The name is stored in the target machine's string pool before its
     // character data is handed to the DAG (presumably so the pointer stays
     // valid after this call returns -- confirm).
     // NOTE(review): the argument of save() (listing line 4047) is missing
     // from this copy.
4046 StringRef SavedStr = nvTM->getStrPool().save(
4048 return DAG.getExternalSymbol(SavedStr.data(), T);
4049}
4050
4051SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
4052 EVT T) const {
4053 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
4054 return DAG.getExternalSymbol(SavedStr.data(), T);
4055}
4056
// Lower the incoming formal arguments of the current function. For every IR
// argument, one SDValue per matching entry of Ins is appended to InVals:
//  * UNDEF for arguments without uses,
//  * the param symbol (kernels) or an address-space-cast MoveParam of it
//    (other functions) for byval arguments,
//  * otherwise values extracted from (possibly vectorized) loads off the
//    argument's param symbol.
// The incoming Chain is returned unchanged.
//
// NOTE(review): the line carrying the function name (listing line 4057) and
// listing lines 4130, 4134-4135 and 4156-4158 are missing from this copy.
4058 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4059 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4060 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4061 const DataLayout &DL = DAG.getDataLayout();
4062 LLVMContext &Ctx = *DAG.getContext();
4063 auto PtrVT = getPointerTy(DAG.getDataLayout());
4064
4065 const Function &F = DAG.getMachineFunction().getFunction();
4066
     // Root is used as the chain operand of every parameter load below.
4067 SDValue Root = DAG.getRoot();
4068 SmallVector<SDValue, 16> OutChains;
4069
4070 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
4071 // Ins.size() will be larger
4072 // * if there is an aggregate argument with multiple fields (each field
4073 // showing up separately in Ins)
4074 // * if there is a vector argument with more than typical vector-length
4075 // elements (generally if more than 4) where each vector element is
4076 // individually present in Ins.
4077 // So a different index should be used for indexing into Ins.
4078 // See similar issue in LowerCall.
4079
     // AllIns is consumed front to back: for each argument, peel off the
     // leading run of entries whose OrigArgIndex matches that argument.
4080 auto AllIns = ArrayRef(Ins);
4081 for (const auto &Arg : F.args()) {
4082 const auto ArgIns = AllIns.take_while(
4083 [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); });
4084 AllIns = AllIns.drop_front(ArgIns.size());
4085
4086 Type *Ty = Arg.getType();
4087
4088 if (ArgIns.empty())
4089 report_fatal_error("Empty parameter types are not supported");
4090
4091 if (Arg.use_empty()) {
4092 // argument is dead
     // Still emit one placeholder per Ins entry so InVals stays aligned
     // with Ins.
4093 for (const auto &In : ArgIns) {
4094 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
4095 InVals.push_back(DAG.getUNDEF(In.VT));
4096 }
4097 continue;
4098 }
4099
4100 SDValue ArgSymbol = getParamSymbol(DAG, Arg.getArgNo(), PtrVT);
4101
4102 // In the following cases, assign a node order of "i+1"
4103 // to newly created nodes. The SDNodes for params have to
4104 // appear in the same order as their order of appearance
4105 // in the original function. "i+1" holds that order.
     // ("i" is Arg.getArgNo() in the setIROrder calls below.)
4106 if (Arg.hasByValAttr()) {
4107 // Param has ByVal attribute
4108 // Return MoveParam(param symbol).
4109 // Ideally, the param symbol can be returned directly,
4110 // but when SDNode builder decides to use it in a CopyToReg(),
4111 // machine instruction fails because TargetExternalSymbol
4112 // (not lowered) is target dependent, and CopyToReg assumes
4113 // the source is lowered.
4114 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
4115 const auto &ByvalIn = ArgIns[0];
4116 assert(getValueType(DL, Ty) == ByvalIn.VT &&
4117 "Ins type did not match function type");
4118 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
4119
4120 SDValue P;
4121 if (isKernelFunction(F)) {
     // Kernels hand back the param symbol itself.
4122 assert(isParamGridConstant(Arg) && "ByVal argument must be lowered to "
4123 "grid_constant by NVPTXLowerArgs");
4124 P = ArgSymbol;
4125 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4126 } else {
     // Other functions wrap the symbol in MoveParam and then cast it out
     // of ADDRESS_SPACE_LOCAL.
     // NOTE(review): the destination address space of the cast (listing
     // line 4130) is missing from this copy.
4127 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
4128 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4129 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
4131 }
4132 InVals.push_back(P);
4133 } else {
     // NOTE(review): the declarations of VTs and Offsets (listing lines
     // 4134-4135) are missing from this copy.
4136 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
4137 assert(VTs.size() == ArgIns.size() && "Size mismatch");
4138 assert(VTs.size() == Offsets.size() && "Size mismatch");
4139
4140 const Align ArgAlign = getFunctionArgumentAlignment(
4141 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
4142
     // I indexes VTs/Offsets/ArgIns; each entry of VI is the number of
     // consecutive pieces fetched by one load.
4143 unsigned I = 0;
4144 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
4145 for (const unsigned NumElts : VI) {
4146 // i1 is loaded/stored as i8
4147 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
4148 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
4149
4150 SDValue VecAddr = DAG.getObjectPtrOffset(
4151 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
4152
4153 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
     // NOTE(review): the remaining getLoad arguments (listing lines
     // 4156-4158) are missing from this copy.
4154 SDValue P =
4155 DAG.getLoad(VecVT, dl, Root, VecAddr,
4159 P.getNode()->setIROrder(Arg.getArgNo() + 1);
     // Split the loaded value back into its pieces and convert each one to
     // the type and flags recorded for it in Ins.
4160 for (const unsigned J : llvm::seq(NumElts)) {
4161 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
4162
4163 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
4164 DAG, dl);
4165 InVals.push_back(Elt);
4166 }
4167 I += NumElts;
4168 }
4169 }
4170 }
4171
     // NOTE(review): none of the lines visible in this copy append to
     // OutChains, so this branch looks unreachable as shown -- confirm against
     // the real source.
4172 if (!OutChains.empty())
4173 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
4174
4175 return Chain;
4176}
4177
// Lower a function return. A void return emits only NVPTXISD::RET_GLUE on the
// incoming chain. Otherwise the outgoing values are stored, possibly
// vectorized, at their offsets from the "func_retval0" symbol, each store
// threaded onto Chain, and RET_GLUE is emitted on the final chain.
//
// NOTE(review): listing lines 4179 and 4181 (function name and some
// parameters, presumably including Chain, CallConv and Outs), 4204-4205
// (presumably the declarations of VTs and Offsets), 4211, 4228 and 4235 are
// missing from this copy.
4178SDValue
4180 bool isVarArg,
4182 const SmallVectorImpl<SDValue> &OutVals,
4183 const SDLoc &dl, SelectionDAG &DAG) const {
4184 const Function &F = DAG.getMachineFunction().getFunction();
4185 Type *RetTy = F.getReturnType();
4186
4187 if (RetTy->isVoidTy()) {
     // NOTE(review): the condition requires that there are NO outgoing
     // values for a void return; the message wording reads as the opposite.
4188 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
4189 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
4190 }
4191
4192 const DataLayout &DL = DAG.getDataLayout();
4193 LLVMContext &Ctx = *DAG.getContext();
4194
4195 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
4196 const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
4197
4198 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
4199 // 32-bits are sign extended or zero extended, depending on whether
4200 // they are signed or unsigned types.
4201 const bool ExtendIntegerRetVal =
4202 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
4203
4206 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
4207 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
4208
     // GetRetVal(I) yields outgoing value I converted to the type it is stored
     // as: i32 when the whole return value is a sub-32-bit integer, otherwise
     // the promoted scalar type with i1 replaced by i8.
4209 const auto GetRetVal = [&](unsigned I) -> SDValue {
4210 SDValue RetVal = OutVals[I];
4212 RetVal.getValueType() &&
4213 "OutVal type should always be legal");
4214
4215 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
4216 const EVT StoreVT =
4217 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
4218 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
4219 };
4220
     // I indexes VTs/Offsets/OutVals; each entry of VI is the number of
     // consecutive pieces written by one store.
4221 unsigned I = 0;
4222 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
4223 for (const unsigned NumElts : VI) {
     // No explicit alignment is computed when the value is widened to i32.
4224 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
4225 ? MaybeAlign(std::nullopt)
4226 : commonAlignment(RetAlign, Offsets[I]);
4227
     // NOTE(review): the start of the statement defining Val (listing line
     // 4228) is missing; the visible tail passes a callback that fetches
     // pieces I .. I + NumElts - 1 through GetRetVal.
4229 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
4230
4231 SDValue Ptr =
4232 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
4233
     // NOTE(review): the remaining getStore arguments (listing line 4235) are
     // missing from this copy.
4234 Chain = DAG.getStore(Chain, dl, Val, Ptr,
4236
4237 I += NumElts;
4238 }
4239
4240 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
4241}
4242
// Inline-asm operand lowering hook.
// NOTE(review): the line carrying the function name (listing line 4243) and
// the statement after the early return (listing line 4248) are missing from
// this copy, so only the guard below can be described.
4244 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
4245 SelectionDAG &DAG) const {
     // Constraints longer than one character are ignored: the function returns
     // without touching Ops.
4246 if (Constraint.size() > 1)
4247 return;
4249}
4250
4251// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4252// TgtMemIntrinsic
4253// because we need the information that is only available in the "Value" type
4254// of destination
4255// pointer. In particular, the address space information.
4258 MachineFunction &MF, unsigned Intrinsic) const {
4259 IntrinsicInfo Info;
4260 switch (Intrinsic) {
4261 default:
4262 return;
4263 case Intrinsic::nvvm_match_all_sync_i32p:
4264 case Intrinsic::nvvm_match_all_sync_i64p:
4265 Info.opc = ISD::INTRINSIC_W_CHAIN;
4266 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4267 // in order to model data exchange with other threads, but perform no real
4268 // memory accesses.
4269 Info.memVT = MVT::i1;
4270
4271 // Our result depends on both our and other thread's arguments.
4273 Infos.push_back(Info);
4274 return;
4275 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4276 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4277 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4278 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4279 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4280 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4281 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4282 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4283 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4284 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4285 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4286 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4287 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4288 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4289 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4290 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4291 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4292 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4293 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4294 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4295 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4296 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4297 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4298 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4299 Info.opc = ISD::INTRINSIC_W_CHAIN;
4300 Info.memVT = MVT::v8f16;
4301 Info.ptrVal = I.getArgOperand(0);
4302 Info.offset = 0;
4303 Info.flags = MachineMemOperand::MOLoad;
4304 Info.align = Align(16);
4305 Infos.push_back(Info);
4306 return;
4307 }
4308 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4309 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4310 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4311 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4312 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4313 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4314 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4315 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4316 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4317 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4318 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4319 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4320 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4321 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4322 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4323 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4324 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4325 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4326 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4327 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4328 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4329 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4330 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4331 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4332 Info.opc = ISD::INTRINSIC_W_CHAIN;
4333 Info.memVT = MVT::v2i32;
4334 Info.ptrVal = I.getArgOperand(0);
4335 Info.offset = 0;
4336 Info.flags = MachineMemOperand::MOLoad;
4337 Info.align = Align(8);
4338 Infos.push_back(Info);
4339 return;
4340 }
4341
4342 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4343 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4344 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4345 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4346 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4347 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4348 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4349 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4350 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4351 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4352 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4353 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4354 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4355 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4356 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4357 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4358
4359 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4360 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4361 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4362 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4363 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4364 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4365 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4366 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4367 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4368 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4369 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4370 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4371 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4372 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4373 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4374 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4375 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4376 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
4377 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
4378 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
4379 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
4380 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
4381 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
4382 Info.opc = ISD::INTRINSIC_W_CHAIN;
4383 Info.memVT = MVT::v4i32;
4384 Info.ptrVal = I.getArgOperand(0);
4385 Info.offset = 0;
4386 Info.flags = MachineMemOperand::MOLoad;
4387 Info.align = Align(16);
4388 Infos.push_back(Info);
4389 return;
4390 }
4391
4392 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4393 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4394 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4395 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4396 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4397 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4398 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4399 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4400
4401 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4402 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4403 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4404 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4405 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4406 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4407 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4408 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4409 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4410 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4411 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4412 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4413 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4414 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4415 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4416 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4417 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4418 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4419 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4420 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4421 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4422 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
4423 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
4424 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
4425 Info.opc = ISD::INTRINSIC_W_CHAIN;
4426 Info.memVT = MVT::i32;
4427 Info.ptrVal = I.getArgOperand(0);
4428 Info.offset = 0;
4429 Info.flags = MachineMemOperand::MOLoad;
4430 Info.align = Align(4);
4431 Infos.push_back(Info);
4432 return;
4433 }
4434
4435 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4436 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4437 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4438 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4439 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4440 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4441 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4442 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4443 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4444 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4445 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4446 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4447 Info.opc = ISD::INTRINSIC_W_CHAIN;
4448 Info.memVT = MVT::v4f16;
4449 Info.ptrVal = I.getArgOperand(0);
4450 Info.offset = 0;
4451 Info.flags = MachineMemOperand::MOLoad;
4452 Info.align = Align(16);
4453 Infos.push_back(Info);
4454 return;
4455 }
4456
4457 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4458 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4459 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4460 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4461 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4462 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4463 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4464 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4465 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4466 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4467 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4468 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4469 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4470 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4471 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4472 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4473 Info.opc = ISD::INTRINSIC_W_CHAIN;
4474 Info.memVT = MVT::v8f32;
4475 Info.ptrVal = I.getArgOperand(0);
4476 Info.offset = 0;
4477 Info.flags = MachineMemOperand::MOLoad;
4478 Info.align = Align(16);
4479 Infos.push_back(Info);
4480 return;
4481 }
4482
4483 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4484 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4485 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4486 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4487
4488 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4489 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4490 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4491 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4492
4493 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4494 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4495 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4496 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4497 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4498 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4499 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4500 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4501 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4502 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4503 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4504 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4505 Info.opc = ISD::INTRINSIC_W_CHAIN;
4506 Info.memVT = MVT::v8i32;
4507 Info.ptrVal = I.getArgOperand(0);
4508 Info.offset = 0;
4509 Info.flags = MachineMemOperand::MOLoad;
4510 Info.align = Align(16);
4511 Infos.push_back(Info);
4512 return;
4513 }
4514
4515 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4516 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4517 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4518 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4519 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4520 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4521 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4522 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4523 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4524 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4525 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4526 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4527 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4528 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4529 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4530 Info.opc = ISD::INTRINSIC_W_CHAIN;
4531 Info.memVT = MVT::v2i32;
4532 Info.ptrVal = I.getArgOperand(0);
4533 Info.offset = 0;
4534 Info.flags = MachineMemOperand::MOLoad;
4535 Info.align = Align(8);
4536 Infos.push_back(Info);
4537 return;
4538 }
4539
4540 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4541 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4542 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4543 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4544
4545 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4546 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4547 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4548 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4549 Info.opc = ISD::INTRINSIC_W_CHAIN;
4550 Info.memVT = MVT::f64;
4551 Info.ptrVal = I.getArgOperand(0);
4552 Info.offset = 0;
4553 Info.flags = MachineMemOperand::MOLoad;
4554 Info.align = Align(8);
4555 Infos.push_back(Info);
4556 return;
4557 }
4558
4559 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4560 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4561 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4562 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4563 Info.opc = ISD::INTRINSIC_W_CHAIN;
4564 Info.memVT = MVT::v2f64;
4565 Info.ptrVal = I.getArgOperand(0);
4566 Info.offset = 0;
4567 Info.flags = MachineMemOperand::MOLoad;
4568 Info.align = Align(16);
4569 Infos.push_back(Info);
4570 return;
4571 }
4572
4573 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4574 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4575 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4576 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4577 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4578 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4579 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4580 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4581 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4582 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4583 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4584 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4585 Info.opc = ISD::INTRINSIC_VOID;
4586 Info.memVT = MVT::v4f16;
4587 Info.ptrVal = I.getArgOperand(0);
4588 Info.offset = 0;
4589 Info.flags = MachineMemOperand::MOStore;
4590 Info.align = Align(16);
4591 Infos.push_back(Info);
4592 return;
4593 }
4594
4595 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4596 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4597 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4598 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4599 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4600 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4601 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4602 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4603 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4604 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4605 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4606 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4607 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4608 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4609 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4610 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4611 Info.opc = ISD::INTRINSIC_VOID;
4612 Info.memVT = MVT::v8f32;
4613 Info.ptrVal = I.getArgOperand(0);
4614 Info.offset = 0;
4615 Info.flags = MachineMemOperand::MOStore;
4616 Info.align = Align(16);
4617 Infos.push_back(Info);
4618 return;
4619 }
4620
4621 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4622 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4623 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4624 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4625 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4626 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4627 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4628 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4629 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4630 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4631 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4632 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4633 Info.opc = ISD::INTRINSIC_VOID;
4634 Info.memVT = MVT::v8i32;
4635 Info.ptrVal = I.getArgOperand(0);
4636 Info.offset = 0;
4637 Info.flags = MachineMemOperand::MOStore;
4638 Info.align = Align(16);
4639 Infos.push_back(Info);
4640 return;
4641 }
4642
4643 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4644 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4645 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4646 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4647 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4648 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4649 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4650 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4651 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4652 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4653 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4654 Info.opc = ISD::INTRINSIC_VOID;
4655 Info.memVT = MVT::v2i32;
4656 Info.ptrVal = I.getArgOperand(0);
4657 Info.offset = 0;
4658 Info.flags = MachineMemOperand::MOStore;
4659 Info.align = Align(8);
4660 Infos.push_back(Info);
4661 return;
4662 }
4663
4664 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4665 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4666 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4667 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4668 Info.opc = ISD::INTRINSIC_VOID;
4669 Info.memVT = MVT::v2f64;
4670 Info.ptrVal = I.getArgOperand(0);
4671 Info.offset = 0;
4672 Info.flags = MachineMemOperand::MOStore;
4673 Info.align = Align(16);
4674 Infos.push_back(Info);
4675 return;
4676 }
4677
4678 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4679 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4680 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4681 Info.opc = ISD::INTRINSIC_VOID;
4682 Info.memVT = MVT::i32;
4683 Info.ptrVal = I.getArgOperand(0);
4684 Info.offset = 0;
4685 Info.flags = MachineMemOperand::MOStore;
4686 Info.align = Align(4);
4687 Infos.push_back(Info);
4688 return;
4689 }
4690
4691 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4692 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4693 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4694 Info.opc = ISD::INTRINSIC_VOID;
4695 Info.memVT = MVT::v4i32;
4696 Info.ptrVal = I.getArgOperand(0);
4697 Info.offset = 0;
4698 Info.flags = MachineMemOperand::MOStore;
4699 Info.align = Align(16);
4700 Infos.push_back(Info);
4701 return;
4702 }
4703
4704 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4705 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4706 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4707 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4708 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4709 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4710 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4711 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4712 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4713 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4714 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4715 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4716 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4717 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4718 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4719 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4720 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4721 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4722 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4723 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4724 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4725 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4726 auto &DL = I.getDataLayout();
4727 Info.opc = ISD::INTRINSIC_W_CHAIN;
4728 Info.memVT = getValueType(DL, I.getType());
4729 Info.ptrVal = I.getArgOperand(0);
4730 Info.offset = 0;
4732 Info.align.reset();
4733 Infos.push_back(Info);
4734 return;
4735 }
4736
4737 case Intrinsic::nvvm_prefetch_tensormap: {
4738 auto &DL = I.getDataLayout();
4739 Info.opc = ISD::INTRINSIC_VOID;
4740 Info.memVT = getPointerTy(DL);
4741 Info.ptrVal = I.getArgOperand(0);
4742 Info.offset = 0;
4743 Info.flags =
4745 Info.align.reset();
4746 Infos.push_back(Info);
4747 return;
4748 }
4749
4750 case Intrinsic::nvvm_tensormap_replace_global_address:
4751 case Intrinsic::nvvm_tensormap_replace_global_stride: {
4752 Info.opc = ISD::INTRINSIC_VOID;
4753 Info.memVT = MVT::i64;
4754 Info.ptrVal = I.getArgOperand(0);
4755 Info.offset = 0;
4756 Info.flags = MachineMemOperand::MOStore;
4757 Info.align.reset();
4758 Infos.push_back(Info);
4759 return;
4760 }
4761
4762 case Intrinsic::nvvm_tensormap_replace_rank:
4763 case Intrinsic::nvvm_tensormap_replace_box_dim:
4764 case Intrinsic::nvvm_tensormap_replace_global_dim:
4765 case Intrinsic::nvvm_tensormap_replace_element_stride:
4766 case Intrinsic::nvvm_tensormap_replace_elemtype:
4767 case Intrinsic::nvvm_tensormap_replace_interleave_layout:
4768 case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
4769 case Intrinsic::nvvm_tensormap_replace_swizzle_atomicity:
4770 case Intrinsic::nvvm_tensormap_replace_fill_mode: {
4771 Info.opc = ISD::INTRINSIC_VOID;
4772 Info.memVT = MVT::i32;
4773 Info.ptrVal = I.getArgOperand(0);
4774 Info.offset = 0;
4775 Info.flags = MachineMemOperand::MOStore;
4776 Info.align.reset();
4777 Infos.push_back(Info);
4778 return;
4779 }
4780
4781 case Intrinsic::nvvm_ldu_global_i:
4782 case Intrinsic::nvvm_ldu_global_f:
4783 case Intrinsic::nvvm_ldu_global_p: {
4784 Info.opc = ISD::INTRINSIC_W_CHAIN;
4785 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4786 Info.ptrVal = I.getArgOperand(0);
4787 Info.offset = 0;
4788 Info.flags = MachineMemOperand::MOLoad;
4789 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4790
4791 Infos.push_back(Info);
4792 return;
4793 }
4794 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4795 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4796 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4797 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4798 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4799 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4800 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4801 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4802 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4803 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4804 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4805 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4806 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4807 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4808 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4809 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4810 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4811 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4812 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4813 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4814 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4815 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4816 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4817 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4818 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4819 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4820 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4821 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4822 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4823 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4824 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4825 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4826 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4827 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4828 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4829 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4830 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4831 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4832 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4833 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4834 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4835 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4836 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4837 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4838 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4839 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4840 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4841 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4842 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4843 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4844 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4845 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4846 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4847 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4848 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4849 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4850 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4851 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4852 Info.opc = ISD::INTRINSIC_W_CHAIN;
4853 Info.memVT = MVT::v4f32;
4854 Info.ptrVal = nullptr;
4855 Info.offset = 0;
4856 Info.flags = MachineMemOperand::MOLoad;
4857 Info.align = Align(16);
4858 Infos.push_back(Info);
4859 return;
4860
4861 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4862 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4863 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4864 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4865 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4866 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4867 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4868 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4869 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4870 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4871 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4872 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4873 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4874 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4875 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4876 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4877 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4878 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4879 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4880 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4881 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4882 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4883 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4884 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4885 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4886 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4887 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4888 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4889 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4890 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4891 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4892 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4893 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4894 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4895 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4896 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4897 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4898 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4899 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4900 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4901 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4902 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4903 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4904 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4905 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4906 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4907 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4908 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4909 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4910 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4911 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4912 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4913 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4914 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4915 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4916 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4917 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4918 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4919 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4920 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4921 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4922 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4923 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4924 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4925 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4926 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4927 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4928 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4929 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4930 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4931 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4932 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4933 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4934 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4935 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4936 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4937 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4938 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4939 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4940 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4941 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4942 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4943 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4944 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4945 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4946 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4947 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4948 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4949 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4950 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4951 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4952 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4953 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4954 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4955 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4956 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4957 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4958 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4959 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4960 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4961 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4962 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4963 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4964 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4965 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4966 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4967 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4968 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4969 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4970 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4971 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4972 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4973 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4974 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4975 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4976 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4977 Info.opc = ISD::INTRINSIC_W_CHAIN;
4978 Info.memVT = MVT::v4i32;
4979 Info.ptrVal = nullptr;
4980 Info.offset = 0;
4981 Info.flags = MachineMemOperand::MOLoad;
4982 Info.align = Align(16);
4983 Infos.push_back(Info);
4984 return;
4985
4986 case Intrinsic::nvvm_suld_1d_i8_clamp:
4987 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4988 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4989 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4990 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4991 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4992 case Intrinsic::nvvm_suld_2d_i8_clamp:
4993 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4994 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4995 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4996 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4997 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4998 case Intrinsic::nvvm_suld_3d_i8_clamp:
4999 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
5000 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
5001 case Intrinsic::nvvm_suld_1d_i8_trap:
5002 case Intrinsic::nvvm_suld_1d_v2i8_trap:
5003 case Intrinsic::nvvm_suld_1d_v4i8_trap:
5004 case Intrinsic::nvvm_suld_1d_array_i8_trap:
5005 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
5006 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
5007 case Intrinsic::nvvm_suld_2d_i8_trap:
5008 case Intrinsic::nvvm_suld_2d_v2i8_trap:
5009 case Intrinsic::nvvm_suld_2d_v4i8_trap:
5010 case Intrinsic::nvvm_suld_2d_array_i8_trap:
5011 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
5012 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
5013 case Intrinsic::nvvm_suld_3d_i8_trap:
5014 case Intrinsic::nvvm_suld_3d_v2i8_trap:
5015 case Intrinsic::nvvm_suld_3d_v4i8_trap:
5016 case Intrinsic::nvvm_suld_1d_i8_zero:
5017 case Intrinsic::nvvm_suld_1d_v2i8_zero:
5018 case Intrinsic::nvvm_suld_1d_v4i8_zero:
5019 case Intrinsic::nvvm_suld_1d_array_i8_zero:
5020 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
5021 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
5022 case Intrinsic::nvvm_suld_2d_i8_zero:
5023 case Intrinsic::nvvm_suld_2d_v2i8_zero:
5024 case Intrinsic::nvvm_suld_2d_v4i8_zero:
5025 case Intrinsic::nvvm_suld_2d_array_i8_zero:
5026 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
5027 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
5028 case Intrinsic::nvvm_suld_3d_i8_zero:
5029 case Intrinsic::nvvm_suld_3d_v2i8_zero:
5030 case Intrinsic::nvvm_suld_3d_v4i8_zero:
5031 Info.opc = ISD::INTRINSIC_W_CHAIN;
5032 Info.memVT = MVT::i8;
5033 Info.ptrVal = nullptr;
5034 Info.offset = 0;
5035 Info.flags = MachineMemOperand::MOLoad;
5036 Info.align = Align(16);
5037 Infos.push_back(Info);
5038 return;
5039
5040 case Intrinsic::nvvm_suld_1d_i16_clamp:
5041 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
5042 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
5043 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
5044 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
5045 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
5046 case Intrinsic::nvvm_suld_2d_i16_clamp:
5047 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
5048 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
5049 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
5050 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
5051 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
5052 case Intrinsic::nvvm_suld_3d_i16_clamp:
5053 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
5054 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
5055 case Intrinsic::nvvm_suld_1d_i16_trap:
5056 case Intrinsic::nvvm_suld_1d_v2i16_trap:
5057 case Intrinsic::nvvm_suld_1d_v4i16_trap:
5058 case Intrinsic::nvvm_suld_1d_array_i16_trap:
5059 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
5060 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
5061 case Intrinsic::nvvm_suld_2d_i16_trap:
5062 case Intrinsic::nvvm_suld_2d_v2i16_trap:
5063 case Intrinsic::nvvm_suld_2d_v4i16_trap:
5064 case Intrinsic::nvvm_suld_2d_array_i16_trap:
5065 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
5066 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
5067 case Intrinsic::nvvm_suld_3d_i16_trap:
5068 case Intrinsic::nvvm_suld_3d_v2i16_trap:
5069 case Intrinsic::nvvm_suld_3d_v4i16_trap:
5070 case Intrinsic::nvvm_suld_1d_i16_zero:
5071 case Intrinsic::nvvm_suld_1d_v2i16_zero:
5072 case Intrinsic::nvvm_suld_1d_v4i16_zero:
5073 case Intrinsic::nvvm_suld_1d_array_i16_zero:
5074 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
5075 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
5076 case Intrinsic::nvvm_suld_2d_i16_zero:
5077 case Intrinsic::nvvm_suld_2d_v2i16_zero:
5078 case Intrinsic::nvvm_suld_2d_v4i16_zero:
5079 case Intrinsic::nvvm_suld_2d_array_i16_zero:
5080 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
5081 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
5082 case Intrinsic::nvvm_suld_3d_i16_zero:
5083 case Intrinsic::nvvm_suld_3d_v2i16_zero:
5084 case Intrinsic::nvvm_suld_3d_v4i16_zero:
5085 Info.opc = ISD::INTRINSIC_W_CHAIN;
5086 Info.memVT = MVT::i16;
5087 Info.ptrVal = nullptr;
5088 Info.offset = 0;
5089 Info.flags = MachineMemOperand::MOLoad;
5090 Info.align = Align(16);
5091 Infos.push_back(Info);
5092 return;
5093
5094 case Intrinsic::nvvm_suld_1d_i32_clamp:
5095 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
5096 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
5097 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
5098 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
5099 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
5100 case Intrinsic::nvvm_suld_2d_i32_clamp:
5101 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
5102 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
5103 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
5104 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
5105 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
5106 case Intrinsic::nvvm_suld_3d_i32_clamp:
5107 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
5108 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
5109 case Intrinsic::nvvm_suld_1d_i32_trap:
5110 case Intrinsic::nvvm_suld_1d_v2i32_trap:
5111 case Intrinsic::nvvm_suld_1d_v4i32_trap:
5112 case Intrinsic::nvvm_suld_1d_array_i32_trap:
5113 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
5114 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
5115 case Intrinsic::nvvm_suld_2d_i32_trap:
5116 case Intrinsic::nvvm_suld_2d_v2i32_trap:
5117 case Intrinsic::nvvm_suld_2d_v4i32_trap:
5118 case Intrinsic::nvvm_suld_2d_array_i32_trap:
5119 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
5120 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
5121 case Intrinsic::nvvm_suld_3d_i32_trap:
5122 case Intrinsic::nvvm_suld_3d_v2i32_trap:
5123 case Intrinsic::nvvm_suld_3d_v4i32_trap:
5124 case Intrinsic::nvvm_suld_1d_i32_zero:
5125 case Intrinsic::nvvm_suld_1d_v2i32_zero:
5126 case Intrinsic::nvvm_suld_1d_v4i32_zero:
5127 case Intrinsic::nvvm_suld_1d_array_i32_zero:
5128 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
5129 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
5130 case Intrinsic::nvvm_suld_2d_i32_zero:
5131 case Intrinsic::nvvm_suld_2d_v2i32_zero:
5132 case Intrinsic::nvvm_suld_2d_v4i32_zero:
5133 case Intrinsic::nvvm_suld_2d_array_i32_zero:
5134 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
5135 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
5136 case Intrinsic::nvvm_suld_3d_i32_zero:
5137 case Intrinsic::nvvm_suld_3d_v2i32_zero:
5138 case Intrinsic::nvvm_suld_3d_v4i32_zero:
5139 Info.opc = ISD::INTRINSIC_W_CHAIN;
5140 Info.memVT = MVT::i32;
5141 Info.ptrVal = nullptr;
5142 Info.offset = 0;
5143 Info.flags = MachineMemOperand::MOLoad;
5144 Info.align = Align(16);
5145 Infos.push_back(Info);
5146 return;
5147
5148 case Intrinsic::nvvm_suld_1d_i64_clamp:
5149 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
5150 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
5151 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
5152 case Intrinsic::nvvm_suld_2d_i64_clamp:
5153 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
5154 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
5155 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5156 case Intrinsic::nvvm_suld_3d_i64_clamp:
5157 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5158 case Intrinsic::nvvm_suld_1d_i64_trap:
5159 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5160 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5161 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5162 case Intrinsic::nvvm_suld_2d_i64_trap:
5163 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5164 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5165 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5166 case Intrinsic::nvvm_suld_3d_i64_trap:
5167 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5168 case Intrinsic::nvvm_suld_1d_i64_zero:
5169 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5170 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5171 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5172 case Intrinsic::nvvm_suld_2d_i64_zero:
5173 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5174 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5175 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5176 case Intrinsic::nvvm_suld_3d_i64_zero:
5177 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5178 Info.opc = ISD::INTRINSIC_W_CHAIN;
5179 Info.memVT = MVT::i64;
5180 Info.ptrVal = nullptr;
5181 Info.offset = 0;
5182 Info.flags = MachineMemOperand::MOLoad;
5183 Info.align = Align(16);
5184 Infos.push_back(Info);
5185 return;
5186
5187 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
5188 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
5189 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
5190 Info.opc = ISD::INTRINSIC_W_CHAIN;
5191 Info.memVT = MVT::v1i32;
5192 Info.ptrVal = I.getArgOperand(0);
5193 Info.offset = 0;
5194 Info.flags = MachineMemOperand::MOLoad;
5195 Info.align.reset();
5196 Infos.push_back(Info);
5197 return;
5198 }
5199
5200 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
5201 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
5202 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
5203 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
5204 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
5205 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32: {
5206 Info.opc = ISD::INTRINSIC_W_CHAIN;
5207 Info.memVT = MVT::v2i32;
5208 Info.ptrVal = I.getArgOperand(0);
5209 Info.offset = 0;
5210 Info.flags = MachineMemOperand::MOLoad;
5211 Info.align.reset();
5212 Infos.push_back(Info);
5213 return;
5214 }
5215
5216 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
5217 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32: {
5218 Info.opc = ISD::INTRINSIC_W_CHAIN;
5219 Info.memVT = MVT::v2f32;
5220 Info.ptrVal = I.getArgOperand(0);
5221 Info.offset = 0;
5222 Info.flags = MachineMemOperand::MOLoad;
5223 Info.align.reset();
5224 Infos.push_back(Info);
5225 return;
5226 }
5227
5228 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
5229 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
5230 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
5231 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
5232 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
5233 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
5234 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32: {
5235 Info.opc = ISD::INTRINSIC_W_CHAIN;
5236 Info.memVT = MVT::v4i32;
5237 Info.ptrVal = I.getArgOperand(0);
5238 Info.offset = 0;
5239 Info.flags = MachineMemOperand::MOLoad;
5240 Info.align.reset();
5241 Infos.push_back(Info);
5242 return;
5243 }
5244
5245 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
5246 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32: {
5247 Info.opc = ISD::INTRINSIC_W_CHAIN;
5248 Info.memVT = MVT::v4f32;
5249 Info.ptrVal = I.getArgOperand(0);
5250 Info.offset = 0;
5251 Info.flags = MachineMemOperand::MOLoad;
5252 Info.align.reset();
5253 Infos.push_back(Info);
5254 return;
5255 }
5256
5257 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
5258 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
5259 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
5260 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
5261 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
5262 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
5263 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32: {
5264 Info.opc = ISD::INTRINSIC_W_CHAIN;
5265 Info.memVT = MVT::v8i32;
5266 Info.ptrVal = I.getArgOperand(0);
5267 Info.offset = 0;
5268 Info.flags = MachineMemOperand::MOLoad;
5269 Info.align.reset();
5270 Infos.push_back(Info);
5271 return;
5272 }
5273
5274 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
5275 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32: {
5276 Info.opc = ISD::INTRINSIC_W_CHAIN;
5277 Info.memVT = MVT::v8f32;
5278 Info.ptrVal = I.getArgOperand(0);
5279 Info.offset = 0;
5280 Info.flags = MachineMemOperand::MOLoad;
5281 Info.align.reset();
5282 Infos.push_back(Info);
5283 return;
5284 }
5285
5286 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
5287 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
5288 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
5289 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
5290 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
5291 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
5292 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32: {
5293 Info.opc = ISD::INTRINSIC_W_CHAIN;
5294 Info.memVT = MVT::v16i32;
5295 Info.ptrVal = I.getArgOperand(0);
5296 Info.offset = 0;
5297 Info.flags = MachineMemOperand::MOLoad;
5298 Info.align.reset();
5299 Infos.push_back(Info);
5300 return;
5301 }
5302
5303 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
5304 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32: {
5305 Info.opc = ISD::INTRINSIC_W_CHAIN;
5306 Info.memVT = MVT::v16f32;
5307 Info.ptrVal = I.getArgOperand(0);
5308 Info.offset = 0;
5309 Info.flags = MachineMemOperand::MOLoad;
5310 Info.align.reset();
5311 Infos.push_back(Info);
5312 return;
5313 }
5314
5315 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
5316 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
5317 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
5318 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
5319 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
5320 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
5321 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32: {
5322 Info.opc = ISD::INTRINSIC_W_CHAIN;
5323 Info.memVT = MVT::v32i32;
5324 Info.ptrVal = I.getArgOperand(0);
5325 Info.offset = 0;
5326 Info.flags = MachineMemOperand::MOLoad;
5327 Info.align.reset();
5328 Infos.push_back(Info);
5329 return;
5330 }
5331
5332 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
5333 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32: {
5334 Info.opc = ISD::INTRINSIC_W_CHAIN;
5335 Info.memVT = MVT::v32f32;
5336 Info.ptrVal = I.getArgOperand(0);
5337 Info.offset = 0;
5338 Info.flags = MachineMemOperand::MOLoad;
5339 Info.align.reset();
5340 Infos.push_back(Info);
5341 return;
5342 }
5343
5344 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
5345 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
5346 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
5347 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
5348 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
5349 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
5350 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32: {
5351 Info.opc = ISD::INTRINSIC_W_CHAIN;
5352 Info.memVT = MVT::v64i32;
5353 Info.ptrVal = I.getArgOperand(0);
5354 Info.offset = 0;
5355 Info.flags = MachineMemOperand::MOLoad;
5356 Info.align.reset();
5357 Infos.push_back(Info);
5358 return;
5359 }
5360
5361 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
5362 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32: {
5363 Info.opc = ISD::INTRINSIC_W_CHAIN;
5364 Info.memVT = MVT::v64f32;
5365 Info.ptrVal = I.getArgOperand(0);
5366 Info.offset = 0;
5367 Info.flags = MachineMemOperand::MOLoad;
5368 Info.align.reset();
5369 Infos.push_back(Info);
5370 return;
5371 }
5372
5373 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
5374 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
5375 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
5376 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
5377 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
5378 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
5379 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32: {
5380 Info.opc = ISD::INTRINSIC_W_CHAIN;
5381 Info.memVT = MVT::v128i32;
5382 Info.ptrVal = I.getArgOperand(0);
5383 Info.offset = 0;
5384 Info.flags = MachineMemOperand::MOLoad;
5385 Info.align.reset();
5386 Infos.push_back(Info);
5387 return;
5388 }
5389
5390 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
5391 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32: {
5392 Info.opc = ISD::INTRINSIC_W_CHAIN;
5393 Info.memVT = MVT::v128f32;
5394 Info.ptrVal = I.getArgOperand(0);
5395 Info.offset = 0;
5396 Info.flags = MachineMemOperand::MOLoad;
5397 Info.align.reset();
5398 Infos.push_back(Info);
5399 return;
5400 }
5401
5402 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
5403 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
5404 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
5405 Info.opc = ISD::INTRINSIC_VOID;
5406 Info.memVT = MVT::i32;
5407 Info.ptrVal = I.getArgOperand(0);
5408 Info.offset = 0;
5409 Info.flags = MachineMemOperand::MOStore;
5410 Info.align.reset();
5411 Infos.push_back(Info);
5412 return;
5413 }
5414
5415 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
5416 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
5417 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
5418 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
5419 Info.opc = ISD::INTRINSIC_VOID;
5420 Info.memVT = MVT::v2i32;
5421 Info.ptrVal = I.getArgOperand(0);
5422 Info.offset = 0;
5423 Info.flags = MachineMemOperand::MOStore;
5424 Info.align.reset();
5425 Infos.push_back(Info);
5426 return;
5427 }
5428
5429 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
5430 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
5431 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
5432 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
5433 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
5434 Info.opc = ISD::INTRINSIC_VOID;
5435 Info.memVT = MVT::v4i32;
5436 Info.ptrVal = I.getArgOperand(0);
5437 Info.offset = 0;
5438 Info.flags = MachineMemOperand::MOStore;
5439 Info.align.reset();
5440 Infos.push_back(Info);
5441 return;
5442 }
5443
5444 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
5445 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
5446 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
5447 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
5448 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
5449 Info.opc = ISD::INTRINSIC_VOID;
5450 Info.memVT = MVT::v8i32;
5451 Info.ptrVal = I.getArgOperand(0);
5452 Info.offset = 0;
5453 Info.flags = MachineMemOperand::MOStore;
5454 Info.align.reset();
5455 Infos.push_back(Info);
5456 return;
5457 }
5458
5459 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
5460 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
5461 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
5462 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
5463 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
5464 Info.opc = ISD::INTRINSIC_VOID;
5465 Info.memVT = MVT::v16i32;
5466 Info.ptrVal = I.getArgOperand(0);
5467 Info.offset = 0;
5468 Info.flags = MachineMemOperand::MOStore;
5469 Info.align.reset();
5470 Infos.push_back(Info);
5471 return;
5472 }
5473
5474 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
5475 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
5476 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
5477 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
5478 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
5479 Info.opc = ISD::INTRINSIC_VOID;
5480 Info.memVT = MVT::v32i32;
5481 Info.ptrVal = I.getArgOperand(0);
5482 Info.offset = 0;
5483 Info.flags = MachineMemOperand::MOStore;
5484 Info.align.reset();
5485 Infos.push_back(Info);
5486 return;
5487 }
5488
5489 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
5490 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
5491 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
5492 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
5493 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
5494 Info.opc = ISD::INTRINSIC_VOID;
5495 Info.memVT = MVT::v64i32;
5496 Info.ptrVal = I.getArgOperand(0);
5497 Info.offset = 0;
5498 Info.flags = MachineMemOperand::MOStore;
5499 Info.align.reset();
5500 Infos.push_back(Info);
5501 return;
5502 }
5503
5504 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
5505 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
5506 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
5507 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
5508 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
5509 Info.opc = ISD::INTRINSIC_VOID;
5510 Info.memVT = MVT::v128i32;
5511 Info.ptrVal = I.getArgOperand(0);
5512 Info.offset = 0;
5513 Info.flags = MachineMemOperand::MOStore;
5514 Info.align.reset();
5515 Infos.push_back(Info);
5516 return;
5517 }
5518 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
5519 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
5520 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
5521 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
5522 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
5523 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
5524 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
5525 case Intrinsic::
5526 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
5527 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
5528 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
5529 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
5530 case Intrinsic::
5531 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
5532 // We are reading and writing back to TMem
5533 Info.opc = ISD::INTRINSIC_VOID;
5534 Info.memVT = MVT::v4i32;
5535 Info.ptrVal = I.getArgOperand(0);
5536 Info.offset = 0;
5538 Info.align = Align(16);
5539 Infos.push_back(Info);
5540 return;
5541 }
5542
5543 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
5544 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
5545 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
5546 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
5547 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
5548 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
5549 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
5550 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
5551 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
5552 case Intrinsic::
5553 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
5554 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
5555 case Intrinsic::
5556 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
5557 // We are reading and writing back to TMem
5558 Info.opc = ISD::INTRINSIC_VOID;
5559 Info.memVT = MVT::v8i32;
5560 Info.ptrVal = I.getArgOperand(0);
5561 Info.offset = 0;
5563 Info.align = Align(16);
5564 Infos.push_back(Info);
5565 return;
5566 }
5567 }
5568}
5569
5570/// getFunctionParamOptimizedAlign - since function arguments are passed via
5571/// .param space, we may want to increase their alignment in a way that
5572/// ensures that we can effectively vectorize their loads & stores. We can
5573/// increase alignment only if the function has internal or private
5574/// linkage, as for other linkage types callers may already rely on default
5575/// alignment. To allow using 128-bit vectorized loads/stores, this function
5576/// ensures that alignment is 16 or greater.
///
/// \param F     Function that owns the parameter; may be null.
/// \param ArgTy IR type of the parameter.
/// \param DL    Data layout used to query the ABI alignment of \p ArgTy.
/// \returns the ABI alignment of \p ArgTy (capped at 128) when \p F is null,
/// does not have local linkage, or has its address taken; otherwise the
/// larger of that value and 16.
5578 const Function *F, Type *ArgTy, const DataLayout &DL) const {
5579 // Capping the alignment to 128 bytes as that is the maximum alignment
5580 // supported by PTX.
5581 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
5582
5583 // If a function has linkage different from internal or private, we
5584 // must use default ABI alignment as external users rely on it. Same
5585 // for a function that may be called from a function pointer.
5586 if (!F || !F->hasLocalLinkage() ||
5587 F->hasAddressTaken(/*Users=*/nullptr,
5588 /*IgnoreCallbackUses=*/false,
5589 /*IgnoreAssumeLikeCalls=*/true,
5590 /*IgnoreLLVMUsed=*/true))
5591 return ABITypeAlign;
5592
   // Only local, non-address-taken functions reach this point, so every call
   // site is visible and the alignment can safely be raised.
5593 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5594 return std::max(Align(16), ABITypeAlign);
5595}
5596
5597/// Helper for computing alignment of a device function byval parameter.
///
/// Starts from \p InitialAlign and only ever raises it: first to the
/// optimized parameter alignment when \p F is non-null, then to at least 4
/// as a workaround for the ptxas bug described below.
///
/// \param F            Function that owns the parameter; may be null.
/// \param ArgTy        IR type of the byval parameter.
/// \param InitialAlign Alignment to start from.
/// \param DL           Data layout forwarded to getFunctionParamOptimizedAlign.
/// \returns the resulting alignment, never smaller than \p InitialAlign.
5599 const Function *F, Type *ArgTy, Align InitialAlign,
5600 const DataLayout &DL) const {
5601 Align ArgAlign = InitialAlign;
5602 // Try to increase alignment to enhance vectorization options.
5603 if (F)
5604 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));
5605
5606 // Old ptx versions have a bug. When PTX code takes address of
5607 // byval parameter with alignment < 4, ptxas generates code to
5608 // spill argument into memory. Alas on sm_50+ ptxas generates
5609 // SASS code that fails with misaligned access. To work around
5610 // the problem, make sure that we align byval parameters by at
5611 // least 4. This bug seems to be fixed at least starting from
5612 // ptxas > 9.0.
5613 // TODO: remove this after verifying the bug is not reproduced
5614 // on non-deprecated ptxas versions.
   // NOTE(review): listing line 5615 (presumably the condition guarding the
   // statement below) is absent from this listing -- confirm against the
   // upstream source.
5616 ArgAlign = std::max(ArgAlign, Align(4));
5617
5618 return ArgAlign;
5619}
5620
5621// Helper for getting a function parameter name. Name is composed from
5622// its index and the function name. Negative index corresponds to special
5623// parameter (unsized array) used for passing variable arguments.
//
// The result is "<symbol>_vararg" when Idx is negative and
// "<symbol>_param_<Idx>" otherwise, where <symbol> is the name of the symbol
// the target machine returns for F.
5625 int Idx) const {
5626 std::string ParamName;
5627 raw_string_ostream ParamStr(ParamName);
5628
5629 ParamStr << getTargetMachine().getSymbol(F)->getName();
5630 if (Idx < 0)
5631 ParamStr << "_vararg";
5632 else
5633 ParamStr << "_param_" << Idx;
5634
5635 return ParamName;
5636}
5637
5638/// isLegalAddressingMode - Return true if the addressing mode represented
5639/// by AM is legal for this target, for a load/store of the specified type.
5640/// Used to guide target specific optimizations, like loop strength reduction
5641/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5642/// (CodeGenPrepare.cpp)
///
/// Only \p AM is inspected here; \p Ty, \p AS and \p I do not influence the
/// result.
5644 const AddrMode &AM, Type *Ty,
5645 unsigned AS, Instruction *I) const {
5646 // AddrMode - This represents an addressing mode of:
5647 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5648 //
5649 // The legal address modes are
5650 // - [avar]
5651 // - [areg]
5652 // - [areg+immoff]
5653 // - [immAddr]
5654
5655 // immoff must fit in a signed 32-bit int
5656 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5657 return false;
5658
   // A global may only be addressed on its own: no offset, no base register
   // and no scaled register.
5659 if (AM.BaseGV)
5660 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5661
5662 switch (AM.Scale) {
5663 case 0: // "r", "r+i" or "i" is allowed
5664 break;
5665 case 1:
5666 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5667 return false;
5668 // Otherwise the scaled register is the only register: "r" or "r+i".
5669 break;
5670 default:
5671 // Any other scale (greater than 1 or negative) is not allowed
5672 return false;
5673 }
5674 return true;
5675}
5676
5677//===----------------------------------------------------------------------===//
5678// NVPTX Inline Assembly Support
5679//===----------------------------------------------------------------------===//
5680
5681/// getConstraintType - Given a constraint letter, return the type of
5682/// constraint it is for this target.
///
/// The single-character constraints listed in the switch below are reported
/// as C_RegisterClass; everything else is delegated to the generic
/// TargetLowering implementation.
5685 if (Constraint.size() == 1) {
5686 switch (Constraint[0]) {
5687 default:
5688 break;
5689 case 'b':
5690 case 'r':
5691 case 'h':
5692 case 'c':
5693 case 'l':
5694 case 'f':
5695 case 'd':
5696 case 'q':
5697 case '0':
5698 case 'N':
5699 return C_RegisterClass;
5700 }
5701 }
5702 return TargetLowering::getConstraintType(Constraint);
5703}
5704
/// Map a single-letter inline-asm constraint to an NVPTX register class.
///
/// The returned register number is always 0 for the constraints handled here.
/// Mapping used below:
///   'b'           -> B1
///   'c', 'h'      -> B16
///   'r', 'f'      -> B32
///   'l', 'N', 'd' -> B64
///   'q'           -> B128 (fatal error when the SM version is below 70)
/// Anything else is delegated to the generic TargetLowering implementation.
/// NOTE(review): getConstraintType in this listing also accepts '0' as a
/// register-class constraint, but '0' has no case here -- confirm that the
/// generic fallback is the intended behavior for it.
5705std::pair<unsigned, const TargetRegisterClass *>
5707 StringRef Constraint,
5708 MVT VT) const {
5709 if (Constraint.size() == 1) {
5710 switch (Constraint[0]) {
5711 case 'b':
5712 return std::make_pair(0U, &NVPTX::B1RegClass);
5713 case 'c':
5714 case 'h':
5715 return std::make_pair(0U, &NVPTX::B16RegClass);
5716 case 'r':
5717 case 'f':
5718 return std::make_pair(0U, &NVPTX::B32RegClass);
5719 case 'l':
5720 case 'N':
5721 case 'd':
5722 return std::make_pair(0U, &NVPTX::B64RegClass);
5723 case 'q': {
5724 if (STI.getSmVersion() < 70)
5725 report_fatal_error("Inline asm with 128 bit operands is only "
5726 "supported for sm_70 and higher!");
5727 return std::make_pair(0U, &NVPTX::B128RegClass);
5728 }
5729 }
5730 }
5731 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5732}
5733
5734//===----------------------------------------------------------------------===//
5735// NVPTX DAG Combining
5736//===----------------------------------------------------------------------===//
5737
// Decide whether FMA contraction is allowed. The checks run in priority
// order: an explicit FMAContractLevelOpt on the command line wins, then
// contraction is refused when not optimizing, and finally the TargetOptions
// are consulted.
5739 CodeGenOptLevel OptLevel) const {
5740 // Always honor command-line argument
5741 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5742 return FMAContractLevelOpt > 0;
5743
5744 // Do not contract if we're not optimizing the code.
5745 if (OptLevel == CodeGenOptLevel::None)
5746 return false;
5747
5748 // Honor TargetOptions flags that explicitly say fusion is okay.
   // NOTE(review): listing line 5749 (presumably the condition guarding the
   // return below) is absent from this listing -- confirm against the
   // upstream source.
5750 return true;
5751
5752 return false;
5753}
5754
5755static bool isConstZero(const SDValue &Operand) {
5756 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5757 return Const && Const->getZExtValue() == 0;
5758}
5759
5760/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5761/// operands N0 and N1. This is a helper for PerformADDCombine that is
5762/// called with the default operands, and if that fails, with commuted
5763/// operands.
///
/// \returns the replacement value, or an empty SDValue when no combine
/// applies.
5764static SDValue
5767 EVT VT = N0.getValueType();
5768
5769 // Since integer multiply-add costs the same as integer multiply
5770 // but is more costly than integer add, do the fusion only when
5771 // the mul is only used in the add.
5772 // TODO: this may not be true for later architectures, consider relaxing this
5773 if (!N0.getNode()->hasOneUse())
5774 return SDValue();
5775
5776 // fold (add (select cond, 0, (mul a, b)), c)
5777 // -> (select cond, c, (add (mul a, b), c))
5778 //
5779 if (N0.getOpcode() == ISD::SELECT) {
   // Find which select arm is the constant zero; the other arm must be the
   // multiply.
5780 unsigned ZeroOpNum;
5781 if (isConstZero(N0->getOperand(1)))
5782 ZeroOpNum = 1;
5783 else if (isConstZero(N0->getOperand(2)))
5784 ZeroOpNum = 2;
5785 else
5786 return SDValue();
5787
5788 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5789 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5790 return SDValue();
5791
5792 SDLoc DL(N);
5793 SDValue Mul =
5794 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5795 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
   // Keep the arms in their original positions: the arm that used to be zero
   // now yields N1 alone, the other arm yields mul + N1.
5796 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5797 ((ZeroOpNum == 1) ? N1 : MAD),
5798 ((ZeroOpNum == 1) ? MAD : N1));
5799 }
5800
5801 return SDValue();
5802}
5803
/// Try to fuse an FADD whose operand N0 is an FMUL into a single FMA node.
///
/// The fusion is attempted only when FMA contraction is allowed (either by
/// allowFMA or by the contract flags on both nodes) and the use-count /
/// register-pressure heuristics below pass.
///
/// \returns the new FMA node, or an empty SDValue when no fusion is done.
5804static SDValue
5807 CodeGenOptLevel OptLevel) {
5808 EVT VT = N0.getValueType();
5809 if (N0.getOpcode() == ISD::FMUL) {
5810 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5811 &DCI.DAG.getTargetLoweringInfo());
5812 if (!(TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5813 (N->getFlags().hasAllowContract() &&
5814 N0->getFlags().hasAllowContract())))
5815 return SDValue();
5816
5817 // For floating point:
5818 // Do the fusion only when the mul has less than 5 uses and all
5819 // are add.
5820 // The heuristic is that if a use is not an add, then that use
5821 // cannot be fused into fma, therefore mul is still needed anyway.
5822 // If there are more than 4 uses, even if they are all add, fusing
5823 // them will increase register pressure.
5824 //
5825 int numUses = 0;
5826 int nonAddCount = 0;
5827 for (const SDNode *User : N0.getNode()->users()) {
5828 numUses++;
5829 if (User->getOpcode() != ISD::FADD)
5830 ++nonAddCount;
5831 if (numUses >= 5)
5832 return SDValue();
5833 }
5834 if (nonAddCount) {
5835 int orderNo = N->getIROrder();
5836 int orderNo2 = N0.getNode()->getIROrder();
5837 // simple heuristics here for considering potential register
5838 // pressure, the logic here is that the difference is used
5839 // to measure the distance between def and use, the longer distance
5840 // more likely cause register pressure.
     // NOTE(review): the check below gives up when the distance is *smaller*
     // than 500, which reads opposite to the rationale above -- confirm the
     // intended direction of the comparison.
5841 if (orderNo - orderNo2 < 500)
5842 return SDValue();
5843
5844 // Now, check if at least one of the FMUL's operands is live beyond the
5845 // node N, which guarantees that the FMA will not increase register
5846 // pressure at node N.
5847 bool opIsLive = false;
5848 const SDNode *left = N0.getOperand(0).getNode();
5849 const SDNode *right = N0.getOperand(1).getNode();
5850
5851 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5852 opIsLive = true;
5853
5854 if (!opIsLive)
5855 for (const SDNode *User : left->users()) {
5856 int orderNo3 = User->getIROrder();
5857 if (orderNo3 > orderNo) {
5858 opIsLive = true;
5859 break;
5860 }
5861 }
5862
5863 if (!opIsLive)
5864 for (const SDNode *User : right->users()) {
5865 int orderNo3 = User->getIROrder();
5866 if (orderNo3 > orderNo) {
5867 opIsLive = true;
5868 break;
5869 }
5870 }
5871
5872 if (!opIsLive)
5873 return SDValue();
5874 }
5875
5876 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5877 N0.getOperand(1), N1);
5878 }
5879
5880 return SDValue();
5881}
5882
5883/// Fold unpacking movs into a load by increasing the number of return values.
5884///
5885/// ex:
5886/// L: v2f16,ch = load <p>
5887/// a: f16 = extractelt L:0, 0
5888/// b: f16 = extractelt L:0, 1
5889/// use(a, b)
5890///
5891/// ...is turned into...
5892///
5893/// L: f16,f16,ch = LoadV2 <p>
5894/// use(L:0, L:1)
///
/// \returns a MERGE_VALUES node that stands in for all results of the old
/// load, or an empty SDValue when the transformation does not apply.
5895static SDValue
5897 // Don't run this optimization before the legalizer
5898 if (!DCI.isAfterLegalizeDAG())
5899 return SDValue();
5900
5901 EVT ElementVT = N->getValueType(0);
5902 // Avoid non-packed types and v4i8
5903 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5904 return SDValue();
5905
5906 // Check whether all outputs are either used by an extractelt or are
5907 // glue/chain nodes
5908 if (!all_of(N->uses(), [&](SDUse &U) {
5909 // Skip glue, chain nodes
5910 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5911 return true;
5912 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5913 if (N->getOpcode() != ISD::LOAD)
5914 return true;
5915 // Since this is an ISD::LOAD, check all extractelts are used. If
5916 // any are not used, we don't want to defeat another optimization that
5917 // will narrow the load.
5918 //
5919 // For example:
5920 //
5921 // L: v2f16,ch = load <p>
5922 // e0: f16 = extractelt L:0, 0
5923 // e1: f16 = extractelt L:0, 1 <-- unused
5924 // store e0
5925 //
5926 // Can be optimized by DAGCombiner to:
5927 //
5928 // L: f16,ch = load <p>
5929 // store L:0
5930 return !U.getUser()->use_empty();
5931 }
5932
5933 // Otherwise, this use prevents us from splitting a value.
5934 return false;
5935 }))
5936 return SDValue();
5937
5938 auto *LD = cast<MemSDNode>(N);
5939 SDLoc DL(LD);
5940
5941 // the new opcode after we double the number of operands
5942 unsigned Opcode;
5943 SmallVector<SDValue> Operands(LD->ops());
5944 unsigned OldNumOutputs; // non-glue, non-chain outputs
   // NOTE(review): this switch has no default case, so Opcode and
   // OldNumOutputs stay uninitialized for any opcode other than the four
   // below; presumably callers only pass those -- confirm.
5945 switch (LD->getOpcode()) {
5946 case ISD::LOAD:
5947 OldNumOutputs = 1;
5948 // Any packed type is legal, so the legalizer will not have lowered
5949 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5950 // here.
5951 Opcode = NVPTXISD::LoadV2;
5952 // append a "full" used bytes mask operand right before the extension type
5953 // operand, signifying that all bytes are used.
5954 Operands.push_back(DCI.DAG.getConstant(UINT32_MAX, DL, MVT::i32));
5955 Operands.push_back(DCI.DAG.getIntPtrConstant(
5956 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5957 break;
5958 case NVPTXISD::LoadV2:
5959 OldNumOutputs = 2;
5960 Opcode = NVPTXISD::LoadV4;
5961 break;
5962 case NVPTXISD::LoadV4:
5963 // V8 is only supported for f32/i32. Don't forget, we're not changing the
5964 // load size here. This is already a 256-bit load.
5965 if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
5966 return SDValue();
5967 OldNumOutputs = 4;
5968 Opcode = NVPTXISD::LoadV8;
5969 break;
5970 case NVPTXISD::LoadV8:
5971 // PTX doesn't support the next doubling of outputs
5972 return SDValue();
5973 }
5974
5975 // the non-glue, non-chain outputs in the new load
5976 const unsigned NewNumOutputs = OldNumOutputs * 2;
5977 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5978 // add remaining chain and glue values
5979 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5980
5981 // Create the new load
5982 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5983 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5984 LD->getMemOperand());
5985
5986 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5987 // the outputs the same. These nodes will be optimized away in later
5988 // DAGCombiner iterations.
   // Each old packed output I is rebuilt from new scalar outputs 2*I and
   // 2*I + 1.
5990 for (unsigned I : seq(OldNumOutputs))
5991 Results.push_back(DCI.DAG.getBuildVector(
5992 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5993 // Add remaining chain and glue nodes
5994 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5995 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5996
5997 return DCI.DAG.getMergeValues(Results, DL);
5998}
5999
6000/// Fold packing movs into a store.
6001///
6002/// ex:
6003/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
6004/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
6005/// StoreV2 v1, v2
6006///
6007/// ...is turned into...
6008///
6009/// StoreV4 a, b, c, d
///
/// \param Front Number of leading operands of the store that are copied
///              through unchanged (they precede the stored values).
/// \param Back  Number of trailing operands of the store that are copied
///              through unchanged (they follow the stored values).
/// \returns the replacement store node, or an empty SDValue when the
/// transformation does not apply.
6012 unsigned Front, unsigned Back) {
6013 // We want to run this as late as possible since other optimizations may
6014 // eliminate the BUILD_VECTORs.
6015 if (!DCI.isAfterLegalizeDAG())
6016 return SDValue();
6017
6018 // Get the type of the operands being stored.
6019 EVT ElementVT = N->getOperand(Front).getValueType();
6020
6021 // Avoid non-packed types and v4i8
6022 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
6023 return SDValue();
6024
6025 auto *ST = cast<MemSDNode>(N);
6026
6027 // The new opcode after we double the number of operands.
6028 unsigned Opcode;
6029 switch (N->getOpcode()) {
6030 case ISD::STORE:
6031 // Any packed type is legal, so the legalizer will not have lowered
6032 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
6033 // it here.
6034 Opcode = NVPTXISD::StoreV2;
6035 break;
6036 case NVPTXISD::StoreV2:
6037 Opcode = NVPTXISD::StoreV4;
6038 break;
6039 case NVPTXISD::StoreV4:
6040 // V8 is only supported for f32/i32. Don't forget, we're not changing the
6041 // store size here. This is already a 256-bit store.
6042 if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
6043 return SDValue();
6044 Opcode = NVPTXISD::StoreV8;
6045 break;
6046 case NVPTXISD::StoreV8:
6047 // PTX doesn't support the next doubling of operands
6048 return SDValue();
6049 default:
6050 llvm_unreachable("Unhandled store opcode");
6051 }
6052
6053 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
6054 // their elements.
6055 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
6056 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
6057 if (BV.getOpcode() != ISD::BUILD_VECTOR)
6058 return SDValue();
6059
6060 // If the operand has multiple uses, this optimization can increase register
6061 // pressure.
6062 if (!BV.hasOneUse())
6063 return SDValue();
6064
6065 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
6066 // any signs they may be folded by some other pattern or rule.
6067 for (SDValue Op : BV->ops()) {
6068 // Peek through bitcasts
6069 if (Op.getOpcode() == ISD::BITCAST)
6070 Op = Op.getOperand(0);
6071
6072 // This may be folded into a PRMT.
6073 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
6074 Op->getOperand(0).getValueType() == MVT::i32)
6075 return SDValue();
6076
6077 // This may be folded into cvt.bf16x2
6078 if (Op.getOpcode() == ISD::FP_ROUND)
6079 return SDValue();
6080 }
   // Replace the packed operand by its two scalar elements.
6081 Operands.append({BV.getOperand(0), BV.getOperand(1)});
6082 }
   // Copy the trailing Back operands through unchanged.
6083 Operands.append(N->op_end() - Back, N->op_end());
6084
6085 // Now we replace the store
6086 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
6087 ST->getMemoryVT(), ST->getMemOperand());
6088}
6089
// Store combine entry point: before legalization, custom-lower an ISD::STORE
// whose stored value has a non-simple type; in every other case try to fold
// packing movs into the store.
6091 const NVPTXSubtarget &STI) {
6092
6093 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
6094 // Here is our chance to custom lower a store with a non-simple type.
6095 // Unfortunately, we can't do this in the legalizer because there is no
6096 // way to setOperationAction for a non-simple type.
   // NOTE(review): listing line 6097 (presumably the declaration of ST) is
   // absent from this listing -- confirm against the upstream source.
6098 if (!ST->getValue().getValueType().isSimple())
6099 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
6100 }
6101
   // Front = 1 and Back = 2: one leading and two trailing operands are passed
   // through unchanged by the helper.
6102 return combinePackingMovIntoStore(N, DCI, 1, 2);
6103}
6104
// Load combine entry point: before legalization, custom-lower an ISD::LOAD
// whose result has a non-simple type; in every other case try to fold
// unpacking movs into the load.
6106 const NVPTXSubtarget &STI) {
6107 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
6108 // Here is our chance to custom lower a load with a non-simple type.
6109 // Unfortunately, we can't do this in the legalizer because there is no
6110 // way to setOperationAction for a non-simple type.
6111 if (!N->getValueType(0).isSimple())
6112 return lowerLoadVector(N, DCI.DAG, STI);
6113 }
6114
6115 return combineUnpackingMovIntoLoad(N, DCI);
6116}
6117
6118/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
6119///
/// Does nothing when not optimizing. Otherwise the combine is tried with the
/// operands in their original order and, if that fails, commuted.
/// \returns the replacement value, or an empty SDValue when nothing applies.
6122 CodeGenOptLevel OptLevel) {
6123 if (OptLevel == CodeGenOptLevel::None)
6124 return SDValue();
6125
6126 SDValue N0 = N->getOperand(0);
6127 SDValue N1 = N->getOperand(1);
6128
6129 // Only scalar i32 adds are handled; skip everything else.
6130 EVT VT = N0.getValueType();
6131 if (VT.isVector() || VT != MVT::i32)
6132 return SDValue();
6133
6134 // First try with the default operand order.
6135 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
6136 return Result;
6137
6138 // If that didn't work, try again with the operands commuted.
6139 return PerformADDCombineWithOperands(N, N1, N0, DCI);
6140}
6141
6142/// Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent
6143/// register pairs (non-coalescable).
6144static bool isNonCoalescableBuildVector(const SDValue &BV) {
6145 if (BV.getOpcode() != ISD::BUILD_VECTOR || BV.getValueType() != MVT::v2f32)
6146 return false;
6147
6148 SDValue Elt0 = BV.getOperand(0);
6149 SDValue Elt1 = BV.getOperand(1);
6150
6151 bool IsExt0 = Elt0.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
6152 bool IsExt1 = Elt1.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
6153
6154 // If neither element is an EXTRACT_VECTOR_ELT they are free-standing
6155 // scalars and the register allocator can still place them side-by-side.
6156 if (!IsExt0 && !IsExt1)
6157 return false;
6158
6159 // If exactly one element is an EXTRACT_VECTOR_ELT, the other is a scalar
6160 // that cannot generally occupy the adjacent register slot.
6161 if (IsExt0 != IsExt1)
6162 return true;
6163
6164 // At this point both sources are extracting from vectors. If they are from
6165 // different vectors, then the BUILD_VECTOR is non-coalescable.
6166 SDValue Src0 = Elt0.getOperand(0);
6167 SDValue Src1 = Elt1.getOperand(0);
6168 if (Src0 != Src1)
6169 return true;
6170
6171 auto *Idx0 = dyn_cast<ConstantSDNode>(Elt0.getOperand(1));
6172 auto *Idx1 = dyn_cast<ConstantSDNode>(Elt1.getOperand(1));
6173 // If both indices are dynamic they will be lowered to
6174 // loads and the vector will be spilled to local memory. The register
6175 // allocator can easily place the results in adjacent registers.
6176 if (!Idx0 && !Idx1)
6177 return false;
6178
6179 // If one index is dynamic and the other is constant, the value from the
6180 // constant load will result in an additional register to pair with the result
6181 // from the dynamic load. We consider this non-coalescable.
6182 if ((Idx0 && !Idx1) || (!Idx0 && Idx1))
6183 return true;
6184
6185 // Both are constant, adjacent pairs are coalescable
6186 return std::abs(Idx0->getSExtValue() - Idx1->getSExtValue()) != 1;
6187}
6188
6189/// Scalarize a v2f32 arithmetic node (FADD, FMUL, FSUB, FMA) when at least
6190/// one operand is a BUILD_VECTOR that repacks values from non-adjacent register
6191/// pairs. Without this combine the BUILD_VECTOR forces allocation of a
6192/// temporary 64-bit register, increasing register pressure.
6193///
6194/// Example - before:
6195/// t0: v2f32,v2f32,ch = LoadV2 ...
6196/// t1: f32 = extract_vector_elt t0, 0
6197/// t2: f32 = extract_vector_elt t0:1, 0
6198/// t3: v2f32 = BUILD_VECTOR t1, t2 ;; non-coalescable repack
6199/// t4: v2f32 = fma t_a, t3, t_c
6200///
6201/// After:
6202/// t0: v2f32,v2f32,ch = LoadV2 ...
6203/// t1: f32 = extract_vector_elt t0, 0
6204/// t2: f32 = extract_vector_elt t0:1, 0
6205/// a0: f32 = extract_vector_elt t_a, 0
6206/// a1: f32 = extract_vector_elt t_a, 1
6207/// c0: f32 = extract_vector_elt t_c, 0
6208/// c1: f32 = extract_vector_elt t_c, 1
6209/// r0: f32 = fma a0, t1, c0
6210/// r1: f32 = fma a1, t2, c1
6211/// t4: v2f32 = BUILD_VECTOR r0, r1
// NOTE(review): the signature line(s) are missing from this listing. Callers
// visible here invoke this as PerformScalarizeV2F32Op(N, DCI) - confirm the
// exact parameter types against the original file.
6214 EVT VT = N->getValueType(0);
6215 if (VT != MVT::v2f32)
6216 return SDValue();
6217
6218 // Only scalarize when at least one operand is a BUILD_VECTOR whose elements
6219 // are guaranteed to reside in different register pairs.
// NOTE(review): the condition guarding the early return below is missing from
// this listing; presumably it tests the operands with
// isNonCoalescableBuildVector - confirm.
6221 return SDValue();
6222
6223 SelectionDAG &DAG = DCI.DAG;
6224 SDLoc DL(N);
6225 EVT EltVT = VT.getVectorElementType();
6226 unsigned Opc = N->getOpcode();
6227
6228 // For each operand, get the scalar element at the given index: if the operand
6229 // is a BUILD_VECTOR, grab the element directly; otherwise, emit an
6230 // EXTRACT_VECTOR_ELT.
6231 auto GetElement = [&](SDValue Op, unsigned Index) -> SDValue {
6232 if (Op.getOpcode() == ISD::BUILD_VECTOR)
6233 return Op.getOperand(Index);
6234 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
6235 DAG.getVectorIdxConstant(Index, DL));
6236 };
6237
6238 // Build scalar operand lists for element 0 and element 1.
6239 SmallVector<SDValue, 3> Ops0, Ops1;
6240 for (const SDValue &Op : N->ops()) {
6241 Ops0.push_back(GetElement(Op, 0));
6242 Ops1.push_back(GetElement(Op, 1));
6243 }
6244
// Emit the same opcode once per lane, carrying over the original node's flags.
6245 SDValue Res0 = DAG.getNode(Opc, DL, EltVT, Ops0, N->getFlags());
6246 SDValue Res1 = DAG.getNode(Opc, DL, EltVT, Ops1, N->getFlags());
6247
// Repack the two scalar results so the replacement keeps the original type.
6248 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Res0, Res1);
6249}
6250
6251/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
6252///
// NOTE(review): the first line(s) of this signature are absent from this
// listing; only the trailing `OptLevel` parameter is visible. `N` and `DCI`
// are used by the body and are presumably the leading parameters - confirm.
6255 CodeGenOptLevel OptLevel) {
6256 SDValue N0 = N->getOperand(0);
6257 SDValue N1 = N->getOperand(1);
6258
// The v2f32 scalarization is attempted first, before the scalar-only checks
// below.
6259 if (SDValue Result = PerformScalarizeV2F32Op(N, DCI))
6260 return Result;
6261
// Past this point only scalar f32/f64 operands are considered.
6262 EVT VT = N0.getValueType();
6263 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
6264 return SDValue();
6265
6266 // First try with the default operand order.
6267 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
6268 return Result;
6269
6270 // If that didn't work, try again with the operands commuted.
6271 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
6272}
6273
6274/// Get 3-input version of a 2-input min/max opcode
6275static unsigned getMinMax3Opcode(unsigned MinMax2Opcode) {
6276 switch (MinMax2Opcode) {
6277 case ISD::FMAXNUM:
6278 case ISD::FMAXIMUMNUM:
6279 return NVPTXISD::FMAXNUM3;
6280 case ISD::FMINNUM:
6281 case ISD::FMINIMUMNUM:
6282 return NVPTXISD::FMINNUM3;
6283 case ISD::FMAXIMUM:
6284 return NVPTXISD::FMAXIMUM3;
6285 case ISD::FMINIMUM:
6286 return NVPTXISD::FMINIMUM3;
6287 default:
6288 llvm_unreachable("Invalid 2-input min/max opcode");
6289 }
6290}
6291
6292/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
6293/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
// NOTE(review): the first line(s) of this signature are absent from this
// listing; only the trailing version parameters are visible. `N` and `DCI`
// are used by the body and are presumably the leading parameters - confirm.
6296 unsigned PTXVersion, unsigned SmVersion) {
6297
6298 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
6299 EVT VT = N->getValueType(0);
6300 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
6301 return SDValue();
6302
6303 SDValue Op0 = N->getOperand(0);
6304 SDValue Op1 = N->getOperand(1);
6305 unsigned MinMaxOp2 = N->getOpcode();
6306 unsigned MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
6307
// The inner node must use the same opcode as N and have no other user;
// otherwise it is left in place. The left operand is checked first.
6308 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
6309 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
6310 SDValue A = Op0.getOperand(0);
6311 SDValue B = Op0.getOperand(1);
6312 SDValue C = Op1;
6313 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
6314 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
6315 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
6316 SDValue A = Op0;
6317 SDValue B = Op1.getOperand(0);
6318 SDValue C = Op1.getOperand(1);
6319 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
6320 }
6321 return SDValue();
6322}
6323
// Rewrites an SREM/UREM as `Num - (Num / Den) * Den` when a division of the
// same numerator and denominator already exists among the numerator's users.
// NOTE(review): the first line(s) of this signature are absent from this
// listing; the dispatcher in this file calls PerformREMCombine(N, DCI,
// OptLevel) for SREM/UREM, which presumably is this function - confirm.
6326 CodeGenOptLevel OptLevel) {
6327 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
6328
6329 // Don't do anything at less than -O2.
6330 if (OptLevel < CodeGenOptLevel::Default)
6331 return SDValue();
6332
6333 SelectionDAG &DAG = DCI.DAG;
6334 SDLoc DL(N);
6335 EVT VT = N->getValueType(0);
6336 bool IsSigned = N->getOpcode() == ISD::SREM;
// The division opcode searched for matches the signedness of the remainder.
6337 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
6338
6339 const SDValue &Num = N->getOperand(0);
6340 const SDValue &Den = N->getOperand(1);
6341
// Scan the users of the numerator for a div with identical operands, in the
// same operand order.
6342 for (const SDNode *U : Num->users()) {
6343 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
6344 U->getOperand(1) == Den) {
6345 // Num % Den -> Num - (Num / Den) * Den
6346 return DAG.getNode(ISD::SUB, DL, VT, Num,
6347 DAG.getNode(ISD::MUL, DL, VT,
6348 DAG.getNode(DivOpc, DL, VT, Num, Den),
6349 Den));
6350 }
6351 }
6352 return SDValue();
6353}
6354
6355// (sign_extend|zero_extend (mul|shl) x, y) -> (mul.wide x, y)
// NOTE(review): the first line of this signature is absent from this listing;
// the dispatcher in this file calls combineMulWide(N, DCI, OptLevel) for
// SIGN_EXTEND/ZERO_EXTEND, which presumably is this function - confirm.
6357 CodeGenOptLevel OptLevel) {
6358 if (OptLevel == CodeGenOptLevel::None)
6359 return SDValue();
6360
// The extended value must have no other user.
6361 SDValue Op = N->getOperand(0);
6362 if (!Op.hasOneUse())
6363 return SDValue();
// Only i16 -> i32 and i32 -> i64 extensions are handled.
6364 EVT ToVT = N->getValueType(0);
6365 EVT FromVT = Op.getValueType();
6366 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
6367 (ToVT == MVT::i64 && FromVT == MVT::i32)))
6368 return SDValue();
// The extended value must be a MUL, or a SHL by a constant amount.
6369 if (!(Op.getOpcode() == ISD::MUL ||
6370 (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Op.getOperand(1)))))
6371 return SDValue();
6372
6373 SDLoc DL(N);
6374 unsigned ExtOpcode = N->getOpcode();
6375 unsigned Opcode = 0;
// The extension kind must agree with the matching no-wrap flag on the narrow
// operation: sign_extend needs nsw, zero_extend needs nuw.
6376 if (ExtOpcode == ISD::SIGN_EXTEND && Op->getFlags().hasNoSignedWrap())
6377 Opcode = NVPTXISD::MUL_WIDE_SIGNED;
6378 else if (ExtOpcode == ISD::ZERO_EXTEND && Op->getFlags().hasNoUnsignedWrap())
6379 Opcode = NVPTXISD::MUL_WIDE_UNSIGNED;
6380 else
6381 return SDValue();
6382 SDValue RHS = Op.getOperand(1);
// For SHL, the shift amount is turned into the equivalent multiplier
// (1 << ShiftAmt) in the narrow type.
// NOTE(review): ShiftAmt is not range-checked here before the APInt shift,
// whereas TryMULWIDECombine in this file checks its shift amount explicitly -
// confirm an out-of-range constant cannot reach this point.
6383 if (Op.getOpcode() == ISD::SHL) {
6384 const auto ShiftAmt = Op.getConstantOperandVal(1);
6385 const auto MulVal = APInt(FromVT.getSizeInBits(), 1) << ShiftAmt;
6386 RHS = DCI.DAG.getConstant(MulVal, DL, FromVT);
6387 }
6388 return DCI.DAG.getNode(Opcode, DL, ToVT, Op.getOperand(0), RHS);
6389}
6390
6396
6397/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
6398/// that can be demoted to \p OptSize bits without loss of information. The
6399/// signedness of the operand, if determinable, is placed in \p S.
// NOTE(review): the first line of this signature (return type, name and the
// `Op` parameter) is absent from this listing - confirm against the original.
6401 unsigned OptSize,
6402 OperandSignedness &S) {
// S stays Unknown on every path that returns false.
6403 S = Unknown;
6404
// A sign extension (or in-register sign extension) whose source is no wider
// than OptSize bits is demotable and signed.
6405 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
6406 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
6407 EVT OrigVT = Op.getOperand(0).getValueType();
6408 if (OrigVT.getFixedSizeInBits() <= OptSize) {
6409 S = Signed;
6410 return true;
6411 }
// Likewise, a zero extension from at most OptSize bits is demotable and
// unsigned.
6412 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
6413 EVT OrigVT = Op.getOperand(0).getValueType();
6414 if (OrigVT.getFixedSizeInBits() <= OptSize) {
6415 S = Unsigned;
6416 return true;
6417 }
6418 }
6419
6420 return false;
6421}
6422
6423/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
6424/// be demoted to \p OptSize bits without loss of information. If the operands
6425/// contain a constant, it should appear as the RHS operand. The signedness of
6426/// the operands is placed in \p IsSigned.
// NOTE(review): the first line of this signature (return type, name and the
// `LHS`/`RHS` parameters) is absent from this listing - confirm.
6428 unsigned OptSize,
6429 bool &IsSigned) {
6430 OperandSignedness LHSSign;
6431
6432 // The LHS operand must be a demotable op
6433 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
6434 return false;
6435
6436 // We should have been able to determine the signedness from the LHS
6437 if (LHSSign == Unknown)
6438 return false;
6439
// IsSigned is written before the RHS is examined, so it is set even when this
// function goes on to return false.
6440 IsSigned = (LHSSign == Signed);
6441
6442 // The RHS can be a demotable op or a constant
// NOTE(review): the line that tests RHS for a constant and binds `CI` is
// missing from this listing.
6444 const APInt &Val = CI->getAPIntValue();
// A constant RHS must fit in OptSize bits under the LHS's signedness.
6445 if (LHSSign == Unsigned) {
6446 return Val.isIntN(OptSize);
6447 } else {
6448 return Val.isSignedIntN(OptSize);
6449 }
6450 } else {
6451 OperandSignedness RHSSign;
6452 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
6453 return false;
6454
// Both operands must agree on signedness.
6455 return LHSSign == RHSSign;
6456 }
6457}
6458
6459/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
6460/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
6461/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
6462/// amount.
// NOTE(review): the signature line(s) are missing from this listing. Callers
// visible in this file invoke TryMULWIDECombine(N, DCI) - confirm the exact
// parameter types.
6465 EVT MulType = N->getValueType(0);
// Only i32 and i64 results are candidates.
6466 if (MulType != MVT::i32 && MulType != MVT::i64) {
6467 return SDValue();
6468 }
6469
6470 SDLoc DL(N);
// The demoted operand width is half of the result width.
6471 unsigned OptSize = MulType.getSizeInBits() >> 1;
6472 SDValue LHS = N->getOperand(0);
6473 SDValue RHS = N->getOperand(1);
6474
6475 // Canonicalize the multiply so the constant (if any) is on the right
6476 if (N->getOpcode() == ISD::MUL) {
6477 if (isa<ConstantSDNode>(LHS)) {
6478 std::swap(LHS, RHS);
6479 }
6480 }
6481
6482 // If we have a SHL, determine the actual multiply amount
6483 if (N->getOpcode() == ISD::SHL) {
// NOTE(review): the line declaring `ShlRHS` is missing from this listing;
// presumably it is a dyn_cast of RHS to ConstantSDNode - confirm.
6485 if (!ShlRHS) {
6486 return SDValue();
6487 }
6488
6489 APInt ShiftAmt = ShlRHS->getAPIntValue();
6490 unsigned BitWidth = MulType.getSizeInBits();
// Only in-range shift amounts are converted to a multiplier (1 << ShiftAmt).
6491 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
6492 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
6493 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
6494 } else {
6495 return SDValue();
6496 }
6497 }
6498
6499 bool Signed;
6500 // Verify that our operands are demotable
6501 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
6502 return SDValue();
6503 }
6504
6505 EVT DemotedVT;
6506 if (MulType == MVT::i32) {
6507 DemotedVT = MVT::i16;
6508 } else {
6509 DemotedVT = MVT::i32;
6510 }
6511
6512 // Truncate the operands to the correct size. Note that these are just for
6513 // type consistency and will (likely) be eliminated in later phases.
6514 SDValue TruncLHS =
6515 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
6516 SDValue TruncRHS =
6517 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
6518
// Pick the signed or unsigned wide multiply according to the operands.
6519 unsigned Opc;
6520 if (Signed) {
6521 Opc = NVPTXISD::MUL_WIDE_SIGNED;
6522 } else {
6523 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
6524 }
6525
6526 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
6527}
6528
6529static bool isConstOne(const SDValue &Operand) {
6530 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
6531 return Const && Const->getZExtValue() == 1;
6532}
6533
// Given an ADD node with a constant-one operand, returns the other operand;
// returns an empty SDValue when the node is not an ADD or neither operand is
// the constant 1.
// NOTE(review): the signature line is missing from this listing, so the
// function name and the declared type of `Add` cannot be seen here - confirm
// against the original file.
6535 if (Add->getOpcode() != ISD::ADD)
6536 return SDValue();
6537
// Operand 0 is checked first, so for (add 1, 1) operand 1 is returned.
6538 if (isConstOne(Add->getOperand(0)))
6539 return Add->getOperand(1);
6540
6541 if (isConstOne(Add->getOperand(1)))
6542 return Add->getOperand(0);
6543
6544 return SDValue();
6545}
6546
// Implements the rewrite (mul x, (add y, 1)) -> (add (mul x, y), x).
// NOTE(review): this listing is missing the signature and the `if` that binds
// `Y`; callers in this file invoke combineMADConstOne(N0, N1, VT, DL, DCI),
// which presumably is this function - confirm against the original file.
6549
6551 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
6552 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
6553 }
6554
6555 return SDValue();
6556}
6557
// Folds a multiply into a SELECT that has the constant 1 in one arm: the arm
// holding 1 becomes X and the other arm becomes (mul X, Y).
// NOTE(review): parts of the signature are missing from this listing; callers
// in this file invoke combineMulSelectConstOne(N0, N1, VT, DL, DCI), which
// presumably is this function - confirm the parameter names and types.
6559 SDLoc DL,
6561 if (Select->getOpcode() != ISD::SELECT)
6562 return SDValue();
6563
6564 SDValue Cond = Select->getOperand(0);
6565
// Find which select arm (operand 1 or 2) is the constant one; the true arm is
// checked first.
6566 unsigned ConstOpNo;
6567 if (isConstOne(Select->getOperand(1)))
6568 ConstOpNo = 1;
6569 else if (isConstOne(Select->getOperand(2)))
6570 ConstOpNo = 2;
6571 else
6572 return SDValue();
6573
// Y is the arm that is not the constant one.
6574 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
6575
6576 // Do not combine if the resulting sequence is not obviously profitable.
// NOTE(review): the profitability condition guarding this return is missing
// from this listing.
6578 return SDValue();
6579
6580 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
6581
// X * 1 == X, so the arm that held the constant becomes X itself.
6582 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
6583 (ConstOpNo == 1) ? X : NewMul,
6584 (ConstOpNo == 1) ? NewMul : X);
6585}
6586
// Tries the constant-one multiply folds on (N0, N1) and then on the commuted
// pair; restricted to scalar i16/i32/i64.
// NOTE(review): the name and parameter list are missing from this listing; a
// caller in this file invokes PerformMULCombineWithOperands(N, N0, N1, DCI),
// which presumably is this function - confirm.
6587static SDValue
6590
6591 EVT VT = N0.getValueType();
6592 if (VT.isVector())
6593 return SDValue();
6594
6595 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
6596 return SDValue();
6597
6598 SDLoc DL(N);
6599
6600 // (mul x, (add y, 1)) -> (add (mul x, y), x)
6601 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
6602 return Res;
6603 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
6604 return Res;
6605
6606 // (mul x, (select y, 1)) -> (select (mul x, y), x)
6607 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
6608 return Res;
6609 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
6610 return Res;
6611
6612 return SDValue();
6613}
6614
6615/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
// NOTE(review): the first line(s) of this signature are absent from this
// listing; only the trailing `OptLevel` parameter is visible.
6618 CodeGenOptLevel OptLevel) {
6619 if (OptLevel == CodeGenOptLevel::None)
6620 return SDValue();
6621
// The mul.wide rewrite takes priority over the constant-one folds.
6622 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6623 return Ret;
6624
6625 SDValue N0 = N->getOperand(0);
6626 SDValue N1 = N->getOperand(1);
6627 return PerformMULCombineWithOperands(N, N0, N1, DCI);
6628}
6629
6630/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
// NOTE(review): the first line(s) of this signature are absent from this
// listing; only the trailing `OptLevel` parameter is visible.
6633 CodeGenOptLevel OptLevel) {
6634 if (OptLevel > CodeGenOptLevel::None) {
6635 // Try mul.wide combining at OptLevel > 0
6636 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6637 return Ret;
6638 }
6639
// No other SHL-specific combine is attempted in this function.
6640 return SDValue();
6641}
6642
// Turns a v2f16/v2bf16 comparison that produces v2i1 into a single packed
// compare node yielding two i1 results, then rebuilds the v2i1 vector.
// NOTE(review): the first line(s) of this signature are absent from this
// listing; the dispatcher in this file calls PerformSETCCCombine(N, DCI,
// STI.getSmVersion()) for SETCC, which presumably is this function - confirm.
6645 unsigned int SmVersion) {
6646 EVT CCType = N->getValueType(0);
6647 SDValue A = N->getOperand(0);
6648 SDValue B = N->getOperand(1);
6649
6650 EVT AType = A.getValueType();
6651 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
6652 return SDValue();
6653
// The bf16 variant is skipped when SmVersion is below 90.
6654 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
6655 return SDValue();
6656
6657 SDLoc DL(N);
6658 // setp.f16x2 returns two scalar predicates, which we need to
6659 // convert back to v2i1. The returned result will be scalarized by
6660 // the legalizer, but the comparison will remain a single vector
6661 // instruction.
// NOTE(review): the line carrying the alternative (non-v2f16) opcode of the
// conditional expression below is missing from this listing.
6662 SDValue CCNode = DCI.DAG.getNode(
6663 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
6665 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
6666 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
6667 CCNode.getValue(1));
6668}
6669
// Rewrites a constant-index element extract from a small (16/32/64-bit) vector
// as bitcast-to-integer, shift right, truncate.
// NOTE(review): the signature line(s) are missing from this listing; the
// dispatcher in this file calls PerformEXTRACTCombine(N, DCI), which
// presumably is this function - confirm.
6672 SDValue Vector = N->getOperand(0);
// Look through a FREEZE of the source vector.
6673 if (Vector->getOpcode() == ISD::FREEZE)
6674 Vector = Vector->getOperand(0);
6675 SDLoc DL(N);
6676 EVT VectorVT = Vector.getValueType();
6677 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
6678 IsPTXVectorType(VectorVT.getSimpleVT()))
6679 return SDValue(); // Native vector loads already combine nicely w/
6680 // extract_vector_elt.
6681 // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
6682 // we already handle them OK.
6683 if (VectorVT.getVectorNumElements() == 1 ||
6684 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
6685 return SDValue();
6686
6687 // Don't mess with undef values as sra may be simplified to 0, not undef.
6688 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
6689 return SDValue();
6690
6691 uint64_t VectorBits = VectorVT.getSizeInBits();
6692 // We only handle the types we can extract in-register.
6693 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
6694 return SDValue();
6695
6696 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
6697 // Index == 0 is handled by generic DAG combiner.
6698 if (!Index || Index->getZExtValue() == 0)
6699 return SDValue();
6700
// IVT is an integer as wide as the whole vector; EltIVT is an integer as wide
// as a single element.
6701 MVT IVT = MVT::getIntegerVT(VectorBits);
6702 EVT EltVT = VectorVT.getVectorElementType();
6703 EVT EltIVT = EltVT.changeTypeToInteger();
6704 uint64_t EltBits = EltVT.getScalarSizeInBits();
6705
// Shift the wanted element down to bit 0 (by Index * EltBits) and truncate
// away everything above it.
6706 SDValue Result = DCI.DAG.getNode(
6707 ISD::TRUNCATE, DL, EltIVT,
6708 DCI.DAG.getNode(
6709 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
6710 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
6711
6712 // If element has non-integer type, bitcast it back to the expected type.
6713 if (EltVT != EltIVT)
6714 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
6715 // Past legalizer, we may need to extend i8 -> i16 to match the register type.
6716 if (EltVT != N->getValueType(0))
6717 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
6718
6719 return Result;
6720}
6721
6722/// Transform patterns like:
6723/// (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt))
6724/// (select (ult shift_amt, BitWidth), (srl/shl x, shift_amt), 0)
6725/// Into:
6726/// (NVPTXISD::SRL_CLAMP x, shift_amt) or (NVPTXISD::SHL_CLAMP x, shift_amt)
6727///
6728/// These patterns arise from C/C++ code like `shift >= 32 ? 0 : x >> shift`
6729/// which guards against undefined behavior. PTX shr/shl instructions clamp
6730/// shift amounts >= BitWidth to produce 0 for logical shifts, making the
6731/// guard redundant.
6732///
6733/// Note: We only handle SRL and SHL, not SRA, because arithmetic right
6734/// shifts could produce 0 or -1 when shift >= BitWidth.
6735/// Note: We don't handle uge or ule. These don't appear because of
6736/// canonicalization.
// NOTE(review): the signature line(s) are missing from this listing; the
// dispatcher in this file calls PerformSELECTShiftCombine(N, DCI) for SELECT,
// which presumably is this function - confirm.
// This combine only runs after DAG legalization.
6739 if (!DCI.isAfterLegalizeDAG())
6740 return SDValue();
6741
6742 using namespace SDPatternMatch;
6743 unsigned BitWidth = N->getValueType(0).getSizeInBits();
6744 SDValue ShiftAmt, ShiftOp;
6745
6746 // Match logical shifts where the shift amount in the guard matches the shift
6747 // amount in the operation.
// ShiftOp captures the whole shift node; the shift amount may be wrapped in a
// truncate of the guard's ShiftAmt.
6748 auto LogicalShift =
6749 m_AllOf(m_Value(ShiftOp),
6750 m_AnyOf(m_Srl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt))),
6751 m_Shl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt)))));
6752
6753 // shift_amt > BitWidth-1 ? 0 : shift_op
// NOTE(review): in both matches below the line with the constant compared
// against ShiftAmt is missing from this listing.
6754 bool MatchedUGT =
6755 sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),
6757 m_SpecificCondCode(ISD::SETUGT)),
6758 m_Zero(), LogicalShift));
6759 // shift_amt < BitWidth ? shift_op : 0
6760 bool MatchedULT =
6761 !MatchedUGT &&
6762 sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),
6764 m_SpecificCondCode(ISD::SETULT)),
6765 LogicalShift, m_Zero()));
6766
6767 if (!MatchedUGT && !MatchedULT)
6768 return SDValue();
6769
6770 // Return a clamp shift operation, which has the same semantics as PTX shift.
// The clamp node reuses the matched shift's own operands and result type.
6771 unsigned ClampOpc = ShiftOp.getOpcode() == ISD::SRL ? NVPTXISD::SRL_CLAMP
6772 : NVPTXISD::SHL_CLAMP;
6773 return DCI.DAG.getNode(ClampOpc, SDLoc(N), ShiftOp.getValueType(),
6774 ShiftOp.getOperand(0), ShiftOp.getOperand(1));
6775}
6776
// Splits a v4i8 VSELECT into four scalar i32 SELECTs and rebuilds the v4i8
// result from them.
// NOTE(review): the signature line(s) are missing from this listing; the
// dispatcher in this file calls PerformVSELECTCombine(N, DCI) for VSELECT,
// which presumably is this function - confirm.
6779 SDValue VA = N->getOperand(1);
6780 EVT VectorVT = VA.getValueType();
6781 if (VectorVT != MVT::v4i8)
6782 return SDValue();
6783
6784 // We need to split vselect into individual per-element operations. Because we
6785 // use BFE/BFI instruction for byte extraction/insertion, we do end up with
6786 // 32-bit values, so we may as well do comparison as i32 to avoid conversions
6787 // to/from i16 normally used for i8 values.
// NOTE(review): the declaration of the result container `E` is missing from
// this listing.
6789 SDLoc DL(N);
6790 SDValue VCond = N->getOperand(0);
6791 SDValue VB = N->getOperand(2);
6792 for (int I = 0; I < 4; ++I) {
// Per lane: extract the i1 condition and both i8 inputs, widen the inputs to
// i32, select, then narrow the selected value back to i8.
6793 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6794 DCI.DAG.getConstant(I, DL, MVT::i32));
6795 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6796 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6797 DCI.DAG.getConstant(I, DL, MVT::i32)),
6798 DL, MVT::i32);
6799 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6800 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6801 DCI.DAG.getConstant(I, DL, MVT::i32)),
6802 DL, MVT::i32);
6803 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6804 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6805 }
6806 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6807}
6808
// Replaces a two-element 32-bit packed BUILD_VECTOR whose elements are i16
// truncations of i32 values with a single PRMT that selects the needed bytes
// directly from the i32 sources.
// NOTE(review): the name and parameter list are missing from this listing; the
// dispatcher in this file calls PerformBUILD_VECTORCombine(N, DCI) for
// BUILD_VECTOR, which presumably is this function - confirm.
6809static SDValue
6811 auto VT = N->getValueType(0);
6812 if (!DCI.isAfterLegalizeDAG() ||
6813 // only process v2*16 types
6814 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6815 VT.getVectorNumElements() == 2))
6816 return SDValue();
6817
6818 auto Op0 = N->getOperand(0);
6819 auto Op1 = N->getOperand(1);
6820
6821 // Start out by assuming we want to take the lower 2 bytes of each i32
6822 // operand.
6823 uint64_t Op0Bytes = 0x10;
6824 uint64_t Op1Bytes = 0x54;
6825
// Pair each operand with its selector so the loop below can update both
// through pointers.
6826 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6827 {&Op1, &Op1Bytes}};
6828
6829 // Check that each operand is an i16, truncated from an i32 operand. We'll
6830 // select individual bytes from those original operands. Optionally, fold in a
6831 // shift right of that original operand.
6832 for (auto &[Op, OpBytes] : OpData) {
6833 // Eat up any bitcast
6834 if (Op->getOpcode() == ISD::BITCAST)
6835 *Op = Op->getOperand(0);
6836
6837 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6838 Op->getOperand(0).getValueType() == MVT::i32))
6839 return SDValue();
6840
6841 // If the truncate has multiple uses, this optimization can increase
6842 // register pressure
6843 if (!Op->hasOneUse())
6844 return SDValue();
6845
// Step through the truncate to the original i32 value.
6846 *Op = Op->getOperand(0);
6847
6848 // Optionally, fold in a shift-right of the original operand and let permute
6849 // pick the two higher bytes of the original value directly.
6850 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6851 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6852 // Shift the PRMT byte selector to pick upper bytes from each respective
6853 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6854 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6855 "PRMT selector values out of range");
6856 *OpBytes += 0x22;
6857 *Op = Op->getOperand(0);
6858 }
6859 }
6860 }
6861
6862 SDLoc DL(N);
6863 auto &DAG = DCI.DAG;
6864
// Op1's selector bytes form the upper half of the combined selector and Op0's
// the lower half.
6865 auto PRMT =
6866 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6867 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6868 return DAG.getBitcast(VT, PRMT);
6869}
6870
// Folds a round-trip pair of address-space casts back to the original value.
// NOTE(review): the signature line(s) are missing from this listing; the
// dispatcher in this file calls combineADDRSPACECAST(N, DCI), which presumably
// is this function - confirm.
6873 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6874
6875 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
// The inner cast's destination is expected to be the outer cast's source.
6876 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6877
6878 // Fold asc[B -> A](asc[A -> B](x)) -> x
6879 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6880 return ASCN2->getOperand(0);
6881 }
6882
6883 return SDValue();
6884}
6885
6886// Given a constant selector value and a prmt mode, return the selector value
6887// normalized to the generic prmt mode. See the PTX ISA documentation for more
6888// details:
6889// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
6890static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6891 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6892
// NOTE(review): the condition guarding this early return is missing from this
// listing; presumably it tests for the generic mode, in which the selector is
// already in normalized form - confirm.
6894 return Selector;
6895
// Only the low two bits of the selector are used by the non-generic modes.
6896 const unsigned V = Selector.trunc(2).getZExtValue();
6897
// Packs four per-byte selectors into nibbles, S0 in the lowest nibble.
6898 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6899 unsigned S3) {
6900 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6901 };
6902
// NOTE(review): every `case` label of this switch is missing from this
// listing, so which mode each return belongs to cannot be read here - consult
// the original file before editing.
6903 switch (Mode) {
6905 return GetSelector(V, V + 1, V + 2, V + 3);
6907 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6909 return GetSelector(V, V, V, V);
6911 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6913 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6915 unsigned V1 = (V & 1) << 1;
6916 return GetSelector(V1, V1 + 1, V1, V1 + 1);
6917 }
6918 default:
6919 llvm_unreachable("Invalid PRMT mode");
6920 }
6921}
6922
6923static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6924 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6925 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6926 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6927 APInt BitField = B.concat(A);
6928 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6929 APInt Result(32, 0);
6930 for (unsigned I : llvm::seq(4U)) {
6931 APInt Sel = SelectorVal.extractBits(4, I * 4);
6932 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6933 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6934 APInt Byte = BitField.extractBits(8, Idx * 8);
6935 if (Sign)
6936 Byte = Byte.ashr(8);
6937 Result.insertBits(Byte, I * 8);
6938 }
6939 return Result;
6940}
6941
// Constant-folds a PRMT node whose two data operands and selector are all
// constants.
// NOTE(review): the first line(s) of this signature are absent from this
// listing; the dispatcher in this file calls combinePRMT(N, DCI, OptLevel),
// which presumably is this function - confirm.
6943 CodeGenOptLevel OptLevel) {
6944 if (OptLevel == CodeGenOptLevel::None)
6945 return SDValue();
6946
6947 // Constant fold PRMT
// Operands 0-2 are checked for constness here; operand 3 (passed as the mode
// to computePRMT) is read as a constant without a check.
6948 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6949 isa<ConstantSDNode>(N->getOperand(1)) &&
6950 isa<ConstantSDNode>(N->getOperand(2)))
6951 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6952 N->getConstantOperandAPInt(1),
6953 N->getConstantOperandAPInt(2),
6954 N->getConstantOperandVal(3)),
6955 SDLoc(N), N->getValueType(0));
6956 return SDValue();
6957}
6958
6959// During call lowering we wrap the return values in a ProxyReg node which
6960// depend on the chain value produced by the completed call. This ensures that
6961// the full call is emitted in cases where libcalls are used to legalize
6962// operations. To improve the functioning of other DAG combines we pull all
6963// operations we can through one of these nodes, ensuring that the ProxyReg
6964// directly wraps a load. That is:
6965//
6966// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6967//
// NOTE(review): the signature line(s) are missing from this listing; the
// recursive calls below have the form sinkProxyReg(R, Chain, DCI) - confirm
// the parameter types against the original file.
// Returns the rebuilt expression, or an empty SDValue if any part of the tree
// under R cannot be handled.
6970 switch (R.getOpcode()) {
// Single-operand conversions: sink into the operand and re-apply the same
// opcode on top.
6971 case ISD::TRUNCATE:
6972 case ISD::ANY_EXTEND:
6973 case ISD::SIGN_EXTEND:
6974 case ISD::ZERO_EXTEND:
6975 case ISD::BITCAST: {
6976 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6977 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6978 return SDValue();
6979 }
// Two-operand ops: both operands must be sinkable.
6980 case ISD::SHL:
6981 case ISD::SRL:
6982 case ISD::SRA:
6983 case ISD::OR: {
6984 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6985 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6986 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6987 return SDValue();
6988 }
// Constants are returned as-is, without a ProxyReg wrapper.
6989 case ISD::Constant:
6990 return R;
// Loads are the leaves: this is where the ProxyReg ends up.
6991 case ISD::LOAD:
6992 case NVPTXISD::LoadV2:
6993 case NVPTXISD::LoadV4: {
6994 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6995 {Chain, R});
6996 }
6997 case ISD::BUILD_VECTOR: {
6998 if (DCI.isBeforeLegalize())
6999 return SDValue();
7000
// NOTE(review): the declaration of `Ops` is missing from this listing.
7002 for (auto &Op : R->ops()) {
7003 SDValue V = sinkProxyReg(Op, Chain, DCI);
7004 if (!V)
7005 return SDValue();
7006 Ops.push_back(V);
7007 }
7008 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
7009 }
// NOTE(review): the `case` label opening this block and the first line of the
// getNode call inside it are missing from this listing.
7011 if (DCI.isBeforeLegalize())
7012 return SDValue();
7013
7014 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
7016 R.getValueType(), V, R.getOperand(1));
7017 return SDValue();
7018 }
7019 default:
7020 return SDValue();
7021 }
7022}
7023
7024static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID) {
7025 switch (AddIntrinsicID) {
7026 default:
7027 break;
7028 case Intrinsic::nvvm_add_rn_sat_f16:
7029 case Intrinsic::nvvm_add_rn_sat_v2f16:
7030 return NVPTXISD::SUB_RN_SAT;
7031 case Intrinsic::nvvm_add_rn_ftz_sat_f16:
7032 case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
7033 return NVPTXISD::SUB_RN_FTZ_SAT;
7034 }
7035 llvm_unreachable("Invalid F16 add intrinsic");
7036}
7037
// Rewrites a saturating f16 add intrinsic with one negated operand as the
// matching subtract node: add(a, fneg(b)) and add(fneg(b), a) both become
// sub(a, b).
// NOTE(review): the first line of this signature is absent from this listing;
// a caller in this file invokes combineF16AddWithNeg(N, DCI.DAG, IID), which
// presumably is this function - confirm.
7039 Intrinsic::ID AddIntrinsicID) {
// Operand 0 is skipped here; the caller in this file reads it as the
// intrinsic ID.
7040 SDValue Op1 = N->getOperand(1);
7041 SDValue Op2 = N->getOperand(2);
7042
7043 SDValue SubOp1, SubOp2;
7044
// Op1 is checked first, so if both operands are FNEGs only Op1's negation is
// folded and Op2 is used as-is.
7045 if (Op1.getOpcode() == ISD::FNEG) {
7046 SubOp1 = Op2;
7047 SubOp2 = Op1.getOperand(0);
7048 } else if (Op2.getOpcode() == ISD::FNEG) {
7049 SubOp1 = Op1;
7050 SubOp2 = Op2.getOperand(0);
7051 } else {
7052 return SDValue();
7053 }
7054
7055 SDLoc DL(N);
7056 return DAG.getNode(getF16SubOpc(AddIntrinsicID), DL, N->getValueType(0),
7057 SubOp1, SubOp2);
7058}
7059
// Dispatches combines for chain-less intrinsic nodes by intrinsic ID.
// NOTE(review): the first line(s) of this signature are absent from this
// listing; the dispatcher in this file calls combineIntrinsicWOChain(N, DCI,
// STI), which presumably is this function - confirm.
7062 const NVPTXSubtarget &STI) {
// Operand 0 holds the intrinsic ID as a constant.
7063 unsigned IID = N->getConstantOperandVal(0);
7064
7065 switch (IID) {
7066 default:
7067 break;
// Only the four saturating f16/v2f16 add intrinsics are handled here.
7068 case Intrinsic::nvvm_add_rn_sat_f16:
7069 case Intrinsic::nvvm_add_rn_ftz_sat_f16:
7070 case Intrinsic::nvvm_add_rn_sat_v2f16:
7071 case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
7072 return combineF16AddWithNeg(N, DCI.DAG, IID);
7073 }
7074 return SDValue();
7075}
7076
// Combine for ProxyReg nodes: pushes the ProxyReg down towards the load it
// ultimately wraps.
// NOTE(review): the signature line(s) are missing from this listing; the
// dispatcher in this file calls combineProxyReg(N, DCI), which presumably is
// this function - confirm.
7079
7080 SDValue Chain = N->getOperand(0);
7081 SDValue Reg = N->getOperand(1);
7082
7083 // If the ProxyReg is not wrapping a load, try to pull the operations through
7084 // the ProxyReg.
7085 if (Reg.getOpcode() != ISD::LOAD) {
7086 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
7087 return V;
7088 }
7089
7090 return SDValue();
7091}
7092
// Entry point for NVPTX target-specific DAG combines: dispatches on the
// opcode of N to the matching helper and returns an empty SDValue for
// opcodes with no handler.
7093SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
7094 DAGCombinerInfo &DCI) const {
// NOTE(review): the line preceding the switch is missing from this listing;
// `OptLevel` (and possibly `STI`), used below, are presumably set up there -
// confirm against the original file.
7096 switch (N->getOpcode()) {
7097 default:
7098 break;
7099 case ISD::ADD:
7100 return PerformADDCombine(N, DCI, OptLevel);
7101 case ISD::ADDRSPACECAST:
7102 return combineADDRSPACECAST(N, DCI);
7103 case ISD::SIGN_EXTEND:
7104 case ISD::ZERO_EXTEND:
7105 return combineMulWide(N, DCI, OptLevel);
7106 case ISD::BUILD_VECTOR:
7107 return PerformBUILD_VECTORCombine(N, DCI);
// NOTE(review): the `case` label for the following return is missing from
// this listing.
7109 return PerformEXTRACTCombine(N, DCI);
7110 case ISD::FADD:
7111 return PerformFADDCombine(N, DCI, OptLevel);
7112 case ISD::FMA:
7113 case ISD::FMUL:
7114 case ISD::FSUB:
7115 return PerformScalarizeV2F32Op(N, DCI);
7116 case ISD::FMAXNUM:
7117 case ISD::FMINNUM:
7118 case ISD::FMAXIMUM:
7119 case ISD::FMINIMUM:
7120 case ISD::FMAXIMUMNUM:
7121 case ISD::FMINIMUMNUM:
7122 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
7123 STI.getSmVersion());
7124 case ISD::LOAD:
7125 case NVPTXISD::LoadV2:
7126 case NVPTXISD::LoadV4:
7127 return combineLOAD(N, DCI, STI);
7128 case ISD::MUL:
7129 return PerformMULCombine(N, DCI, OptLevel);
7130 case NVPTXISD::PRMT:
7131 return combinePRMT(N, DCI, OptLevel);
7132 case NVPTXISD::ProxyReg:
7133 return combineProxyReg(N, DCI);
7134 case ISD::SETCC:
7135 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
7136 case ISD::SHL:
7137 return PerformSHLCombine(N, DCI, OptLevel);
7138 case ISD::SREM:
7139 case ISD::UREM:
7140 return PerformREMCombine(N, DCI, OptLevel);
7141 case ISD::STORE:
7142 case NVPTXISD::StoreV2:
7143 case NVPTXISD::StoreV4:
7144 return combineSTORE(N, DCI, STI);
7145 case ISD::SELECT:
7146 return PerformSELECTShiftCombine(N, DCI);
7147 case ISD::VSELECT:
7148 return PerformVSELECTCombine(N, DCI);
// NOTE(review): the `case` label for the following return is missing from
// this listing.
7150 return combineIntrinsicWOChain(N, DCI, STI);
7151 }
7152 return SDValue();
7153}
7154
// Custom result replacement for a bitcast producing v2i8: bitcasts the source
// to i16 and splits it into its two bytes.
// NOTE(review): the signature line(s) are missing from this listing, so the
// function name and the declarations of `Node`, `DAG` and `Results` cannot be
// seen here - confirm against the original file.
7157 // Handle bitcasting to v2i8 without hitting the default promotion
7158 // strategy which goes through stack memory.
7159 SDValue Op(Node, 0);
7160 EVT ToVT = Op->getValueType(0);
// Any other destination type is left alone; nothing is pushed to Results.
7161 if (ToVT != MVT::v2i8) {
7162 return;
7163 }
7164
7165 // Bitcast to i16 and unpack elements into a vector
7166 SDLoc DL(Node);
7167 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
// Element 0 is the low byte; element 1 is the high byte, obtained by a
// logical shift right by 8.
7168 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
7169 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
7170 SDValue Vec1 =
7171 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
7172 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
7173 Results.push_back(
7174 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
7175}
7176
7179 SDValue Chain = N->getOperand(0);
7180 SDValue Intrin = N->getOperand(1);
7181 SDLoc DL(N);
7182
7183 // Get the intrinsic ID
7184 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
7185 switch (IntrinNo) {
7186 default:
7187 return;
7188 case Intrinsic::nvvm_ldu_global_i:
7189 case Intrinsic::nvvm_ldu_global_f:
7190 case Intrinsic::nvvm_ldu_global_p: {
7191 EVT ResVT = N->getValueType(0);
7192
7193 if (ResVT.isVector()) {
7194 // Vector LDG/LDU
7195
7196 unsigned NumElts = ResVT.getVectorNumElements();
7197 EVT EltVT = ResVT.getVectorElementType();
7198
7199 // Since LDU/LDG are target nodes, we cannot rely on DAG type
7200 // legalization.
7201 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
7202 // loaded type to i16 and propagate the "real" type as the memory type.
7203 bool NeedTrunc = false;
7204 if (EltVT.getSizeInBits() < 16) {
7205 EltVT = MVT::i16;
7206 NeedTrunc = true;
7207 }
7208
7209 unsigned Opcode = 0;
7210 SDVTList LdResVTs;
7211
7212 switch (NumElts) {
7213 default:
7214 return;
7215 case 2:
7216 Opcode = NVPTXISD::LDUV2;
7217 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
7218 break;
7219 case 4: {
7220 Opcode = NVPTXISD::LDUV4;
7221 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
7222 LdResVTs = DAG.getVTList(ListVTs);
7223 break;
7224 }
7225 }
7226
7227 SmallVector<SDValue, 8> OtherOps;
7228
7229 // Copy regular operands
7230
7231 OtherOps.push_back(Chain); // Chain
7232 // Skip operand 1 (intrinsic ID)
7233 // Others
7234 OtherOps.append(N->op_begin() + 2, N->op_end());
7235
7237
7238 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
7239 MemSD->getMemoryVT(),
7240 MemSD->getMemOperand());
7241
7242 SmallVector<SDValue, 4> ScalarRes;
7243
7244 for (unsigned i = 0; i < NumElts; ++i) {
7245 SDValue Res = NewLD.getValue(i);
7246 if (NeedTrunc)
7247 Res =
7248 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
7249 ScalarRes.push_back(Res);
7250 }
7251
7252 SDValue LoadChain = NewLD.getValue(NumElts);
7253
7254 SDValue BuildVec =
7255 DAG.getBuildVector(ResVT, DL, ScalarRes);
7256
7257 Results.push_back(BuildVec);
7258 Results.push_back(LoadChain);
7259 } else {
7260 // i8 LDG/LDU
7261 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
7262 "Custom handling of non-i8 ldu/ldg?");
7263
7264 // Just copy all operands as-is
7266
7267 // Force output to i16
7268 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
7269
7271
7272 // We make sure the memory type is i8, which will be used during isel
7273 // to select the proper instruction.
7274 SDValue NewLD =
7276 MVT::i8, MemSD->getMemOperand());
7277
7278 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
7279 NewLD.getValue(0)));
7280 Results.push_back(NewLD.getValue(1));
7281 }
7282 return;
7283 }
7284
7285 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
7286 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
7287 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
7288 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
7289 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
7290 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
7291 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
7292 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
7293 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
7294 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
7295 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
7296 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
7297 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
7298 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
7299 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
7300 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
7301 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
7302 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
7303 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
7304 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
7305 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
7306 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
7307 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
7308 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
7309 if (auto Res = lowerTcgen05Ld(N, DAG)) {
7310 Results.push_back(Res->first);
7311 Results.push_back(Res->second);
7312 }
7313 return;
7314
7315 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
7316 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
7317 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
7318 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
7319 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
7320 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
7321 if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
7322 Results.push_back(Res->first);
7323 Results.push_back(Res->second);
7324 }
7325 return;
7326
7327 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
7328 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
7329 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
7330 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
7331 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
7332 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
7333 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
7334 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
7335 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
7336 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
7337 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
7338 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
7339 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32:
7340 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32:
7341 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32:
7342 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32:
7343 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32:
7344 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32:
7345 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32:
7346 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32:
7347 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32:
7348 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32:
7349 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32:
7350 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32:
7351 if (auto Res = lowerTcgen05LdRed(N, DAG)) {
7352 Results.push_back(std::get<0>(*Res));
7353 Results.push_back(std::get<1>(*Res));
7354 Results.push_back(std::get<2>(*Res));
7355 }
7356 return;
7357 }
7358}
7359
// Body of ReplaceCopyFromReg_128. The declaration line is absent from this
// listing; the reference index later in this file gives it as
//   static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
//                                      SmallVectorImpl<SDValue> &Results)
// Three values are appended to Results, one for each of N's results 0..2.
7362 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
7363 // result so that it can pass the legalization
7364 SDLoc DL(N);
7365 SDValue Chain = N->getOperand(0);
7366 SDValue Reg = N->getOperand(1);
7367 SDValue Glue = N->getOperand(2);
7368
7369 assert(Reg.getValueType() == MVT::i128 &&
7370 "Custom lowering for CopyFromReg with 128-bit reg only");
// The replacement node has four results: two i64 values followed by results
// whose types are copied from N's results 1 and 2 (presumably chain and glue,
// matching the operand names above; confirm against the node definition).
7371 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
7372 N->getValueType(2)};
7373 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
7374
7375 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
// Re-assemble the i128 value from the two i64 results (result 0 is passed as
// the first BUILD_PAIR operand, result 1 as the second).
7376 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
7377 {NewValue.getValue(0), NewValue.getValue(1)});
7378
// The new node's results 2 and 3 stand in for N's results 1 and 2; the extra
// i64 result shifts their indices up by one.
7379 Results.push_back(Pair);
7380 Results.push_back(NewValue.getValue(2));
7381 Results.push_back(NewValue.getValue(3));
7382 }
7383
// replaceProxyReg: rebuilds a ProxyReg node on the register type that TLI
// reports for its operand's value type. Parts of the declaration are absent
// from this listing; the reference index later in this file gives it as
//   static void replaceProxyReg(SDNode *N, SelectionDAG &DAG,
//                               const TargetLowering &TLI,
//                               SmallVectorImpl<SDValue> &Results)
// Exactly one value is appended to Results.
7385 const TargetLowering &TLI,
7387 SDValue Chain = N->getOperand(0);
7388 SDValue Reg = N->getOperand(1);
7389
// Register type chosen by the target for the operand's current value type.
7390 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
7391
// Convert the operand to VT, wrap it in a new ProxyReg of type VT, then
// convert the result back to the type N originally produced.
7392 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
7393 SDValue NewProxy =
7394 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
7395 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
7396
7397 Results.push_back(Res);
7398 }
7399
7401 const NVPTXSubtarget &STI,
7403 assert(N->getValueType(0) == MVT::i128 &&
7404 "Custom lowering for atomic128 only supports i128");
7405
7407 SDLoc dl(N);
7408
7409 if (!STI.hasAtomSwap128()) {
7412 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
7413 "requires target sm_90.",
7414 dl.getDebugLoc()));
7415
7416 Results.push_back(DAG.getUNDEF(MVT::i128));
7417 Results.push_back(AN->getOperand(0)); // Chain
7418 return;
7419 }
7420
7422 Ops.push_back(AN->getOperand(0)); // Chain
7423 Ops.push_back(AN->getOperand(1)); // Ptr
7424 for (const auto &Op : AN->ops().drop_front(2)) {
7425 // Low part
7426 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
7427 DAG.getIntPtrConstant(0, dl)));
7428 // High part
7429 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
7430 DAG.getIntPtrConstant(1, dl)));
7431 }
7432 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
7435 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
7436 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
7437 AN->getMemOperand());
7438 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
7439 {Result.getValue(0), Result.getValue(1)}));
7440 Results.push_back(Result.getValue(2));
7441}
7442
7443void NVPTXTargetLowering::ReplaceNodeResults(
7445 switch (N->getOpcode()) {
7446 default:
7447 report_fatal_error("Unhandled custom legalization");
7448 case ISD::BITCAST:
7449 ReplaceBITCAST(N, DAG, Results);
7450 return;
7451 case ISD::LOAD:
7452 case ISD::MLOAD:
7453 replaceLoadVector(N, DAG, Results, STI);
7454 return;
7457 return;
7458 case ISD::CopyFromReg:
7460 return;
7461 case NVPTXISD::ProxyReg:
7462 replaceProxyReg(N, DAG, *this, Results);
7463 return;
7465 case ISD::ATOMIC_SWAP:
7466 replaceAtomicSwap128(N, DAG, STI, Results);
7467 return;
7468 }
7469}
7470
7473 Type *Ty = AI->getValOperand()->getType();
7474
7475 if (AI->isFloatingPointOperation()) {
7477 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
7478 STI.getPTXVersion() >= 63)
7480 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
7481 STI.getPTXVersion() >= 78)
7483 if (Ty->isFloatTy())
7485 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
7487 }
7489 }
7490
7491 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
7492 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
7493
7494 switch (AI->getOperation()) {
7495 default:
7498 if (BitWidth == 128)
7500 [[fallthrough]];
7504 switch (BitWidth) {
7505 case 8:
7506 case 16:
7508 case 32:
7510 case 64:
7511 if (STI.hasAtomBitwise64())
7514 case 128:
7516 default:
7517 llvm_unreachable("unsupported width encountered");
7518 }
7525 switch (BitWidth) {
7526 case 8:
7527 case 16:
7529 case 32:
7531 case 64:
7532 if (STI.hasAtomMinMax64())
7535 case 128:
7537 default:
7538 llvm_unreachable("unsupported width encountered");
7539 }
7542 switch (BitWidth) {
7543 case 32:
7545 case 8:
7546 case 16:
7547 case 64:
7548 case 128:
7550 default:
7551 llvm_unreachable("unsupported width encountered");
7552 }
7553 }
7554
7556}
7557
7559 const Instruction *I) const {
7560 // This function returns true iff the operation is emulated using a CAS-loop,
7561 // or if it has the memory order seq_cst (which is not natively supported in
7562 // the PTX `atom` instruction).
7563 //
7564 // atomicrmw and cmpxchg instructions not efficiently supported by PTX
7565 // are lowered to CAS emulation loops that preserve their memory order,
7566 // syncscope, and volatile semantics. For PTX, it is more efficient to use
7567 // atom.cas.relaxed.sco instructions within the loop, and fences before and
7568 // after the loop to restore order.
7569 //
7570 // Atomic instructions efficiently supported by PTX are lowered to
7571 // `atom.<op>.<sem>.<scope>` instructions with their corresponding memory order
7572 // and scope. Since PTX does not support seq_cst, we emulate it by lowering to
7573 // a fence.sc followed by an atom according to the PTX atomics ABI
7574 // https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/atomic-abi.html
7575 if (auto *CI = dyn_cast<AtomicCmpXchgInst>(I))
7576 return (cast<IntegerType>(CI->getCompareOperand()->getType())
7577 ->getBitWidth() < STI.getMinCmpXchgSizeInBits()) ||
7578 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent;
7579 if (auto *RI = dyn_cast<AtomicRMWInst>(I))
7581 RI->getOrdering() == AtomicOrdering::SequentiallyConsistent;
7582 return false;
7583}
7584
7586 const Instruction *I) const {
7587 // If the operation is emulated by a CAS-loop, we lower the instruction to
7588 // atom.<op>.relaxed, since AtomicExpandPass will insert fences for enforcing
7589 // the correct memory ordering around the CAS loop.
7590 //
7591 // When the operation is not emulated, but the memory order is seq_cst,
7592 // we must lower to "fence.sc.<scope>; atom.<op>.acquire.<scope>;" to conform
7593 // to the PTX atomics ABI.
7594 // https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/atomic-abi.html
7595 // For such cases, emitLeadingFence() will separately insert the leading
7596 // "fence.sc.<scope>;". Here, we only set the memory order to acquire.
7597 //
7598 // Otherwise, the operation is not emulated, and the memory order is not
7599 // seq_cst. In this case, the LLVM memory order is natively supported by the
7600 // PTX `atom` instruction, and we just lower to the corresponding
7601 // `atom.<op>.relaxed|acquire|release|acq_rel`. For such cases, this function
7602 // will NOT be called.
7603 // prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7604 // I before its memory order was modified.
7605 if (auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
7606 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
7607 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
7608 STI.getMinCmpXchgSizeInBits())
7610 else if (auto *RI = dyn_cast<AtomicRMWInst>(I);
7611 RI && RI->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
7614
7616}
7617
7619 Instruction *Inst,
7620 AtomicOrdering Ord) const {
7621 // prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7622 // `Inst` before its memory order was modified. We cannot enforce this with an
7623 // assert, because AtomicExpandPass will have modified the memory order
7624 // between the initial call to shouldInsertFencesForAtomic() and the call to
7625 // this function.
7626 if (!isa<AtomicCmpXchgInst>(Inst) && !isa<AtomicRMWInst>(Inst))
7627 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
7628
7629 // Specialize for cmpxchg and atomicrmw
7630 auto SSID = getAtomicSyncScopeID(Inst);
7631 assert(SSID.has_value() && "Expected an atomic operation");
7632
7633 if (isReleaseOrStronger(Ord))
7634 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
7637 SSID.value());
7638
7639 return nullptr;
7640}
7641
7643 Instruction *Inst,
7644 AtomicOrdering Ord) const {
7645 // prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7646 // `Inst` before its memory order was modified. See `emitLeadingFence` for why
7647 // this cannot be enforced with an assert. Specialize for cmpxchg and
7648 // atomicrmw
7649 auto *CI = dyn_cast<AtomicCmpXchgInst>(Inst);
7650 auto *RI = dyn_cast<AtomicRMWInst>(Inst);
7651 if (!CI && !RI)
7652 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
7653
7654 auto SSID = getAtomicSyncScopeID(Inst);
7655 assert(SSID.has_value() && "Expected an atomic operation");
7656
7657 bool IsEmulated =
7658 CI ? cast<IntegerType>(CI->getCompareOperand()->getType())
7659 ->getBitWidth() < STI.getMinCmpXchgSizeInBits()
7661
7662 if (isAcquireOrStronger(Ord) && IsEmulated)
7663 return Builder.CreateFence(AtomicOrdering::Acquire, SSID.value());
7664
7665 return nullptr;
7666}
7667
7668// Rather than default to SINT when both UINT and SINT are custom, we only
7669// change the opcode when UINT is not legal and SINT is. UINT is preferred when
7670// both are custom since unsigned CVT instructions can lead to slightly better
7671// SASS code with fewer instructions.
7673 EVT ToVT) const {
7674 if (isOperationLegal(Op, ToVT))
7675 return Op;
7676 switch (Op) {
7677 case ISD::FP_TO_UINT:
7679 return ISD::FP_TO_SINT;
7680 break;
7684 break;
7685 case ISD::VP_FP_TO_UINT:
7686 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
7687 return ISD::VP_FP_TO_SINT;
7688 break;
7689 default:
7690 break;
7691 }
7692 return Op;
7693}
7694
7695// Pin NVPTXTargetObjectFile's vtables to this file.
7697
7702
// computeKnownBitsForPRMT: derives the known bits of a PRMT node's 32-bit
// result from the known bits of its two 32-bit inputs. The first declaration
// line is absent from this listing; the reference index later in this file
// gives it as
//   static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known,
//                                       const SelectionDAG &DAG,
//                                       unsigned Depth)
// Known is written one byte at a time; it is left untouched when the selector
// operand is not a constant.
7704 const SelectionDAG &DAG, unsigned Depth) {
7705 SDValue A = Op.getOperand(0);
7706 SDValue B = Op.getOperand(1);
7707 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
7708 unsigned Mode = Op.getConstantOperandVal(3);
7709
// Without a constant selector the byte routing is unknown, so nothing can be
// concluded.
7710 if (!Selector)
7711 return;
7712
7713 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
7714 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
7715
7716 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
7717 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
7718 "PRMT must have i32 operands");
7719 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
// 64-bit view of both inputs; per the layout comment above, byte indices 0-3
// come from A and 4-7 from B (assumes concat places its argument in the low
// half -- confirm against KnownBits::concat).
7720 KnownBits BitField = BKnown.concat(AKnown);
7721
7722 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
// One 4-bit selector per result byte: the low three bits pick a byte of
// BitField, the top bit is the sign flag.
7723 for (unsigned I : llvm::seq(4)) {
7724 APInt Sel = SelectorVal.extractBits(4, I * 4);
7725 unsigned Idx = Sel.getLoBits(3).getZExtValue();
7726 unsigned Sign = Sel.getHiBits(1).getZExtValue();
7727 KnownBits Byte = BitField.extractBits(8, Idx * 8);
// NOTE(review): the second argument is the integer literal 8 while Byte is
// 8 bits wide; confirm how it converts to the shift-amount operand that
// KnownBits::ashr expects, and whether a constant shift by 7 was intended.
7728 if (Sign)
7729 Byte = KnownBits::ashr(Byte, 8);
7730 Known.insertBits(Byte, I * 8);
7731 }
7732 }
7733
7734static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
7736
7737 // We can't do anything without knowing the sign bit.
7738 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
7739 if (ExtType == ISD::SEXTLOAD)
7740 return;
7741
7742 // ExtLoading to vector types is weird and may not work well with known bits.
7743 auto DestVT = LD->getValueType(0);
7744 if (DestVT.isVector())
7745 return;
7746
7747 assert(Known.getBitWidth() == DestVT.getSizeInBits());
7748 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
7749 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
7750}
7751
7753 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
7754 const SelectionDAG &DAG, unsigned Depth) const {
7755 Known.resetAll();
7756
7757 switch (Op.getOpcode()) {
7758 case NVPTXISD::PRMT:
7759 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
7760 break;
7761 case NVPTXISD::LoadV2:
7762 case NVPTXISD::LoadV4:
7763 case NVPTXISD::LoadV8:
7765 break;
7766 default:
7767 break;
7768 }
7769}
7770
7771static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
7772 const APInt &DemandedBits) {
7773 APInt DemandedLHS = APInt(32, 0);
7774 APInt DemandedRHS = APInt(32, 0);
7775
7776 for (unsigned I : llvm::seq(4)) {
7777 if (DemandedBits.extractBits(8, I * 8).isZero())
7778 continue;
7779
7780 APInt Sel = SelectorVal.extractBits(4, I * 4);
7781 unsigned Idx = Sel.getLoBits(3).getZExtValue();
7782 unsigned Sign = Sel.getHiBits(1).getZExtValue();
7783
7784 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
7785 unsigned ByteStart = (Idx % 4) * 8;
7786 if (Sign)
7787 Src.setBit(ByteStart + 7);
7788 else
7789 Src.setBits(ByteStart, ByteStart + 8);
7790 }
7791
7792 return {DemandedLHS, DemandedRHS};
7793}
7794
// canonicalizePRMTInput: the declaration line is absent from this listing;
// the reference index later in this file gives it as
//   static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
7795 // Replace undef with 0 as this is easier for other optimizations such as
7796 // known bits.
// An empty SDValue is passed through as an empty SDValue.
7798 if (!Op)
7799 return SDValue();
// The replacement constant is an i32 zero built with a default-constructed
// SDLoc.
7800 if (Op.isUndef())
7801 return DAG.getConstant(0, SDLoc(), MVT::i32);
// Anything else is returned unchanged.
7802 return Op;
7803 }
7804
// simplifyDemandedBitsForPRMT: tries to replace a PRMT node with something
// simpler given which result bits are demanded. The first declaration line is
// absent from this listing; the reference index later in this file gives it as
//   static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT,
//                                              const APInt &DemandedBits,
//                                              SelectionDAG &DAG,
//                                              const TargetLowering &TLI,
//                                              unsigned Depth)
// Returns one of the inputs, a rebuilt PRMT, or an empty SDValue when no
// simplification applies.
7806 const APInt &DemandedBits,
7807 SelectionDAG &DAG,
7808 const TargetLowering &TLI,
7809 unsigned Depth) {
7810 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
7811 SDValue Op0 = PRMT.getOperand(0);
7812 SDValue Op1 = PRMT.getOperand(1);
// Only a constant selector can be analysed.
7813 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
7814 if (!SelectorConst)
7815 return SDValue();
7816
7817 unsigned Mode = PRMT.getConstantOperandVal(3);
7818 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
7819
7820 // Try to simplify the PRMT to one of the inputs if the used bytes are all
7821 // from the same input in the correct order.
// LeadingBytes counts whole high-order result bytes with no demanded bit;
// only the selector fields of the remaining low bytes (4 bits each) are
// compared. 0x3210 is the selector that copies operand 0 byte for byte and
// 0x7654 the one that copies operand 1.
// NOTE(review): when no bit is demanded SelBits is 0, so the first
// comparison presumably succeeds and Op0 is returned -- confirm.
7822 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
7823 const unsigned SelBits = (4 - LeadingBytes) * 4;
7824 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
7825 return Op0;
7826 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
7827 return Op1;
7828
7829 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
7830
7831 // Attempt to avoid multi-use ops if we don't need anything from them.
7832 SDValue DemandedOp0 =
7833 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
7834 SDValue DemandedOp1 =
7835 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
7836
// Undef replacements become constant zero; an empty SDValue stays empty and
// means "no simpler form was found for this operand".
7837 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
7838 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
// Rebuild the node only if at least one operand actually changed, keeping
// the original for whichever operand did not. Selector here is the value
// returned by getPRMTSelector above, passed as a plain integer.
7839 if ((DemandedOp0 && DemandedOp0 != Op0) ||
7840 (DemandedOp1 && DemandedOp1 != Op1)) {
7841 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
7842 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
7843 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
7844 }
7845
7846 return SDValue();
7847 }
7848
7850 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
7851 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
7852 Known.resetAll();
7853
7854 switch (Op.getOpcode()) {
7855 case NVPTXISD::PRMT:
7857 *this, Depth)) {
7858 TLO.CombineTo(Op, Result);
7859 return true;
7860 }
7861 break;
7862 default:
7863 break;
7864 }
7865
7866 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
7867 return false;
7868}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
static bool IsIndirectCall(const MachineInstr *MI)
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static SDValue reportInvalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG, unsigned Val)
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG)
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static SDValue PerformSELECTShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Transform patterns like: (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt)) (select (ult...
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static unsigned getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static unsigned getTcgen05LdRedID(Intrinsic::ID IID)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static SDValue combineMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static std::optional< unsigned > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
static bool isNonCoalescableBuildVector(const SDValue &BV)
Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent register pairs (non-coalescable...
static SDValue PerformScalarizeV2F32Op(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Scalarize a v2f32 arithmetic node (FADD, FMUL, FSUB, FMA) when at least one operand is a BUILD_VECTOR...
static bool isConstZero(const SDValue &Operand)
static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE)
static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG)
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static std::pair< MemSDNode *, uint32_t > convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
promoteScalarIntegerPTX - Used to make sure the arguments/returns are suitable for passing and promote ...
static std::optional< std::tuple< SDValue, SDValue, SDValue > > lowerTcgen05LdRed(SDNode *N, SelectionDAG &DAG)
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG)
static SDValue combineIntrinsicWOChain(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static SDValue combineF16AddWithNeg(SDNode *N, SelectionDAG &DAG, Intrinsic::ID AddIntrinsicID)
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1161
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:644
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:639
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the bit whose position is given by "BitPosition" to 1.
Definition APInt.h:1345
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:436
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1137
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:433
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:639
Module * getParent()
Get the module that this global value is contained inside of...
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
static constexpr unsigned NoRegister
Definition MCRegister.h:60
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:516
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const
bool hasUsedBytesMaskPragma() const
bool hasTensormapReplaceElemtypeSupport(unsigned value) const
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, std::optional< unsigned > FirstVAArg, const CallBase &CB, unsigned UniqueCallSite) const
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:140
Align getStackAlign() const
getStackAlign - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3166
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ POISON
POISON - A poison node.
Definition ISDOpcodes.h:236
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:386
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:241
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ VECREDUCE_FMINIMUM
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:257
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
MaybeAlign getAlign(const CallInst &I, unsigned Index)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2026
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
std::optional< SyncScope::ID > getAtomicSyncScopeID(const Instruction *I)
A helper function that returns an atomic operation's sync scope; returns std::nullopt if it is not an...
unsigned promoteScalarArgumentSize(unsigned size)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool shouldPassAsArray(Type *Ty)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os, -Oz
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isParamGridConstant(const Argument &Arg)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:197
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:249
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:235
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...