1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTXUtilities.h"
17#include "llvm/IR/GlobalValue.h"
19#include "llvm/IR/IntrinsicsNVPTX.h"
26
27using namespace llvm;
28
29#define DEBUG_TYPE "nvptx-isel"
30#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
31
32static cl::opt<bool>
33 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
34 cl::desc("Enable reciprocal sqrt optimization"));
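// Illustrative usage (assumption, not taken from this file): being a
// cl::opt, the flag above can be toggled on the llc command line, e.g.
//   llc -march=nvptx64 -mcpu=sm_70 -nvptx-rsqrt-approx-opt=false input.ll
// which disables the approximate-rsqrt selection gated by doRsqrtOpt() below.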
35
36/// createNVPTXISelDag - This pass converts a legalized DAG into a
37/// NVPTX-specific DAG, ready for instruction scheduling.
38FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
39 llvm::CodeGenOptLevel OptLevel) {
40 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
41}
42
43NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
44 CodeGenOptLevel OptLevel)
45 : SelectionDAGISelLegacy(
46 ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}
47
48char NVPTXDAGToDAGISelLegacy::ID = 0;
49
50INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
51
52NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
53 CodeGenOptLevel OptLevel)
54 : SelectionDAGISel(tm, OptLevel), TM(tm) {
55 doMulWide = (OptLevel > CodeGenOptLevel::None);
56}
57
58bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
59 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
60 Scopes = NVPTXScopes(MF.getFunction().getContext());
61 return SelectionDAGISel::runOnMachineFunction(MF);
62}
63
64int NVPTXDAGToDAGISel::getDivF32Level() const {
66}
67
68bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
70}
71
72bool NVPTXDAGToDAGISel::useF32FTZ() const {
74}
75
76bool NVPTXDAGToDAGISel::allowFMA() const {
77 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
78 return TL->allowFMA(*MF, OptLevel);
79}
80
81bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
82 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
83 return TL->allowUnsafeFPMath(*MF);
84}
85
86bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
87
88/// Select - Select instructions not customized! Used for
89/// expanded, promoted and normal instructions.
90void NVPTXDAGToDAGISel::Select(SDNode *N) {
91
92 if (N->isMachineOpcode()) {
93 N->setNodeId(-1);
94 return; // Already selected.
95 }
96
97 switch (N->getOpcode()) {
98 case ISD::LOAD:
100 if (tryLoad(N))
101 return;
102 break;
103 case ISD::STORE:
105 if (tryStore(N))
106 return;
107 break;
108 case ISD::ATOMIC_FENCE:
109 if (tryFence(N))
110 return;
111 break;
112 case ISD::EXTRACT_VECTOR_ELT:
113 if (tryEXTRACT_VECTOR_ELEMENT(N))
114 return;
115 break;
116 case NVPTXISD::SETP_F16X2:
117 SelectSETP_F16X2(N);
118 return;
119 case NVPTXISD::SETP_BF16X2:
120 SelectSETP_BF16X2(N);
121 return;
122 case NVPTXISD::LoadV2:
123 case NVPTXISD::LoadV4:
124 if (tryLoadVector(N))
125 return;
126 break;
127 case NVPTXISD::LDUV2:
128 case NVPTXISD::LDUV4:
129 if (tryLDGLDU(N))
130 return;
131 break;
132 case NVPTXISD::StoreV2:
133 case NVPTXISD::StoreV4:
134 if (tryStoreVector(N))
135 return;
136 break;
137 case NVPTXISD::LoadParam:
138 case NVPTXISD::LoadParamV2:
139 case NVPTXISD::LoadParamV4:
140 if (tryLoadParam(N))
141 return;
142 break;
143 case NVPTXISD::StoreRetval:
144 case NVPTXISD::StoreRetvalV2:
145 case NVPTXISD::StoreRetvalV4:
146 if (tryStoreRetval(N))
147 return;
148 break;
149 case NVPTXISD::StoreParam:
150 case NVPTXISD::StoreParamV2:
151 case NVPTXISD::StoreParamV4:
152 case NVPTXISD::StoreParamS32:
153 case NVPTXISD::StoreParamU32:
154 if (tryStoreParam(N))
155 return;
156 break;
157 case ISD::INTRINSIC_WO_CHAIN:
158 if (tryIntrinsicNoChain(N))
159 return;
160 break;
161 case ISD::INTRINSIC_W_CHAIN:
162 if (tryIntrinsicChain(N))
163 return;
164 break;
165 case ISD::INTRINSIC_VOID:
166 if (tryIntrinsicVoid(N))
167 return;
168 break;
169 case ISD::AND:
170 case ISD::SRA:
171 case ISD::SRL:
172 // Try to select BFE
173 if (tryBFE(N))
174 return;
175 break;
176 case ISD::ADDRSPACECAST:
177 SelectAddrSpaceCast(N);
178 return;
179 case ISD::CopyToReg: {
180 if (N->getOperand(1).getValueType() == MVT::i128) {
181 SelectV2I64toI128(N);
182 return;
183 }
184 break;
185 }
186 case ISD::CopyFromReg: {
187 if (N->getOperand(1).getValueType() == MVT::i128) {
188 SelectI128toV2I64(N);
189 return;
190 }
191 break;
192 }
193 default:
194 break;
195 }
196 SelectCode(N);
197}
198
199bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
200 unsigned IID = N->getConstantOperandVal(1);
201 switch (IID) {
202 default:
203 return false;
204 case Intrinsic::nvvm_ldu_global_f:
205 case Intrinsic::nvvm_ldu_global_i:
206 case Intrinsic::nvvm_ldu_global_p:
207 return tryLDGLDU(N);
208 }
209}
210
211// Map ISD::CONDCODE value to the appropriate CmpMode expected by
212// NVPTXInstPrinter::printCmpMode()
213static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
214 using NVPTX::PTXCmpMode::CmpMode;
215 unsigned PTXCmpMode = [](ISD::CondCode CC) {
216 switch (CC) {
217 default:
218 llvm_unreachable("Unexpected condition code.");
219 case ISD::SETOEQ:
220 return CmpMode::EQ;
221 case ISD::SETOGT:
222 return CmpMode::GT;
223 case ISD::SETOGE:
224 return CmpMode::GE;
225 case ISD::SETOLT:
226 return CmpMode::LT;
227 case ISD::SETOLE:
228 return CmpMode::LE;
229 case ISD::SETONE:
230 return CmpMode::NE;
231 case ISD::SETO:
232 return CmpMode::NUM;
233 case ISD::SETUO:
234 return CmpMode::NotANumber;
235 case ISD::SETUEQ:
236 return CmpMode::EQU;
237 case ISD::SETUGT:
238 return CmpMode::GTU;
239 case ISD::SETUGE:
240 return CmpMode::GEU;
241 case ISD::SETULT:
242 return CmpMode::LTU;
243 case ISD::SETULE:
244 return CmpMode::LEU;
245 case ISD::SETUNE:
246 return CmpMode::NEU;
247 case ISD::SETEQ:
248 return CmpMode::EQ;
249 case ISD::SETGT:
250 return CmpMode::GT;
251 case ISD::SETGE:
252 return CmpMode::GE;
253 case ISD::SETLT:
254 return CmpMode::LT;
255 case ISD::SETLE:
256 return CmpMode::LE;
257 case ISD::SETNE:
258 return CmpMode::NE;
259 }
260 }(CondCode.get());
261
262 if (FTZ)
263 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
264
265 return PTXCmpMode;
266}
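// Illustrative example (not from the original source): a setcc node with
// condition ISD::SETOLT selected with FTZ enabled yields CmpMode::LT with
// FTZ_FLAG or'ed in, which NVPTXInstPrinter::printCmpMode() is expected to
// render as the ".lt.ftz" suffix of the resulting setp instruction.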
267
268bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
269 unsigned PTXCmpMode =
270 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
271 SDLoc DL(N);
272 SDNode *SetP = CurDAG->getMachineNode(
273 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
274 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
275 ReplaceNode(N, SetP);
276 return true;
277}
278
279bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
280 unsigned PTXCmpMode =
281 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
282 SDLoc DL(N);
283 SDNode *SetP = CurDAG->getMachineNode(
284 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
285 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
286 ReplaceNode(N, SetP);
287 return true;
288}
289
290// Find all instances of extract_vector_elt that use this v2f16 vector
291// and coalesce them into a scattering move instruction.
292bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
293 SDValue Vector = N->getOperand(0);
294
295 // We only care about 16x2 as it's the only real vector type we
296 // need to deal with.
297 MVT VT = Vector.getSimpleValueType();
298 if (!Isv2x16VT(VT))
299 return false;
300 // Find and record all uses of this vector that extract element 0 or 1.
301 SmallVector<SDNode *, 4> E0, E1;
302 for (auto *U : Vector.getNode()->users()) {
303 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
304 continue;
305 if (U->getOperand(0) != Vector)
306 continue;
307 if (const ConstantSDNode *IdxConst =
308 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
309 if (IdxConst->getZExtValue() == 0)
310 E0.push_back(U);
311 else if (IdxConst->getZExtValue() == 1)
312 E1.push_back(U);
313 else
314 llvm_unreachable("Invalid vector index.");
315 }
316 }
317
318 // There's no point scattering f16x2 if we only ever access one
319 // element of it.
320 if (E0.empty() || E1.empty())
321 return false;
322
323 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
324 // into f16,f16 SplitF16x2(V)
325 MVT EltVT = VT.getVectorElementType();
326 SDNode *ScatterOp =
327 CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
328 for (auto *Node : E0)
329 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
330 for (auto *Node : E1)
331 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
332
333 return true;
334}
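// Illustrative DAG shape for the transformation above (assuming a v2f16
// value V with both lanes used):
//   f16 = extract_vector_elt V, 0
//   f16 = extract_vector_elt V, 1
// is replaced by a single two-result machine node
//   f16, f16 = I32toV2I16 V
// with lane-0 extracts rewired to result 0 and lane-1 extracts to result 1.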
335
336static unsigned int getCodeAddrSpace(MemSDNode *N) {
337 const Value *Src = N->getMemOperand()->getValue();
338
339 if (!Src)
340 return NVPTX::AddressSpace::Generic;
341
342 if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
343 switch (PT->getAddressSpace()) {
344 case llvm::ADDRESS_SPACE_LOCAL:
345 return NVPTX::AddressSpace::Local;
346 case llvm::ADDRESS_SPACE_GLOBAL:
347 return NVPTX::AddressSpace::Global;
348 case llvm::ADDRESS_SPACE_SHARED:
349 return NVPTX::AddressSpace::Shared;
350 case llvm::ADDRESS_SPACE_GENERIC:
351 return NVPTX::AddressSpace::Generic;
352 case llvm::ADDRESS_SPACE_PARAM:
353 return NVPTX::AddressSpace::Param;
354 case llvm::ADDRESS_SPACE_CONST:
355 return NVPTX::AddressSpace::Const;
356 default: break;
357 }
358 }
359 return NVPTX::AddressSpace::Generic;
360}
361
362namespace {
363
364struct OperationOrderings {
365 NVPTX::Ordering InstructionOrdering, FenceOrdering;
366 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
367 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
368 : InstructionOrdering(IO), FenceOrdering(FO) {}
369};
370
371static OperationOrderings
372getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
373 AtomicOrdering Ordering = N->getSuccessOrdering();
374 auto CodeAddrSpace = getCodeAddrSpace(N);
375
376 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
377 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
378
379 // clang-format off
380
381 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
382 // Note: uses of Relaxed in the Atomic column of this table refer
383 // to LLVM AtomicOrdering::Monotonic.
384 //
385 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
386 // |---------|----------|--------------------|------------|------------------------------|
387 // | No | No | All | plain | .weak |
388 // | No | Yes | Generic,Shared, | .volatile | .volatile |
389 // | | | Global [0] | | |
390 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
391 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
392 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
393 // | | | Global [0] | | |
394 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
395 // | | | Global [0] | | |
396 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
397 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
398 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
399 // | | | | | or .volatile (PTX 8.1-) |
400 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
401 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
402 // | | | / Global [0] | | |
403
404 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
405 // by following the ABI proven sound in:
406 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
407 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
408 //
409 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
410 // |------------------------------------------------------|-------------------------------|
411 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
412 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
413 // |------------------------------------------------------|-------------------------------|
414 // | cuda::atomic_load | fence.sc.<scope>; |
415 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
416 // |------------------------------------------------------|-------------------------------|
417 // | cuda::atomic_store | fence.sc.<scope>; |
418 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
419 // |------------------------------------------------------|-------------------------------|
420 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
421 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
422
423 // clang-format on
424
425 // [0]: volatile and atomics are only supported on global or shared
426 // memory locations, accessed via generic/shared/global pointers.
427 // MMIO is only supported on global memory locations,
428 // accessed via generic/global pointers.
429 // TODO: Implement MMIO access via generic pointer to global.
430 // Currently implemented for global pointers only.
431
432 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
433 // PTX instructions fails to preserve their C++ side-effects.
434 //
435 // Example (https://github.com/llvm/llvm-project/issues/62057):
436 //
437 // void example() {
438 // std::atomic<bool> True = true;
439 // while (True.load(std::memory_order_relaxed));
440 // }
441 //
442 // A C++ program that calls "example" is well-defined: the infinite loop
443 // performs an atomic operation. By lowering volatile/atomics to
444 // "weak" memory operations, we are transforming the above into:
445 //
446 // void undefined_behavior() {
447 // bool True = true;
448 // while (True);
449 // }
450 //
451 // which exhibits undefined behavior in both C++ and PTX.
452 //
453 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
454 // behavior due to lack of Independent Forward Progress. Lowering these
455 // to weak memory operations in sm_60- is therefore fine.
456 //
457 // TODO: lower atomic and volatile operations to memory locations
458 // in local, const, and param to two PTX instructions in sm_70+:
459 // - the "weak" memory instruction we are currently lowering to, and
460 // - some other instruction that preserves the side-effect, e.g.,
461 // a dead dummy volatile load.
462 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
463 CodeAddrSpace == NVPTX::AddressSpace::Const ||
464 CodeAddrSpace == NVPTX::AddressSpace::Param) {
465 return NVPTX::Ordering::NotAtomic;
466 }
467
468 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
469 // supported on sm_60 and older; this includes volatile atomics.
470 if (!(Ordering == AtomicOrdering::NotAtomic ||
471 Ordering == AtomicOrdering::Unordered ||
472 Ordering == AtomicOrdering::Monotonic) &&
473 !HasMemoryOrdering) {
474 report_fatal_error(
475 formatv("PTX does not support \"atomic\" for orderings different than"
476 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
477 "is: \"{}\".",
478 toIRString(Ordering)));
479 }
480
481 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
482 // the volatile semantics and preserve the atomic ones.
483
484 // PTX volatile and PTX atomics are not available for statespace that differ
485 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
486 // atomics is undefined if the generic address does not refer to a .global or
487 // .shared memory location.
488 bool AddrGenericOrGlobalOrShared =
489 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
490 CodeAddrSpace == NVPTX::AddressSpace::Global ||
491 CodeAddrSpace == NVPTX::AddressSpace::Shared);
492 if (!AddrGenericOrGlobalOrShared)
493 return NVPTX::Ordering::NotAtomic;
494
495 bool UseRelaxedMMIO =
496 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
497
498 switch (Ordering) {
499 case AtomicOrdering::NotAtomic:
500 return N->isVolatile() ? NVPTX::Ordering::Volatile
501 : NVPTX::Ordering::NotAtomic;
502 case AtomicOrdering::Unordered:
503 // We lower unordered in the exact same way as 'monotonic' to respect
504 // LLVM IR atomicity requirements.
505 case AtomicOrdering::Monotonic:
506 if (N->isVolatile())
507 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
508 : NVPTX::Ordering::Volatile;
509 else
510 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
511 : NVPTX::Ordering::Volatile;
512 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
513 // Acquire.
514 case AtomicOrdering::Acquire:
515 if (!N->readMem())
516 report_fatal_error(
517 formatv("PTX only supports Acquire Ordering on reads: {}",
518 N->getOperationName()));
519 return NVPTX::Ordering::Acquire;
520 case AtomicOrdering::Release:
521 if (!N->writeMem())
522 report_fatal_error(
523 formatv("PTX only supports Release Ordering on writes: {}",
524 N->getOperationName()));
525 return NVPTX::Ordering::Release;
526 case AtomicOrdering::AcquireRelease: {
527 report_fatal_error(
528 formatv("NVPTX does not support AcquireRelease Ordering on "
529 "read-modify-write "
530 "yet and PTX does not support it on loads or stores: {}",
531 N->getOperationName()));
532 }
533 case AtomicOrdering::SequentiallyConsistent: {
534 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
535 // sequence including a "fence.sc.sco" and the memory instruction with an
536 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
537 // whether the memory operation is a read, write, or read-modify-write.
538 //
539 // This sets the ordering of the fence to SequentiallyConsistent, and
540 // sets the corresponding ordering for the instruction.
541 NVPTX::Ordering InstrOrder;
542 if (N->readMem())
543 InstrOrder = NVPTX::Ordering::Acquire;
544 else if (N->writeMem())
545 InstrOrder = NVPTX::Ordering::Release;
546 else
547 report_fatal_error(
548 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
549 "read-modify-writes yet: {}",
550 N->getOperationName()));
551 return OperationOrderings(InstrOrder,
552 NVPTX::Ordering::SequentiallyConsistent);
553 }
554 }
555 report_fatal_error(
556 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
557 toIRString(Ordering)));
558}
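// Worked example of the mapping above (illustrative; assumes sm_70+ and a
// recent PTX version): an IR load such as
//   %v = load atomic i32, ptr addrspace(1) %p seq_cst, align 4
// gets FenceOrdering = SequentiallyConsistent and InstructionOrdering =
// Acquire, i.e. it is emitted roughly as
//   fence.sc.sys;
//   ld.acquire.sys.global.u32 %r, [%p];
// On sm_60 and older the same IR hits the report_fatal_error above, since
// PTX memory ordering is unavailable there.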
559
560} // namespace
561
562NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
563 NVPTX::Ordering O) const {
564 switch (O) {
565 case NVPTX::Ordering::NotAtomic:
566 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
567 // NVPTX uses Thread scope as the scope of non-atomic operations.
568 return NVPTX::Scope::Thread;
569 case NVPTX::Ordering::RelaxedMMIO:
570 // RelaxedMMIO operations are always system scope.
571 // If a RelaxedMMIO order was generated from an atomic volatile operation
572 // with a smaller thread scope, we bump it here to system scope.
573 return NVPTX::Scope::System;
574 case NVPTX::Ordering::Relaxed:
575 case NVPTX::Ordering::Acquire:
576 case NVPTX::Ordering::Release:
577 case NVPTX::Ordering::AcquireRelease:
578 case NVPTX::Ordering::SequentiallyConsistent: {
579 auto S = Scopes[N->getSyncScopeID()];
580
581 // Atomic operations must have a scope greater than thread.
582 if (S == NVPTX::Scope::Thread)
583 report_fatal_error(
584 formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
585
586 // If scope is cluster, clusters must be supported.
587 if (S == NVPTX::Scope::Cluster)
588 Subtarget->failIfClustersUnsupported("cluster scope");
589
590 // If operation is volatile, then its scope is system.
591 return N->isVolatile() ? NVPTX::Scope::System : S;
592 }
593 llvm_unreachable("unhandled ordering");
594}
595
596static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
597 unsigned CodeAddrSpace, MachineFunction *F) {
598 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
599 // space.
600 //
601 // We have two ways of identifying invariant loads: Loads may be explicitly
602 // marked as invariant, or we may infer them to be invariant.
603 //
604 // We currently infer invariance for loads from
605 // - constant global variables, and
606 // - kernel function pointer params that are noalias (i.e. __restrict) and
607 // never written to.
608 //
609 // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
610 // not during the SelectionDAG phase).
611 //
612 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
613 // explicitly invariant loads because these are how clang tells us to use ldg
614 // when the user uses a builtin.
615 if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
616 return false;
617
618 if (N->isInvariant())
619 return true;
620
621 bool IsKernelFn = isKernelFunction(F->getFunction());
622
623 // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
624 // because the former looks through phi nodes while the latter does not. We
625 // need to look through phi nodes to handle pointer induction variables.
626 SmallVector<const Value *, 8> Objs;
627 getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
628
629 return all_of(Objs, [&](const Value *V) {
630 if (auto *A = dyn_cast<const Argument>(V))
631 return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
632 if (auto *GV = dyn_cast<const GlobalVariable>(V))
633 return GV->isConstant();
634 return false;
635 });
636}
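// Illustrative CUDA-level example of the inference above (assumed source,
// not from this file): in a kernel like
//   __global__ void k(const float *__restrict__ in, float *out) {
//     out[threadIdx.x] = in[threadIdx.x];
//   }
// the read of `in` comes from a kernel parameter that is noalias and only
// read, so it satisfies the checks above and is selected as ld.global.nc
// (LDG) even without explicit !invariant.load metadata.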
637
638static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
639 NVPTXSubtarget const *T) {
640 if (S == NVPTX::Scope::Cluster)
641 T->failIfClustersUnsupported(".cluster scope fence");
642
643 switch (O) {
644 case NVPTX::Ordering::Acquire:
645 case NVPTX::Ordering::Release:
646 case NVPTX::Ordering::AcquireRelease: {
647 switch (S) {
648 case NVPTX::Scope::System:
649 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
650 : NVPTX::INT_MEMBAR_SYS;
651 case NVPTX::Scope::Block:
652 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
653 : NVPTX::INT_MEMBAR_CTA;
654 case NVPTX::Scope::Cluster:
655 return NVPTX::atomic_thread_fence_acq_rel_cluster;
656 case NVPTX::Scope::Device:
657 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
658 : NVPTX::INT_MEMBAR_GL;
659 case NVPTX::Scope::Thread:
660 report_fatal_error(
661 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
662 ScopeToString(S)));
663 }
664 break;
665 }
666 case NVPTX::Ordering::SequentiallyConsistent: {
667 switch (S) {
668 case NVPTX::Scope::System:
669 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
670 : NVPTX::INT_MEMBAR_SYS;
671 case NVPTX::Scope::Block:
672 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
673 : NVPTX::INT_MEMBAR_CTA;
674 case NVPTX::Scope::Cluster:
675 return NVPTX::atomic_thread_fence_seq_cst_cluster;
676 case NVPTX::Scope::Device:
677 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
678 : NVPTX::INT_MEMBAR_GL;
679 case NVPTX::Scope::Thread:
680 report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
681 ScopeToString(S)));
682 }
683 break;
684 }
685 case NVPTX::Ordering::NotAtomic:
686 case NVPTX::Ordering::Relaxed:
687 case NVPTX::Ordering::Volatile:
688 case NVPTX::Ordering::RelaxedMMIO:
689 report_fatal_error(
690 formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
691 OrderingToString(O), ScopeToString(S)));
692 }
693 llvm_unreachable("unhandled ordering");
694}
695
696// Returns Memory Order and Scope of a memory instruction, and
697// inserts any fence before the instruction that's required to
698// implement its memory ordering.
699std::pair<NVPTX::Ordering, NVPTX::Scope>
700NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
701 MemSDNode *N) {
702 auto [InstructionOrdering, FenceOrdering] =
703 getOperationOrderings(N, Subtarget);
704 auto Scope = getOperationScope(N, InstructionOrdering);
705
706 // If a fence is required before the operation, insert it:
707 switch (NVPTX::Ordering(FenceOrdering)) {
708 case NVPTX::Ordering::NotAtomic:
709 break;
710 case NVPTX::Ordering::SequentiallyConsistent: {
711 auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
712 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
713 break;
714 }
715 default:
716 report_fatal_error(
717 formatv("Unexpected fence ordering: \"{}\".",
718 OrderingToString(NVPTX::Ordering(FenceOrdering))));
719 }
720 return {InstructionOrdering, Scope};
721}
722
723bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
724 unsigned IID = N->getConstantOperandVal(0);
725 switch (IID) {
726 default:
727 return false;
728 case Intrinsic::nvvm_texsurf_handle_internal:
729 SelectTexSurfHandle(N);
730 return true;
731 }
732}
733
734void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
735 // Op 0 is the intrinsic ID
736 SDValue Wrapper = N->getOperand(1);
737 SDValue GlobalVal = Wrapper.getOperand(0);
738 ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
739 MVT::i64, GlobalVal));
740}
741
742void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
743 SDValue Src = N->getOperand(0);
744 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
745 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
746 unsigned DstAddrSpace = CastN->getDestAddressSpace();
747 SDLoc DL(N);
748 assert(SrcAddrSpace != DstAddrSpace &&
749 "addrspacecast must be between different address spaces");
750
751 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
752 // Specific to generic
753
754 if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
755 SDValue CvtNone =
756 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
757 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
758 Src, CvtNone);
759 Src = SDValue(Cvt, 0);
760 }
761
762 unsigned Opc;
763 switch (SrcAddrSpace) {
764 default: report_fatal_error("Bad address space in addrspacecast");
765 case ADDRESS_SPACE_GLOBAL:
766 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
767 break;
768 case ADDRESS_SPACE_SHARED:
769 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
770 break;
771 case ADDRESS_SPACE_CONST:
772 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
773 break;
774 case ADDRESS_SPACE_LOCAL:
775 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
776 break;
777 }
778 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
779 return;
780 } else {
781 // Generic to specific
782 if (SrcAddrSpace != 0)
783 report_fatal_error("Cannot cast between two non-generic address spaces");
784 unsigned Opc;
785 switch (DstAddrSpace) {
786 default: report_fatal_error("Bad address space in addrspacecast");
787 case ADDRESS_SPACE_GLOBAL:
788 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
789 break;
790 case ADDRESS_SPACE_SHARED:
791 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
792 break;
793 case ADDRESS_SPACE_CONST:
794 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
795 break;
796 case ADDRESS_SPACE_LOCAL:
797 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
798 break;
799 case ADDRESS_SPACE_PARAM:
800 Opc = TM.is64Bit() ? NVPTX::IMOV64rr : NVPTX::IMOV32rr;
801 break;
802 }
803
804 SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
805 if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
806 SDValue CvtNone =
807 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
808 CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
809 SDValue(CVTA, 0), CvtNone);
810 }
811
812 ReplaceNode(N, CVTA);
813 return;
814 }
815}
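// Illustrative example: an IR cast from shared to generic such as
//   %g = addrspacecast ptr addrspace(3) %s to ptr
// selects cvta_shared (cvta_shared_64 for 64-bit), while the opposite
// direction, generic to shared, selects cvta_to_shared(_64). Casts between
// two non-generic address spaces are rejected with a fatal error above.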
816
817// Helper function template to reduce amount of boilerplate code for
818// opcode selection.
819static std::optional<unsigned>
820pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
821 unsigned Opcode_i16, unsigned Opcode_i32,
822 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
823 std::optional<unsigned> Opcode_f64) {
824 switch (VT) {
825 case MVT::i1:
826 case MVT::i8:
827 return Opcode_i8;
828 case MVT::i16:
829 return Opcode_i16;
830 case MVT::i32:
831 return Opcode_i32;
832 case MVT::i64:
833 return Opcode_i64;
834 case MVT::f16:
835 case MVT::bf16:
836 return Opcode_i16;
837 case MVT::v2f16:
838 case MVT::v2bf16:
839 case MVT::v2i16:
840 case MVT::v4i8:
841 return Opcode_i32;
842 case MVT::f32:
843 return Opcode_f32;
844 case MVT::f64:
845 return Opcode_f64;
846 default:
847 return std::nullopt;
848 }
849}
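// Illustrative use of the helper above: for a v2f16 access,
// pickOpcodeForVT(MVT::v2f16, LD_i8_avar, LD_i16_avar, LD_i32_avar, ...)
// returns the i32 variant, since packed 16-bit pairs are moved as untyped
// 32-bit values; a type with no matching case (e.g. MVT::i128) returns
// std::nullopt and the caller bails out of custom selection.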
850
851static int getLdStRegType(EVT VT) {
852 if (VT.isFloatingPoint())
853 switch (VT.getSimpleVT().SimpleTy) {
854 case MVT::f16:
855 case MVT::bf16:
856 case MVT::v2f16:
857 case MVT::v2bf16:
858 return NVPTX::PTXLdStInstCode::Untyped;
859 default:
860 return NVPTX::PTXLdStInstCode::Float;
861 }
862 else
863 return NVPTX::PTXLdStInstCode::Unsigned;
864}
865
866bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
867 MemSDNode *LD = cast<MemSDNode>(N);
868 assert(LD->readMem() && "Expected load");
869
870 // do not support pre/post inc/dec
871 LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
872 if (PlainLoad && PlainLoad->isIndexed())
873 return false;
874
875 EVT LoadedVT = LD->getMemoryVT();
876 if (!LoadedVT.isSimple())
877 return false;
878
879 // Address Space Setting
880 unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
881 if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
882 return tryLDGLDU(N);
883 }
884 unsigned int PointerSize =
885 CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
886
887 SDLoc DL(N);
888 SDValue Chain = N->getOperand(0);
889 auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
890
891 // Type Setting: fromType + fromTypeWidth
892 //
893 // Sign : ISD::SEXTLOAD
894 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
895 // type is integer
896 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
897 MVT SimpleVT = LoadedVT.getSimpleVT();
898 MVT ScalarVT = SimpleVT.getScalarType();
899 // Read at least 8 bits (predicates are stored as 8-bit values)
900 unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
901 unsigned int FromType;
902
903 // Vector Setting
904 unsigned int VecType = NVPTX::PTXLdStInstCode::Scalar;
905 if (SimpleVT.isVector()) {
906 assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
907 "Unexpected vector type");
908 // v2f16/v2bf16/v2i16 is loaded using ld.b32
909 FromTypeWidth = 32;
910 }
911
912 if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
913 FromType = NVPTX::PTXLdStInstCode::Signed;
914 else
915 FromType = getLdStRegType(ScalarVT);
916
917 // Create the machine instruction DAG
918 SDValue N1 = N->getOperand(1);
919 SDValue Addr;
920 SDValue Offset, Base;
921 std::optional<unsigned> Opcode;
922 MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
923
924 SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
925 getI32Imm(CodeAddrSpace, DL),
926 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
927 getI32Imm(FromTypeWidth, DL)});
928
929 if (SelectDirectAddr(N1, Addr)) {
930 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
931 NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
932 NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
933 if (!Opcode)
934 return false;
935 Ops.append({Addr, Chain});
936 } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
937 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
938 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
939 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
940 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
941 if (!Opcode)
942 return false;
943 Ops.append({Base, Offset, Chain});
944 } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
945 : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
946 if (PointerSize == 64)
947 Opcode =
948 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
949 NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
950 NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
951 else
952 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
953 NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
954 NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
955 if (!Opcode)
956 return false;
957 Ops.append({Base, Offset, Chain});
958 } else {
959 if (PointerSize == 64)
960 Opcode =
961 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
962 NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
963 NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
964 else
965 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
966 NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
967 NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
968 if (!Opcode)
969 return false;
970 Ops.append({N1, Chain});
971 }
972
973 SDNode *NVPTXLD =
974 CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
975 if (!NVPTXLD)
976 return false;
977
978 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
979 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
980
981 ReplaceNode(N, NVPTXLD);
982 return true;
983}
984
985static bool isVectorElementTypeUpsized(EVT EltVT) {
986 // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
987 // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
988 // vectorized loads/stores with the actual element type for i8/i16 as that
989 // would require v8/v16 variants that do not exist.
990 // In order to load/store such vectors efficiently, in Type Legalization
991 // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
992 // lower to PTX as vectors of b32.
993 return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
994}
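// Illustrative example: a <8 x i8> access is legalized into word-sized v4i8
// chunks; when such a chunk reaches tryLoadVector/tryStoreVector below, the
// check above makes it be treated as a single b32 element instead of
// requiring a non-existent v8/v16 PTX vector form.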
995
996bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
997 MemSDNode *MemSD = cast<MemSDNode>(N);
998 EVT LoadedVT = MemSD->getMemoryVT();
999 if (!LoadedVT.isSimple())
1000 return false;
1001
1002 // Address Space Setting
1003 unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
1004 if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1005 return tryLDGLDU(N);
1006 }
1007 unsigned int PointerSize =
1008 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1009
1010 SDLoc DL(N);
1011 SDValue Chain = N->getOperand(0);
1012 auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
1013
1014 // Vector Setting
1015 MVT SimpleVT = LoadedVT.getSimpleVT();
1016
1017 // Type Setting: fromType + fromTypeWidth
1018 //
1019 // Sign : ISD::SEXTLOAD
1020 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1021 // type is integer
1022 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1023 MVT ScalarVT = SimpleVT.getScalarType();
1024 // Read at least 8 bits (predicates are stored as 8-bit values)
1025 unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
1026 unsigned int FromType;
1027 // The last operand holds the original LoadSDNode::getExtensionType() value
1028 unsigned ExtensionType = cast<ConstantSDNode>(
1029 N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1030 if (ExtensionType == ISD::SEXTLOAD)
1031 FromType = NVPTX::PTXLdStInstCode::Signed;
1032 else
1033 FromType = getLdStRegType(ScalarVT);
1034
1035 unsigned VecType;
1036
1037 switch (N->getOpcode()) {
1038 case NVPTXISD::LoadV2:
1039 VecType = NVPTX::PTXLdStInstCode::V2;
1040 break;
1041 case NVPTXISD::LoadV4:
1042 VecType = NVPTX::PTXLdStInstCode::V4;
1043 break;
1044 default:
1045 return false;
1046 }
1047
1048 EVT EltVT = N->getValueType(0);
1049
1050 if (isVectorElementTypeUpsized(EltVT)) {
1051 EltVT = MVT::i32;
1053 FromTypeWidth = 32;
1054 }
1055
1056 SDValue Op1 = N->getOperand(1);
1057 SDValue Addr, Offset, Base;
1058 std::optional<unsigned> Opcode;
1059 SDNode *LD;
1060
1061 SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1062 getI32Imm(CodeAddrSpace, DL),
1063 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1064 getI32Imm(FromTypeWidth, DL)});
1065
1066 if (SelectDirectAddr(Op1, Addr)) {
1067 switch (N->getOpcode()) {
1068 default:
1069 return false;
1070 case NVPTXISD::LoadV2:
1071 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1072 NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1073 NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1074 NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1075 break;
1076 case NVPTXISD::LoadV4:
1077 Opcode =
1078 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1079 NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
1080 std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
1081 break;
1082 }
1083 if (!Opcode)
1084 return false;
1085 Ops.append({Addr, Chain});
1086 } else if (PointerSize == 64
1087 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
1088 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1089 switch (N->getOpcode()) {
1090 default:
1091 return false;
1092 case NVPTXISD::LoadV2:
1093 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1094 NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1095 NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1096 NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1097 break;
1098 case NVPTXISD::LoadV4:
1099 Opcode =
1100 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1101 NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
1102 std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
1103 break;
1104 }
1105 if (!Opcode)
1106 return false;
1107 Ops.append({Base, Offset, Chain});
1108 } else if (PointerSize == 64
1109 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1110 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1111 if (PointerSize == 64) {
1112 switch (N->getOpcode()) {
1113 default:
1114 return false;
1115 case NVPTXISD::LoadV2:
1116 Opcode =
1117 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1118 NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
1119 NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
1120 NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
1121 break;
1122 case NVPTXISD::LoadV4:
1123 Opcode = pickOpcodeForVT(
1124 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1125 NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
1126 NVPTX::LDV_f32_v4_ari_64, std::nullopt);
1127 break;
1128 }
1129 } else {
1130 switch (N->getOpcode()) {
1131 default:
1132 return false;
1133 case NVPTXISD::LoadV2:
1134 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1135 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1136 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1137 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1138 break;
1139 case NVPTXISD::LoadV4:
1140 Opcode =
1141 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1142 NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
1143 std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
1144 break;
1145 }
1146 }
1147 if (!Opcode)
1148 return false;
1149 Ops.append({Base, Offset, Chain});
1150 } else {
1151 if (PointerSize == 64) {
1152 switch (N->getOpcode()) {
1153 default:
1154 return false;
1155 case NVPTXISD::LoadV2:
1156 Opcode = pickOpcodeForVT(
1157 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1158 NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1159 NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1160 NVPTX::LDV_f64_v2_areg_64);
1161 break;
1162 case NVPTXISD::LoadV4:
1163 Opcode = pickOpcodeForVT(
1164 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1165 NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
1166 NVPTX::LDV_f32_v4_areg_64, std::nullopt);
1167 break;
1168 }
1169 } else {
1170 switch (N->getOpcode()) {
1171 default:
1172 return false;
1173 case NVPTXISD::LoadV2:
1174 Opcode =
1175 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1176 NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1177 NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
1178 NVPTX::LDV_f64_v2_areg);
1179 break;
1180 case NVPTXISD::LoadV4:
1181 Opcode =
1182 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1183 NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
1184 std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
1185 break;
1186 }
1187 }
1188 if (!Opcode)
1189 return false;
1190 Ops.append({Op1, Chain});
1191 }
1192 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1193
1194 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1195 CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1196
1197 ReplaceNode(N, LD);
1198 return true;
1199}
1200
1201bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1202 auto *Mem = cast<MemSDNode>(N);
1203
1204 // If this is an LDG intrinsic, the address is the third operand. If it's an
1205 // LDG/LDU SD node (from custom vector handling), then it's the second operand.
1206 SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1207
1208 EVT OrigType = N->getValueType(0);
1209 EVT EltVT = Mem->getMemoryVT();
1210 unsigned NumElts = 1;
1211 if (EltVT.isVector()) {
1212 NumElts = EltVT.getVectorNumElements();
1213 EltVT = EltVT.getVectorElementType();
1214 // vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16
1215 // elements.
1216 if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
1217 (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
1218 (EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
1219 (EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
1220 assert(NumElts % OrigType.getVectorNumElements() == 0 &&
1221 "NumElts must be divisible by the number of elts in subvectors");
1222 EltVT = OrigType;
1223 NumElts /= OrigType.getVectorNumElements();
1224 }
1225 }
1226
1227 // Build the "promoted" result VTList for the load. If we are really loading
1228 // i8s, then the return type will be promoted to i16 since we do not expose
1229 // 8-bit registers in NVPTX.
1230 EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1231 SmallVector<EVT, 5> InstVTs;
1232 for (unsigned i = 0; i != NumElts; ++i) {
1233 InstVTs.push_back(NodeVT);
1234 }
1235 InstVTs.push_back(MVT::Other);
1236 SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1237 SDValue Chain = N->getOperand(0);
1238
1239 std::optional<unsigned> Opcode;
1240 SDLoc DL(N);
1241 SDNode *LD;
1243
1244 if (SelectDirectAddr(Op1, Addr)) {
1245 switch (N->getOpcode()) {
1246 default:
1247 return false;
1248 case ISD::LOAD:
1249 Opcode = pickOpcodeForVT(
1250 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1251 NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1252 NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1253 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1254 break;
1255 case ISD::INTRINSIC_W_CHAIN:
1256 Opcode = pickOpcodeForVT(
1257 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1258 NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1259 NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1260 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1261 break;
1262 case NVPTXISD::LoadV2:
1263 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1264 NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1265 NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1266 NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1267 NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1268 NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1269 NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1270 break;
1271 case NVPTXISD::LDUV2:
1272 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1273 NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1274 NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1275 NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1276 NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1277 NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1278 NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1279 break;
1280 case NVPTXISD::LoadV4:
1281 Opcode = pickOpcodeForVT(
1282 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1283 NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1284 NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1285 NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1286 break;
1287 case NVPTXISD::LDUV4:
1288 Opcode = pickOpcodeForVT(
1289 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1290 NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1291 NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1292 NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1293 break;
1294 }
1295 if (!Opcode)
1296 return false;
1297 SDValue Ops[] = { Addr, Chain };
1298 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1299 } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1300 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1301 if (TM.is64Bit()) {
1302 switch (N->getOpcode()) {
1303 default:
1304 return false;
1305 case ISD::LOAD:
1306 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1307 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1308 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1309 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1310 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1311 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1312 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1313 break;
1314 case ISD::INTRINSIC_W_CHAIN:
1315 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1316 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1317 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1318 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1319 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1320 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1321 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1322 break;
1323 case NVPTXISD::LoadV2:
1324 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1325 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1326 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1327 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1328 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1329 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1330 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1331 break;
1332 case NVPTXISD::LDUV2:
1333 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1334 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1335 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1336 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1337 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1338 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1339 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1340 break;
1341 case NVPTXISD::LoadV4:
1342 Opcode = pickOpcodeForVT(
1343 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1344 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1345 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1346 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1347 break;
1348 case NVPTXISD::LDUV4:
1349 Opcode = pickOpcodeForVT(
1350 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1351 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1352 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1353 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1354 break;
1355 }
1356 } else {
1357 switch (N->getOpcode()) {
1358 default:
1359 return false;
1360 case ISD::LOAD:
1361 Opcode = pickOpcodeForVT(
1362 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1363 NVPTX::INT_PTX_LDG_GLOBAL_i16ari, NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1364 NVPTX::INT_PTX_LDG_GLOBAL_i64ari, NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1365 NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1366 break;
1367 case ISD::INTRINSIC_W_CHAIN:
1368 Opcode = pickOpcodeForVT(
1369 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1370 NVPTX::INT_PTX_LDU_GLOBAL_i16ari, NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1371 NVPTX::INT_PTX_LDU_GLOBAL_i64ari, NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1372 NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1373 break;
1374 case NVPTXISD::LoadV2:
1375 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1376 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1377 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1378 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1379 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1380 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1381 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1382 break;
1383 case NVPTXISD::LDUV2:
1384 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1385 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1386 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1387 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1388 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1389 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1390 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1391 break;
1392 case NVPTXISD::LoadV4:
1393 Opcode = pickOpcodeForVT(
1394 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1395 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1396 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1397 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1398 break;
1399 case NVPTXISD::LDUV4:
1400 Opcode = pickOpcodeForVT(
1401 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1402 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1403 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1404 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1405 break;
1406 }
1407 }
1408 if (!Opcode)
1409 return false;
1410 SDValue Ops[] = {Base, Offset, Chain};
1411 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1412 } else {
1413 if (TM.is64Bit()) {
1414 switch (N->getOpcode()) {
1415 default:
1416 return false;
1417 case ISD::LOAD:
1418 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1419 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1420 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1421 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1422 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1423 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1424 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1425 break;
1426 case ISD::INTRINSIC_W_CHAIN:
1427 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1428 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1429 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1430 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1431 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1432 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1433 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1434 break;
1435 case NVPTXISD::LoadV2:
1436 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1437 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1438 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1439 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1440 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1441 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1442 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1443 break;
1444 case NVPTXISD::LDUV2:
1445 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1446 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1447 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1448 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1449 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1450 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1451 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1452 break;
1453 case NVPTXISD::LoadV4:
1454 Opcode = pickOpcodeForVT(
1455 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1456 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1457 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1458 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1459 break;
1460 case NVPTXISD::LDUV4:
1461 Opcode = pickOpcodeForVT(
1462 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1463 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1464 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1465 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1466 break;
1467 }
1468 } else {
1469 switch (N->getOpcode()) {
1470 default:
1471 return false;
1472 case ISD::LOAD:
1473 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1474 NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1475 NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1476 NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1477 NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1478 NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1479 NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1480 break;
1481 case ISD::INTRINSIC_W_CHAIN:
1482 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1483 NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1484 NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1485 NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1486 NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1487 NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1488 NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1489 break;
1490 case NVPTXISD::LoadV2:
1491 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1492 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1493 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1494 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1495 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1496 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1497 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1498 break;
1499 case NVPTXISD::LDUV2:
1500 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1501 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1502 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1503 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1504 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1505 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1506 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1507 break;
1508 case NVPTXISD::LoadV4:
1509 Opcode = pickOpcodeForVT(
1510 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1511 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1512 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1513 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1514 break;
1515 case NVPTXISD::LDUV4:
1516 Opcode = pickOpcodeForVT(
1517 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1518 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1519 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1520 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1521 break;
1522 }
1523 }
1524 if (!Opcode)
1525 return false;
1526 SDValue Ops[] = { Op1, Chain };
1527 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1528 }
1529
1530 // For automatic generation of LDG (through SelectLoad[Vector], not the
1531 // intrinsics), we may have an extending load like:
1532 //
1533 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1534 //
1535 // In this case, the matching logic above will select a load for the original
1536 // memory type (in this case, i8) and our types will not match (the node needs
1537 // to return an i32 in this case). Our LDG/LDU nodes do not support the
1538 // concept of sign-/zero-extension, so emulate it here by adding an explicit
1539 // CVT instruction. Ptxas should clean up any redundancies here.
1540
1541 LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1542
1543 if (OrigType != EltVT &&
1544 (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
1545 // We have an extending-load. The instruction we selected operates on the
1546 // smaller type, but the SDNode we are replacing has the larger type. We
1547 // need to emit a CVT to make the types match.
1548 unsigned CvtOpc =
1549 GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
1550
1551 // For each output value, apply the manual sign/zero-extension and make sure
1552 // all users of the load go through that CVT.
1553 for (unsigned i = 0; i != NumElts; ++i) {
1554 SDValue Res(LD, i);
1555 SDValue OrigVal(N, i);
1556
1557 SDNode *CvtNode =
1558 CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1559 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1560 DL, MVT::i32));
1561 ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1562 }
1563 }
1564
1565 ReplaceNode(N, LD);
1566 return true;
1567}
1568
1569bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1570 MemSDNode *ST = cast<MemSDNode>(N);
1571 assert(ST->writeMem() && "Expected store");
1572 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1573 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1574 assert((PlainStore || AtomicStore) && "Expected store");
1575
1576 // do not support pre/post inc/dec
1577 if (PlainStore && PlainStore->isIndexed())
1578 return false;
1579
1580 EVT StoreVT = ST->getMemoryVT();
1581 if (!StoreVT.isSimple())
1582 return false;
1583
1584 // Address Space Setting
1585 unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1586 unsigned int PointerSize =
1587 CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1588
1589 SDLoc DL(N);
1590 SDValue Chain = ST->getChain();
1591 auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1592
1593 // Vector Setting
1594 MVT SimpleVT = StoreVT.getSimpleVT();
1595 unsigned int VecType = NVPTX::PTXLdStInstCode::Scalar;
1596
1597 // Type Setting: toType + toTypeWidth
1598 // - for integer type, always use 'u'
1599 MVT ScalarVT = SimpleVT.getScalarType();
1600 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1601 if (SimpleVT.isVector()) {
1602 assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
1603 "Unexpected vector type");
1604 // v2x16 is stored using st.b32
1605 ToTypeWidth = 32;
1606 }
1607
1608 unsigned int ToType = getLdStRegType(ScalarVT);
1609
1610 // Create the machine instruction DAG
1611 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1612 SDValue BasePtr = ST->getBasePtr();
1613 SDValue Addr;
1614 SDValue Offset, Base;
1615 std::optional<unsigned> Opcode;
1616 MVT::SimpleValueType SourceVT =
1617 Value.getNode()->getSimpleValueType(0).SimpleTy;
1618
1619 SmallVector<SDValue, 12> Ops(
1620 {Value, getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1621 getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
1622 getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
1623
1624 if (SelectDirectAddr(BasePtr, Addr)) {
1625 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1626 NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1627 NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1628 if (!Opcode)
1629 return false;
1630 Ops.append({Addr, Chain});
1631 } else if (PointerSize == 64
1632 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1633 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1634 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1635 NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1636 NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1637 if (!Opcode)
1638 return false;
1639 Ops.append({Base, Offset, Chain});
1640 } else if (PointerSize == 64
1641 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1642 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1643 if (PointerSize == 64)
1644 Opcode =
1645 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1646 NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
1647 NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1648 else
1649 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1650 NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1651 NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1652 if (!Opcode)
1653 return false;
1654 Ops.append({Base, Offset, Chain});
1655 } else {
1656 if (PointerSize == 64)
1657 Opcode =
1658 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1659 NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1660 NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1661 else
1662 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1663 NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1664 NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1665 if (!Opcode)
1666 return false;
1667 Ops.append({BasePtr, Chain});
1668 }
1669
1670 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1671
1672 if (!NVPTXST)
1673 return false;
1674
1675 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1676 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1677 ReplaceNode(N, NVPTXST);
1678 return true;
1679}
1680
1681bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1682 SDValue Op1 = N->getOperand(1);
1684 std::optional<unsigned> Opcode;
1685 SDNode *ST;
1686 EVT EltVT = Op1.getValueType();
1687 MemSDNode *MemSD = cast<MemSDNode>(N);
1688 EVT StoreVT = MemSD->getMemoryVT();
1689
1690 // Address Space Setting
1691 unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1692 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1693 report_fatal_error("Cannot store to pointer that points to constant "
1694 "memory space");
1695 }
1696 unsigned int PointerSize =
1697 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1698
1699 SDLoc DL(N);
1700 SDValue Chain = N->getOperand(0);
1701 auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
1702
1703 // Type Setting: toType + toTypeWidth
1704 // - for integer type, always use 'u'
1705 assert(StoreVT.isSimple() && "Store value is not simple");
1706 MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1707 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1708 unsigned ToType = getLdStRegType(ScalarVT);
1709
1710 SmallVector<SDValue, 12> Ops;
1711 SDValue N2;
1712 unsigned VecType;
1713
1714 switch (N->getOpcode()) {
1715 case NVPTXISD::StoreV2:
1716 VecType = NVPTX::PTXLdStInstCode::V2;
1717 Ops.append({N->getOperand(1), N->getOperand(2)});
1718 N2 = N->getOperand(3);
1719 break;
1720 case NVPTXISD::StoreV4:
1721 VecType = NVPTX::PTXLdStInstCode::V4;
1722 Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3),
1723 N->getOperand(4)});
1724 N2 = N->getOperand(5);
1725 break;
1726 default:
1727 return false;
1728 }
1729
1730 if (isVectorElementTypeUpsized(EltVT)) {
1731 EltVT = MVT::i32;
1733 ToTypeWidth = 32;
1734 }
1735
1736 Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1737 getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
1738 getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
1739
1740 if (SelectDirectAddr(N2, Addr)) {
1741 switch (N->getOpcode()) {
1742 default:
1743 return false;
1744 case NVPTXISD::StoreV2:
1745 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1746 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1747 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1748 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1749 break;
1750 case NVPTXISD::StoreV4:
1751 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1752 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1753 NVPTX::STV_i32_v4_avar, std::nullopt,
1754 NVPTX::STV_f32_v4_avar, std::nullopt);
1755 break;
1756 }
1757 Ops.push_back(Addr);
1758 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1759 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1760 switch (N->getOpcode()) {
1761 default:
1762 return false;
1763 case NVPTXISD::StoreV2:
1764 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1765 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1766 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1767 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1768 break;
1769 case NVPTXISD::StoreV4:
1770 Opcode =
1771 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1772 NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
1773 std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
1774 break;
1775 }
1776 Ops.append({Base, Offset});
1777 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1778 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1779 if (PointerSize == 64) {
1780 switch (N->getOpcode()) {
1781 default:
1782 return false;
1783 case NVPTXISD::StoreV2:
1784 Opcode =
1785            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1786                            NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
1787 NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
1788 NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
1789 break;
1790 case NVPTXISD::StoreV4:
1791 Opcode = pickOpcodeForVT(
1792 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
1793 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
1794 NVPTX::STV_f32_v4_ari_64, std::nullopt);
1795 break;
1796 }
1797 } else {
1798 switch (N->getOpcode()) {
1799 default:
1800 return false;
1801 case NVPTXISD::StoreV2:
1802 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1803 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
1804 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
1805 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
1806 break;
1807 case NVPTXISD::StoreV4:
1808 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1809 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
1810 NVPTX::STV_i32_v4_ari, std::nullopt,
1811 NVPTX::STV_f32_v4_ari, std::nullopt);
1812 break;
1813 }
1814 }
1815 Ops.append({Base, Offset});
1816 } else {
1817 if (PointerSize == 64) {
1818 switch (N->getOpcode()) {
1819 default:
1820 return false;
1821 case NVPTXISD::StoreV2:
1822 Opcode = pickOpcodeForVT(
1823 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
1824 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
1825 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
1826 NVPTX::STV_f64_v2_areg_64);
1827 break;
1828 case NVPTXISD::StoreV4:
1829 Opcode = pickOpcodeForVT(
1830 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
1831 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
1832 NVPTX::STV_f32_v4_areg_64, std::nullopt);
1833 break;
1834 }
1835 } else {
1836 switch (N->getOpcode()) {
1837 default:
1838 return false;
1839 case NVPTXISD::StoreV2:
1840 Opcode =
1841 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
1842 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
1843 NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
1844 NVPTX::STV_f64_v2_areg);
1845 break;
1846 case NVPTXISD::StoreV4:
1847 Opcode =
1848 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
1849 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
1850 std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
1851 break;
1852 }
1853 }
1854 Ops.push_back(N2);
1855 }
1856
1857 if (!Opcode)
1858 return false;
1859
1860 Ops.push_back(Chain);
1861
1862 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1863
1864 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1865 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
1866
1867 ReplaceNode(N, ST);
1868 return true;
1869}
1870
1871bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
1872 SDValue Chain = Node->getOperand(0);
1873 SDValue Offset = Node->getOperand(2);
1874 SDValue Glue = Node->getOperand(3);
1875 SDLoc DL(Node);
1876 MemSDNode *Mem = cast<MemSDNode>(Node);
1877
1878 unsigned VecSize;
1879 switch (Node->getOpcode()) {
1880 default:
1881 return false;
1882  case NVPTXISD::LoadParam:
1883    VecSize = 1;
1884 break;
1885  case NVPTXISD::LoadParamV2:
1886    VecSize = 2;
1887 break;
1888  case NVPTXISD::LoadParamV4:
1889    VecSize = 4;
1890 break;
1891 }
1892
1893 EVT EltVT = Node->getValueType(0);
1894 EVT MemVT = Mem->getMemoryVT();
1895
1896 std::optional<unsigned> Opcode;
1897
1898 switch (VecSize) {
1899 default:
1900 return false;
1901 case 1:
1902 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
1903 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
1904 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
1905 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
1906 break;
1907 case 2:
1908 Opcode =
1909 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
1910 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
1911 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
1912 NVPTX::LoadParamMemV2F64);
1913 break;
1914 case 4:
1915 Opcode =
1916 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
1917 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
1918 std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
1919 break;
1920 }
1921 if (!Opcode)
1922 return false;
1923
1924 SDVTList VTs;
1925 if (VecSize == 1) {
1926 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
1927 } else if (VecSize == 2) {
1928 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
1929 } else {
1930 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
1931 VTs = CurDAG->getVTList(EVTs);
1932 }
1933
1934 unsigned OffsetVal = Offset->getAsZExtVal();
1935
1936  SmallVector<SDValue, 2> Ops(
1937      {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
1938
1939 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
1940 return true;
1941}
1942
1943bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
1944 SDLoc DL(N);
1945 SDValue Chain = N->getOperand(0);
1946 SDValue Offset = N->getOperand(1);
1947 unsigned OffsetVal = Offset->getAsZExtVal();
1948 MemSDNode *Mem = cast<MemSDNode>(N);
1949
1950 // How many elements do we have?
1951 unsigned NumElts = 1;
1952 switch (N->getOpcode()) {
1953 default:
1954 return false;
1955  case NVPTXISD::StoreRetval:
1956    NumElts = 1;
1957 break;
1958  case NVPTXISD::StoreRetvalV2:
1959    NumElts = 2;
1960 break;
1961  case NVPTXISD::StoreRetvalV4:
1962    NumElts = 4;
1963 break;
1964 }
1965
1966 // Build vector of operands
1967  SmallVector<SDValue, 6> Ops;
1968  for (unsigned i = 0; i < NumElts; ++i)
1969 Ops.push_back(N->getOperand(i + 2));
1970 Ops.append({CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain});
1971
1972 // Determine target opcode
1973 // If we have an i1, use an 8-bit store. The lowering code in
1974 // NVPTXISelLowering will have already emitted an upcast.
1975 std::optional<unsigned> Opcode = 0;
1976 switch (NumElts) {
1977 default:
1978 return false;
1979 case 1:
1980    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
1981                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
1982 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
1983 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
1984 if (Opcode == NVPTX::StoreRetvalI8) {
1985 // Fine tune the opcode depending on the size of the operand.
1986 // This helps to avoid creating redundant COPY instructions in
1987 // InstrEmitter::AddRegisterOperand().
1988 switch (Ops[0].getSimpleValueType().SimpleTy) {
1989 default:
1990 break;
1991 case MVT::i32:
1992 Opcode = NVPTX::StoreRetvalI8TruncI32;
1993 break;
1994 case MVT::i64:
1995 Opcode = NVPTX::StoreRetvalI8TruncI64;
1996 break;
1997 }
1998 }
1999 break;
2000 case 2:
2001    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2002                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2003 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2004 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2005 break;
2006 case 4:
2007    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2008                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2009 NVPTX::StoreRetvalV4I32, std::nullopt,
2010 NVPTX::StoreRetvalV4F32, std::nullopt);
2011 break;
2012 }
2013 if (!Opcode)
2014 return false;
2015
2016 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2017 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2018 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2019
2020 ReplaceNode(N, Ret);
2021 return true;
2022}
2023
2024// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
2025#define getOpcV2H(ty, opKind0, opKind1) \
2026 NVPTX::StoreParamV2##ty##_##opKind0##opKind1
2027
2028#define getOpcV2H1(ty, opKind0, isImm1) \
2029 (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
2030
2031#define getOpcodeForVectorStParamV2(ty, isimm) \
2032 (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
2033
2034#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
2035 NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
2036
2037#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
2038 (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
2039 : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
2040
2041#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
2042 (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
2043 : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
2044
2045#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
2046 (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
2047 : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
2048
2049#define getOpcodeForVectorStParamV4(ty, isimm) \
2050 (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
2051 : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
2052
2053#define getOpcodeForVectorStParam(n, ty, isimm) \
2054 (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
2055 : getOpcodeForVectorStParamV4(ty, isimm)
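// Worked expansion (illustrative, not part of the original source): with
// NumElts == 2, ty == I32 and isimm == {true, false},
// getOpcodeForVectorStParam expands to NVPTX::StoreParamV2I32_ir, i.e. an
// immediate first operand followed by a register second operand.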
2056
2057 static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
2058                                            unsigned NumElts,
2059                                            MVT::SimpleValueType MemTy,
2060                                            SelectionDAG *CurDAG, SDLoc DL) {
2061  // Determine which inputs are registers and which are immediates, and
2062  // rebuild the immediate inputs as target constant nodes.
2063 SmallVector<bool, 4> IsImm(NumElts, false);
2064 for (unsigned i = 0; i < NumElts; i++) {
2065 IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
2066 if (IsImm[i]) {
2067 SDValue Imm = Ops[i];
2068 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2069 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2070 const ConstantFP *CF = ConstImm->getConstantFPValue();
2071 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2072 } else {
2073 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2074 const ConstantInt *CI = ConstImm->getConstantIntValue();
2075 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2076 }
2077 Ops[i] = Imm;
2078 }
2079 }
2080
2081 // Get opcode for MemTy, size, and register/immediate operand ordering
2082 switch (MemTy) {
2083 case MVT::i8:
2084 return getOpcodeForVectorStParam(NumElts, I8, IsImm);
2085 case MVT::i16:
2086 return getOpcodeForVectorStParam(NumElts, I16, IsImm);
2087 case MVT::i32:
2088 return getOpcodeForVectorStParam(NumElts, I32, IsImm);
2089 case MVT::i64:
2090 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2091 return getOpcodeForVectorStParamV2(I64, IsImm);
2092 case MVT::f32:
2093 return getOpcodeForVectorStParam(NumElts, F32, IsImm);
2094 case MVT::f64:
2095 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2096 return getOpcodeForVectorStParamV2(F64, IsImm);
2097
2098 // These cases don't support immediates, just use the all register version
2099 // and generate moves.
2100 case MVT::i1:
2101 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
2102 : NVPTX::StoreParamV4I8_rrrr;
2103 case MVT::f16:
2104 case MVT::bf16:
2105 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
2106 : NVPTX::StoreParamV4I16_rrrr;
2107 case MVT::v2f16:
2108 case MVT::v2bf16:
2109 case MVT::v2i16:
2110 case MVT::v4i8:
2111 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
2112 : NVPTX::StoreParamV4I32_rrrr;
2113 default:
2114 llvm_unreachable("Cannot select st.param for unknown MemTy");
2115 }
2116}
2117
2118bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2119 SDLoc DL(N);
2120 SDValue Chain = N->getOperand(0);
2121 SDValue Param = N->getOperand(1);
2122 unsigned ParamVal = Param->getAsZExtVal();
2123 SDValue Offset = N->getOperand(2);
2124 unsigned OffsetVal = Offset->getAsZExtVal();
2125 MemSDNode *Mem = cast<MemSDNode>(N);
2126 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2127
2128 // How many elements do we have?
2129 unsigned NumElts;
2130 switch (N->getOpcode()) {
2131 default:
2132 llvm_unreachable("Unexpected opcode");
2133  case NVPTXISD::StoreParamU32:
2134  case NVPTXISD::StoreParamS32:
2135  case NVPTXISD::StoreParam:
2136    NumElts = 1;
2137 break;
2138  case NVPTXISD::StoreParamV2:
2139    NumElts = 2;
2140 break;
2141  case NVPTXISD::StoreParamV4:
2142    NumElts = 4;
2143 break;
2144 }
2145
2146 // Build vector of operands
2147  SmallVector<SDValue, 8> Ops;
2148  for (unsigned i = 0; i < NumElts; ++i)
2149 Ops.push_back(N->getOperand(i + 3));
2150 Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
2151 CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
2152
2153 // Determine target opcode
2154 // If we have an i1, use an 8-bit store. The lowering code in
2155 // NVPTXISelLowering will have already emitted an upcast.
2156 std::optional<unsigned> Opcode;
2157 switch (N->getOpcode()) {
2158 default:
2159 switch (NumElts) {
2160 default:
2161 llvm_unreachable("Unexpected NumElts");
2162 case 1: {
2163      MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2164      SDValue Imm = Ops[0];
2165 if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
2166 (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
2167 // Convert immediate to target constant
2168 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2169 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2170 const ConstantFP *CF = ConstImm->getConstantFPValue();
2171 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2172 } else {
2173 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2174 const ConstantInt *CI = ConstImm->getConstantIntValue();
2175 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2176 }
2177 Ops[0] = Imm;
2178 // Use immediate version of store param
2179 Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
2180 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
2181 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
2182 NVPTX::StoreParamF64_i);
2183 } else
2184 Opcode =
2185            pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2186                            NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
2187 NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
2188 NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
2189 if (Opcode == NVPTX::StoreParamI8_r) {
2190 // Fine tune the opcode depending on the size of the operand.
2191 // This helps to avoid creating redundant COPY instructions in
2192 // InstrEmitter::AddRegisterOperand().
2193 switch (Ops[0].getSimpleValueType().SimpleTy) {
2194 default:
2195 break;
2196 case MVT::i32:
2197 Opcode = NVPTX::StoreParamI8TruncI32_r;
2198 break;
2199 case MVT::i64:
2200 Opcode = NVPTX::StoreParamI8TruncI64_r;
2201 break;
2202 }
2203 }
2204 break;
2205 }
2206 case 2:
2207 case 4: {
2209 Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
2210 break;
2211 }
2212 }
2213 break;
2214 // Special case: if we have a sign-extend/zero-extend node, insert the
2215 // conversion instruction first, and use that as the value operand to
2216 // the selected StoreParam node.
2217  case NVPTXISD::StoreParamU32: {
2218    Opcode = NVPTX::StoreParamI32_r;
2219    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2220                                                MVT::i32);
2221 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2222 MVT::i32, Ops[0], CvtNone);
2223 Ops[0] = SDValue(Cvt, 0);
2224 break;
2225 }
2226  case NVPTXISD::StoreParamS32: {
2227    Opcode = NVPTX::StoreParamI32_r;
2228    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2229                                                MVT::i32);
2230 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2231 MVT::i32, Ops[0], CvtNone);
2232 Ops[0] = SDValue(Cvt, 0);
2233 break;
2234 }
2235 }
2236
2237 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2238 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2239 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2240 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2241
2242 ReplaceNode(N, Ret);
2243 return true;
2244}
2245
2246 /// tryBFE - Look for instruction sequences that can be made more efficient
2247/// by using the 'bfe' (bit-field extract) PTX instruction
2248bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
2249 SDLoc DL(N);
2250 SDValue LHS = N->getOperand(0);
2251 SDValue RHS = N->getOperand(1);
2252 SDValue Len;
2253 SDValue Start;
2254 SDValue Val;
2255 bool IsSigned = false;
2256
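  // Worked example (illustrative, not part of the original source): for an
  // i32 value, (and (srl %x, 8), 255) matches the first case below with
  // Len = 8 (the number of trailing ones in the mask) and Start = 8 (the
  // shift amount), and is selected as BFE_U32rii %x, 8, 8.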
2257 if (N->getOpcode() == ISD::AND) {
2258 // Canonicalize the operands
2259 // We want 'and %val, %mask'
2260 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
2261 std::swap(LHS, RHS);
2262 }
2263
2264 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
2265 if (!Mask) {
2266 // We need a constant mask on the RHS of the AND
2267 return false;
2268 }
2269
2270 // Extract the mask bits
2271 uint64_t MaskVal = Mask->getZExtValue();
2272 if (!isMask_64(MaskVal)) {
2273 // We *could* handle shifted masks here, but doing so would require an
2274 // 'and' operation to fix up the low-order bits so we would trade
2275 // shr+and for bfe+and, which has the same throughput
2276 return false;
2277 }
2278
2279 // How many bits are in our mask?
2280 int64_t NumBits = countr_one(MaskVal);
2281 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
2282
2283 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
2284 // We have a 'srl/and' pair, extract the effective start bit and length
2285 Val = LHS.getNode()->getOperand(0);
2286 Start = LHS.getNode()->getOperand(1);
2287 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
2288 if (StartConst) {
2289 uint64_t StartVal = StartConst->getZExtValue();
2290 // How many "good" bits do we have left? "good" is defined here as bits
2291 // that exist in the original value, not shifted in.
2292 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
2293 if (NumBits > GoodBits) {
2294 // Do not handle the case where bits have been shifted in. In theory
2295 // we could handle this, but the cost is likely higher than just
2296 // emitting the srl/and pair.
2297 return false;
2298 }
2299 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
2300 } else {
2301 // Do not handle the case where the shift amount (can be zero if no srl
2302 // was found) is not constant. We could handle this case, but it would
2303 // require run-time logic that would be more expensive than just
2304 // emitting the srl/and pair.
2305 return false;
2306 }
2307 } else {
2308 // Do not handle the case where the LHS of the and is not a shift. While
2309 // it would be trivial to handle this case, it would just transform
2310      // 'and' -> 'bfe', but 'and' has higher throughput.
2311 return false;
2312 }
2313 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
2314 if (LHS->getOpcode() == ISD::AND) {
2315 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
2316 if (!ShiftCnst) {
2317 // Shift amount must be constant
2318 return false;
2319 }
2320
2321 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
2322
2323 SDValue AndLHS = LHS->getOperand(0);
2324 SDValue AndRHS = LHS->getOperand(1);
2325
2326 // Canonicalize the AND to have the mask on the RHS
2327 if (isa<ConstantSDNode>(AndLHS)) {
2328 std::swap(AndLHS, AndRHS);
2329 }
2330
2331 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
2332 if (!MaskCnst) {
2333 // Mask must be constant
2334 return false;
2335 }
2336
2337 uint64_t MaskVal = MaskCnst->getZExtValue();
2338 uint64_t NumZeros;
2339 uint64_t NumBits;
2340 if (isMask_64(MaskVal)) {
2341 NumZeros = 0;
2342 // The number of bits in the result bitfield will be the number of
2343 // trailing ones (the AND) minus the number of bits we shift off
2344 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
2345 } else if (isShiftedMask_64(MaskVal)) {
2346 NumZeros = llvm::countr_zero(MaskVal);
2347 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
2348 // The number of bits in the result bitfield will be the number of
2349 // trailing zeros plus the number of set bits in the mask minus the
2350 // number of bits we shift off
2351 NumBits = NumZeros + NumOnes - ShiftAmt;
2352 } else {
2353 // This is not a mask we can handle
2354 return false;
2355 }
2356
2357 if (ShiftAmt < NumZeros) {
2358 // Handling this case would require extra logic that would make this
2359 // transformation non-profitable
2360 return false;
2361 }
2362
2363 Val = AndLHS;
2364 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
2365 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
2366 } else if (LHS->getOpcode() == ISD::SHL) {
2367 // Here, we have a pattern like:
2368 //
2369 // (sra (shl val, NN), MM)
2370 // or
2371 // (srl (shl val, NN), MM)
2372 //
2373 // If MM >= NN, we can efficiently optimize this with bfe
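      // Worked example (illustrative, not part of the original source): for
      // i32, (sra (shl %x, 24), 28) yields Start = 28 - 24 = 4 and
      // Len = 32 - 28 = 4, and is selected as BFE_S32rii %x, 4, 4 (signed,
      // because the outer shift is an sra).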
2374 Val = LHS->getOperand(0);
2375
2376 SDValue ShlRHS = LHS->getOperand(1);
2377 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
2378 if (!ShlCnst) {
2379 // Shift amount must be constant
2380 return false;
2381 }
2382 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
2383
2384 SDValue ShrRHS = RHS;
2385 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
2386 if (!ShrCnst) {
2387 // Shift amount must be constant
2388 return false;
2389 }
2390 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
2391
2392 // To avoid extra codegen and be profitable, we need Outer >= Inner
2393 if (OuterShiftAmt < InnerShiftAmt) {
2394 return false;
2395 }
2396
2397 // If the outer shift is more than the type size, we have no bitfield to
2398 // extract (since we also check that the inner shift is <= the outer shift
2399 // then this also implies that the inner shift is < the type size)
2400 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
2401 return false;
2402 }
2403
2404 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
2405 MVT::i32);
2406 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
2407 DL, MVT::i32);
2408
2409 if (N->getOpcode() == ISD::SRA) {
2410        // If we have an arithmetic right shift, we need to use the signed bfe
2411 // variant
2412 IsSigned = true;
2413 }
2414 } else {
2415 // No can do...
2416 return false;
2417 }
2418 } else {
2419 // No can do...
2420 return false;
2421 }
2422
2423
2424 unsigned Opc;
2425 // For the BFE operations we form here from "and" and "srl", always use the
2426 // unsigned variants.
2427 if (Val.getValueType() == MVT::i32) {
2428 if (IsSigned) {
2429 Opc = NVPTX::BFE_S32rii;
2430 } else {
2431 Opc = NVPTX::BFE_U32rii;
2432 }
2433 } else if (Val.getValueType() == MVT::i64) {
2434 if (IsSigned) {
2435 Opc = NVPTX::BFE_S64rii;
2436 } else {
2437 Opc = NVPTX::BFE_U64rii;
2438 }
2439 } else {
2440 // We cannot handle this type
2441 return false;
2442 }
2443
2444 SDValue Ops[] = {
2445 Val, Start, Len
2446 };
2447
2448 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
2449 return true;
2450}
2451
2452// SelectDirectAddr - Match a direct address for DAG.
2453// A direct address could be a globaladdress or externalsymbol.
2454bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
2455 // Return true if TGA or ES.
2456 if (N.getOpcode() == ISD::TargetGlobalAddress ||
2457 N.getOpcode() == ISD::TargetExternalSymbol) {
2458 Address = N;
2459 return true;
2460 }
2461 if (N.getOpcode() == NVPTXISD::Wrapper) {
2462 Address = N.getOperand(0);
2463 return true;
2464 }
2465 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
2466 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
2467 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
2468        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
2469        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
2470      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
2471 }
2472 return false;
2473}
2474
2475// symbol+offset
2476bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
2477 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
2478 if (Addr.getOpcode() == ISD::ADD) {
2479 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
2480 SDValue base = Addr.getOperand(0);
2481 if (SelectDirectAddr(base, Base)) {
2482 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
2483 mvt);
2484 return true;
2485 }
2486 }
2487 }
2488 return false;
2489}
2490
2491// symbol+offset
2492bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
2493                                     SDValue &Base, SDValue &Offset) {
2494  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
2495}
2496
2497// symbol+offset
2498bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
2499                                       SDValue &Base, SDValue &Offset) {
2500  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
2501}
2502
2503// register+offset
2504bool NVPTXDAGToDAGISel::SelectADDRri_imp(
2505 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
2506 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
2507 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
2508 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
2509 return true;
2510 }
2511 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
2512 Addr.getOpcode() == ISD::TargetGlobalAddress)
2513 return false; // direct calls.
2514
2515 if (Addr.getOpcode() == ISD::ADD) {
2516 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
2517 return false;
2518 }
2519 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
2520 if (FrameIndexSDNode *FIN =
2521 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
2522 // Constant offset from frame ref.
2523 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
2524 else
2525 Base = Addr.getOperand(0);
2526
2527 // Offset must fit in a 32-bit signed int in PTX [register+offset] address
2528 // mode
2529 if (!CN->getAPIntValue().isSignedIntN(32))
2530 return false;
2531
2532 Offset = CurDAG->getSignedTargetConstant(CN->getSExtValue(),
2533 SDLoc(OpNode), MVT::i32);
2534 return true;
2535 }
2536 }
2537 return false;
2538}
2539
2540// register+offset
2541bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
2542                                     SDValue &Base, SDValue &Offset) {
2543  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
2544}
2545
2546// register+offset
2547bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
2548                                       SDValue &Base, SDValue &Offset) {
2549  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
2550}
2551
2552bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
2553 unsigned int spN) const {
2554 const Value *Src = nullptr;
2555 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
2556 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
2557 return true;
2558 Src = mN->getMemOperand()->getValue();
2559 }
2560 if (!Src)
2561 return false;
2562 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
2563 return (PT->getAddressSpace() == spN);
2564 return false;
2565}
2566
2567/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
2568/// inline asm expressions.
2569 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
2570     const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
2571 std::vector<SDValue> &OutOps) {
2572 SDValue Op0, Op1;
2573 switch (ConstraintID) {
2574 default:
2575 return true;
2576 case InlineAsm::ConstraintCode::m: // memory
2577 if (SelectDirectAddr(Op, Op0)) {
2578 OutOps.push_back(Op0);
2579 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
2580 return false;
2581 }
2582 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
2583 OutOps.push_back(Op0);
2584 OutOps.push_back(Op1);
2585 return false;
2586 }
2587 break;
2588 }
2589 return true;
2590}
2591
2592void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
2593 // Lower a CopyToReg with two 64-bit inputs
2594 // Dst:i128, lo:i64, hi:i64
2595 //
2596 // CopyToReg Dst, lo, hi;
2597 //
2598 // ==>
2599 //
2600 // tmp = V2I64toI128 {lo, hi};
2601 // CopyToReg Dst, tmp;
2602 SDValue Dst = N->getOperand(1);
2603 SDValue Lo = N->getOperand(2);
2604 SDValue Hi = N->getOperand(3);
2605
2606 SDLoc DL(N);
2607 SDNode *Mov =
2608 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
2609
2610 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
2611 NewOps[0] = N->getOperand(0);
2612 NewOps[1] = Dst;
2613 NewOps[2] = SDValue(Mov, 0);
2614 if (N->getNumOperands() == 5)
2615 NewOps[3] = N->getOperand(4);
2616 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
2617
2618 ReplaceNode(N, NewValue.getNode());
2619}
2620
2621void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
2622 // Lower CopyFromReg from a 128-bit regs to two 64-bit regs
2623 // Dst:i128, Src:i128
2624 //
2625 // {lo, hi} = CopyFromReg Src
2626 //
2627 // ==>
2628 //
2629 // {lo, hi} = I128toV2I64 Src
2630 //
2631 SDValue Ch = N->getOperand(0);
2632 SDValue Src = N->getOperand(1);
2633 SDValue Glue = N->getOperand(2);
2634 SDLoc DL(N);
2635
2636  // Add Glue and Ch to the operands and results to avoid breaking the
2637  // execution order.
2638  SDNode *Mov = CurDAG->getMachineNode(
2639      NVPTX::I128toV2I64, DL,
2640 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
2641 {Src, Ch, Glue});
2642
2643 ReplaceNode(N, Mov);
2644}
2645
2646/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
2647/// conversion from \p SrcTy to \p DestTy.
2648unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
2649 LoadSDNode *LdNode) {
2650 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
2651 switch (SrcTy.SimpleTy) {
2652 default:
2653 llvm_unreachable("Unhandled source type");
2654 case MVT::i8:
2655 switch (DestTy.SimpleTy) {
2656 default:
2657 llvm_unreachable("Unhandled dest type");
2658 case MVT::i16:
2659 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
2660 case MVT::i32:
2661 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
2662 case MVT::i64:
2663 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
2664 }
2665 case MVT::i16:
2666 switch (DestTy.SimpleTy) {
2667 default:
2668 llvm_unreachable("Unhandled dest type");
2669 case MVT::i8:
2670 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
2671 case MVT::i32:
2672 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
2673 case MVT::i64:
2674 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
2675 }
2676 case MVT::i32:
2677 switch (DestTy.SimpleTy) {
2678 default:
2679 llvm_unreachable("Unhandled dest type");
2680 case MVT::i8:
2681 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
2682 case MVT::i16:
2683 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
2684 case MVT::i64:
2685 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
2686 }
2687 case MVT::i64:
2688 switch (DestTy.SimpleTy) {
2689 default:
2690 llvm_unreachable("Unhandled dest type");
2691 case MVT::i8:
2692 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
2693 case MVT::i16:
2694 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
2695 case MVT::i32:
2696 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
2697 }
2698 case MVT::f16:
2699 switch (DestTy.SimpleTy) {
2700 default:
2701 llvm_unreachable("Unhandled dest type");
2702 case MVT::f32:
2703 return NVPTX::CVT_f32_f16;
2704 case MVT::f64:
2705 return NVPTX::CVT_f64_f16;
2706 }
2707 }
2708}
2709
2710bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
2711 SDLoc DL(N);
2712 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
2713 unsigned int FenceOp =
2714 getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
2715 Scopes[N->getConstantOperandVal(2)], Subtarget);
2716 SDValue Chain = N->getOperand(0);
2717 SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
2718 ReplaceNode(N, FenceNode);
2719 return true;
2720}
2721
2722 NVPTXScopes::NVPTXScopes(LLVMContext &C) {
2723  Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
2724 Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
2725 Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
2726 Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
2727 Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
2728}
2729
2730 NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
2731  if (Scopes.empty())
2732 llvm_unreachable("NVPTX Scopes must be initialized before calling "
2733 "NVPTXScopes::operator[]");
2734
2735 auto S = Scopes.find(ID);
2736 if (S == Scopes.end()) {
2737 // TODO:
2738 // - Add API to LLVMContext to get the name of a single scope.
2739 // - Use that API here to print an error containing the name
2740 // of this Unknown ID.
2741 report_fatal_error(formatv("Could not find scope ID={}.", int(ID)));
2742 }
2743 return S->second;
2744}
2745
2746bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
2747
2748#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
2749 (is_s32 \
2750 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
2751 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
2752
2753#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
2754 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
2755 : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
2756
2757#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
2758 is_s32) \
2759 (is_reduce \
2760 ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
2761 : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
2762 is_s32)))
2763
2764#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
2765 [&]() -> auto { \
2766 if (is_mc && is_ch) \
2767 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
2768 if (is_ch) \
2769 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
2770 if (is_mc) \
2771 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
2772 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
2773 }()
2774
2775#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
2776 (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2777 : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
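// Worked expansion (illustrative, not part of the original source):
//   GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, /*is_mc=*/true,
//                                       /*is_ch=*/true, /*is_s32=*/false)
// evaluates to NVPTX::CP_ASYNC_BULK_TENSOR_G2S_3D_IM2COL_MC_CH.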
2778
2779static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
2780 bool IsCacheHint, bool IsIm2Col,
2781 bool IsReduce = false) {
2782 if (IsIm2Col) {
2783 switch (Dim) {
2784 case 3:
2785 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
2786 IsCacheHint, IsShared32);
2787 case 4:
2788 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
2789 IsCacheHint, IsShared32);
2790 case 5:
2791 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
2792 IsCacheHint, IsShared32);
2793 default:
2794 llvm_unreachable("Invalid Dimension in im2col mode for "
2795 "GetCpAsyncBulkTensorS2GOpcode.");
2796 }
2797 } else {
2798 switch (Dim) {
2799 case 1:
2800 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
2801 IsCacheHint, IsShared32);
2802 case 2:
2803 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
2804 IsCacheHint, IsShared32);
2805 case 3:
2806 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
2807 IsCacheHint, IsShared32);
2808 case 4:
2809 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
2810 IsCacheHint, IsShared32);
2811 case 5:
2812 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
2813 IsCacheHint, IsShared32);
2814 default:
2816 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
2817 }
2818 }
2819}
2820
2821static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
2822 bool IsMultiCast,
2823 bool IsCacheHint, bool IsIm2Col) {
2824 if (IsIm2Col) {
2825 switch (Dim) {
2826 case 3:
2827 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
2828 IsCacheHint, IsShared32);
2829 case 4:
2830 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
2831 IsCacheHint, IsShared32);
2832 case 5:
2833 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
2834 IsCacheHint, IsShared32);
2835 default:
2836 llvm_unreachable("Invalid Dimension in im2col mode for "
2837 "GetCpAsyncBulkTensorG2SOpcode.");
2838 }
2839 } else {
2840 switch (Dim) {
2841 case 1:
2842 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
2843 IsCacheHint, IsShared32);
2844 case 2:
2845 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
2846 IsCacheHint, IsShared32);
2847 case 3:
2848 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
2849 IsCacheHint, IsShared32);
2850 case 4:
2851 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
2852 IsCacheHint, IsShared32);
2853 case 5:
2854 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
2855 IsCacheHint, IsShared32);
2856 default:
2858 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
2859 }
2860 }
2861}
2862
2863static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
2864 bool IsIm2Col) {
2865 if (IsIm2Col) {
2866 switch (Dim) {
2867 case 3:
2868 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
2869 case 4:
2870 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
2871 case 5:
2872 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
2873 default:
2874 llvm_unreachable("Invalid Dimension in im2col mode for "
2875 "GetCpAsyncBulkTensorPrefetchOpcode.");
2876 }
2877 } else {
2878 switch (Dim) {
2879 case 1:
2880 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
2881 case 2:
2882 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
2883 case 3:
2884 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
2885 case 4:
2886 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
2887 case 5:
2888 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
2889 default:
2890 llvm_unreachable("Invalid Dimension in tile mode for "
2891 "GetCpAsyncBulkTensorPrefetchOpcode.");
2892 }
2893 }
2894}
2895
2896static size_t GetDimsFromIntrinsic(unsigned IID) {
2897 switch (IID) {
2898 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2899 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2900 return 3;
2901 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2902 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2903 return 4;
2904 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2905 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2906 return 5;
2907 default:
2908 llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
2909 }
2910}
2911
2912void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
2913 bool IsIm2Col) {
2914  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2915 // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
2916 // multicast, cache_hint,
2917 // multicast_flag, cache_hint_flag}
2918 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2919 // = {2} + {7 + dims + im2col_offsets}
2920 size_t NumOps = N->getNumOperands();
2921 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
2922 : (NumOps - 9);
2923  // The number of offsets is always 'NumDims - 2' (im2col mode only)
2924 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2925 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2926 bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
2927 size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
2928 size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
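  // Worked example (illustrative, not part of the original source): the 3D
  // im2col variant has 13 operands: {Chain, IID, dst, mbar, src, d0, d1, d2,
  // im2col_off0, multicast, cache_hint, multicast_flag, cache_hint_flag}, so
  // NumDims = 3, NumOffsets = 1, NumBaseArgs = 7 and MultiCastIdx = 9, which
  // indeed indexes the multicast operand.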
2929
2930 SDLoc DL(N);
2931 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
2932
2933 // Push MultiCast operand, if available
2934 if (IsMultiCast)
2935 Ops.push_back(N->getOperand(MultiCastIdx));
2936
2937 // Push CacheHint operand, if available
2938 if (IsCacheHint)
2939 Ops.push_back(N->getOperand(MultiCastIdx + 1));
2940
2941 // Finally, the chain operand
2942 Ops.push_back(N->getOperand(0));
2943
2944 bool IsShared32 =
2945      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2946  unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
2947 NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
2948 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2949}
2950
2951void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
2952 bool IsIm2Col) {
2953  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2954 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2955 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2956 // = {2} + {4 + dims}
2957 size_t NumOps = N->getNumOperands();
2958 size_t NumDims = NumOps - 6;
2959 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2960 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
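  // Worked example (illustrative, not part of the original source): the 2D
  // tile variant has 8 operands {Chain, IID, src, dst, d0, d1, cache_hint,
  // cache_hint_flag}, so NumDims = 2 and, with the cache-hint flag set,
  // NumArgs = 5 selects {src, dst, d0, d1, cache_hint}.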
2961
2962 SDLoc DL(N);
2963 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
2964 Ops.push_back(N->getOperand(0)); // Chain operand
2965
2966 bool IsShared32 =
2967      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2968  unsigned Opcode =
2969 GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
2970 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2971}
2972
2973void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
2974 bool IsIm2Col) {
2975  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2976 // {src, dims{d0...dN}, im2col_offsets{dims-2}
2977 // cache_hint, cache_hint_flag}
2978 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2979 // = {2} + {3 + dims + im2col_offsets}
2980 size_t NumOps = N->getNumOperands();
2981 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
2982 : (NumOps - 5);
2983  // The number of offsets is always 'NumDims - 2' (im2col mode only)
2984 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2985 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2986 size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
2987
2988 SDLoc DL(N);
2989 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
2990 Ops.push_back(N->getOperand(0)); // Chain operand
2991
2992 unsigned Opcode =
2993 GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
2994 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2995}
2996
2997void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
2998 unsigned RedOp,
2999 bool IsIm2Col) {
3000  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
3001 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
3002 // NumOperands = {Chain, IID} + {Actual intrinsic args}
3003 // = {2} + {4 + dims}
3004 size_t NumOps = N->getNumOperands();
3005 size_t NumDims = NumOps - 6;
3006 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3007 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
3008
3009 SDLoc DL(N);
3010 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
3011 Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op
3012 Ops.push_back(N->getOperand(0)); // Chain operand
3013
3014 bool IsShared32 =
3015      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
3016  unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
3017 NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
3018 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
3019}
3020
3021bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
3022 unsigned IID = N->getConstantOperandVal(1);
3023 using TMARedTy = llvm::nvvm::TMAReductionOp;
3024 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
3025 switch (IID) {
3026 default:
3027 return false;
3028 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
3029 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
3030 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
3031 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
3032 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
3033 SelectCpAsyncBulkTensorS2GCommon(N);
3034 return true;
3035 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
3036 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
3037 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
3038 SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
3039 return true;
3040 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
3041 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
3042 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
3043 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
3044 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
3045 SelectCpAsyncBulkTensorG2SCommon(N);
3046 return true;
3047 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
3048 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
3049 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
3050 SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
3051 return true;
3052 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
3053 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
3054 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
3055 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
3056 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
3057 SelectCpAsyncBulkTensorPrefetchCommon(N);
3058 return true;
3059 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
3060 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
3061 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
3062 SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
3063 return true;
3064 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
3065 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
3066 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
3067 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
3068 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
3069 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
3070 return true;
3071 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
3072 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
3073 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
3074 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
3075 /*IsIm2Col=*/true);
3076 return true;
3077 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
3078 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
3079 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
3080 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
3081 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
3082 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
3083 return true;
3084 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
3085 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
3086 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
3087 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
3088 /*IsIm2Col=*/true);
3089 return true;
3090 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
3091 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
3092 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
3093 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
3094 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
3095 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
3096 return true;
3097 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
3098 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
3099 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
3100 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
3101 /*IsIm2Col=*/true);
3102 return true;
3103 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
3104 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
3105 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
3106 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
3107 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
3108 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
3109 return true;
3110 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
3111 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
3112 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
3113 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
3114 /*IsIm2Col=*/true);
3115 return true;
3116 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
3117 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
3118 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
3119 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
3120 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
3121 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
3122 return true;
3123 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
3124 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
3125 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
3126 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
3127 /*IsIm2Col=*/true);
3128 return true;
3129 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
3130 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
3131 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
3132 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
3133 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
3134 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
3135 return true;
3136 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
3137 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
3138 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
3139 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
3140 /*IsIm2Col=*/true);
3141 return true;
3142 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
3143 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
3144 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
3145 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
3146 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
3147 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
3148 return true;
3149 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
3150 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
3151 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
3152 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
3153 /*IsIm2Col=*/true);
3154 return true;
3155 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
3156 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
3157 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
3158 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
3159 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
3160 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
3161 return true;
3162 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
3163 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
3164 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
3165 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
3166 /*IsIm2Col=*/true);
3167 return true;
3168 }
3169}
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static const LLT F64
static const LLT F32
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
uint64_t Addr
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define getOpcodeForVectorStParam(n, ty, isimm)
static unsigned int getCodeAddrSpace(MemSDNode *N)
static bool isVectorElementTypeUpsized(EVT EltVT)
static size_t GetDimsFromIntrinsic(unsigned IID)
static int getLdStRegType(EVT VT)
static unsigned pickOpcodeForVectorStParam(SmallVector< SDValue, 8 > &Ops, unsigned NumElts, MVT::SimpleValueType MemTy, SelectionDAG *CurDAG, SDLoc DL)
#define getOpcodeForVectorStParamV2(ty, isimm)
static cl::opt< bool > EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden, cl::desc("Enable reciprocal sqrt optimization"))
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch)
static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint, bool IsIm2Col)
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)
static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, bool IsMultiCast, bool IsCacheHint, bool IsIm2Col)
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ)
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, NVPTXSubtarget const *T)
static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, bool IsIm2Col, bool IsReduce=false)
static std::optional< unsigned > pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16, unsigned Opcode_i32, std::optional< unsigned > Opcode_i64, unsigned Opcode_f32, std::optional< unsigned > Opcode_f64)
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, is_s32)
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F)
This file contains the definitions of the enumerations and flags associated with NVVM Intrinsics.
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
#define PASS_NAME
Value * RHS
Value * LHS
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
This is an SDNode representing atomic operations.
const SDValue & getVal() const
const ConstantFP * getConstantFPValue() const
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:271
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
const ConstantInt * getConstantIntValue() const
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:364
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
Record instruction ordering so we can query their relative positions within a function.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
This class is used to represent ISD::LOAD nodes.
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Machine Value Type.
SimpleValueType SimpleTy
bool isVector() const
Return true if this is a vector value type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
A description of a memory reference used in the backend.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
EVT getMemoryVT() const
Return the type of the in-memory value.
NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
bool SelectInlineAsmMemoryOperand(const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector< SDValue > &OutOps) override
SelectInlineAsmMemoryOperand - Implement addressing mode selection for inline asm expressions.
const NVPTXSubtarget * Subtarget
void failIfClustersUnsupported(std::string const &FailureMessage) const
const NVPTXTargetLowering * getTargetLowering() const override
bool hasRelaxedMMIO() const
bool hasMemoryOrdering() const
bool useF32FTZ(const MachineFunction &MF) const
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool allowUnsafeFPMath(MachineFunction &MF) const
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const SDValue & getOperand(unsigned Num) const
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
MachineFunction * MF
CodeGenOptLevel OptLevel
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
virtual bool runOnMachineFunction(MachineFunction &mf)
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:495
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:753
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:710
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT)
Definition: SelectionDAG.h:734
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:698
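The SelectionDAG and SelectionDAGISel helpers above usually appear together in one selection step: build target-constant operands, create a MachineSDNode, attach its memory operands, then replace the original node. A hedged sketch, assuming N is a chained node whose operand 0 is the chain (the opcode and operand layout are placeholders):
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper; the caller would finish with ReplaceNode(N, MN).
static MachineSDNode *buildMachineNode(SelectionDAG &DAG, SDNode *N,
                                       unsigned MachineOpc, int64_t Imm,
                                       MachineMemOperand *MMO) {
  SDLoc DL(N);
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other);
  SDValue Ops[] = {DAG.getTargetConstant(Imm, DL, MVT::i32),
                   N->getOperand(0)}; // assumed chain operand
  MachineSDNode *MN = DAG.getMachineNode(MachineOpc, DL, VTs, Ops);
  DAG.setNodeMemRefs(MN, {MMO}); // keep the memory reference on the new node
  return MN;
}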
bool empty() const
Definition: SmallVector.h:81
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
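A self-contained usage sketch for the SmallVector operations listed above (names are illustrative):
#include "llvm/ADT/SmallVector.h"
#include <iterator>
int smallVectorDemo() {
  llvm::SmallVector<unsigned, 4> Ops; // inline storage for 4 elements
  Ops.push_back(42);                  // append a single element
  unsigned More[] = {1, 2, 3};
  Ops.append(std::begin(More), std::end(More)); // append a range
  return Ops.empty() ? 0 : static_cast<int>(Ops.size());
}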
This class is used to represent ISD::STORE nodes.
const SDValue & getValue() const
unsigned getPointerSizeInBits(unsigned AS) const
LLVM Value Representation.
Definition: Value.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with side effects that does not return a result.
Definition: ISDOpcodes.h:205
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1304
@ TargetExternalSymbol
Definition: ISDOpcodes.h:175
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defined outside of the scope of this SelectionDAG.
Definition: ISDOpcodes.h:215
@ TargetGlobalAddress
TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or anything else with this node...
Definition: ISDOpcodes.h:170
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
Definition: ISDOpcodes.h:209
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:190
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
Definition: ISDOpcodes.h:198
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1613
std::string ScopeToString(Scope S)
@ Global
Definition: NVPTX.h:145
@ Shared
Definition: NVPTX.h:146
@ Generic
Definition: NVPTX.h:144
std::string OrderingToString(Ordering Order)
@ System
Definition: NVPTX.h:138
@ Cluster
Definition: NVPTX.h:136
@ Thread
Definition: NVPTX.h:134
@ Device
Definition: NVPTX.h:137
@ RelaxedMMIO
Definition: NVPTX.h:128
@ Acquire
Definition: NVPTX.h:122
@ Relaxed
Definition: NVPTX.h:120
@ AcquireRelease
Definition: NVPTX.h:124
@ NotAtomic
Definition: NVPTX.h:117
@ Volatile
Definition: NVPTX.h:127
@ Release
Definition: NVPTX.h:123
@ SequentiallyConsistent
Definition: NVPTX.h:125
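A hedged sketch of how the NVPTX Ordering/Scope values and their ToString helpers above might be combined for a diagnostic; it assumes both enums and both functions live in the llvm::NVPTX namespace of the target-private NVPTX.h header, so it only builds inside the backend:
#include "NVPTX.h" // target-private header (assumption)
#include <string>
// Hypothetical helper composing a human-readable qualifier string.
static std::string describeMemOp(llvm::NVPTX::Ordering O, llvm::NVPTX::Scope S) {
  return llvm::NVPTX::OrderingToString(O) + "." + llvm::NVPTX::ScopeToString(S);
}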
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
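A short sketch of llvm::all_of applied to an SDNode's operand values (hypothetical check, not from this file):
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;
// Hypothetical predicate: every operand of N is a constant node.
static bool allOperandsConstant(const SDNode *N) {
  return all_of(N->op_values(), [](SDValue Op) {
    return isa<ConstantSDNode>(Op.getNode());
  });
}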
bool Isv2x16VT(EVT VT)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition: bit.h:307
FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOptLevel OptLevel)
createNVPTXISelDag - This pass converts a legalized DAG into a NVPTX-specific DAG,...
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition: MathExtras.h:285
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
auto formatv(bool Validate, const char *Fmt, Ts &&...Vals)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
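A minimal sketch of formatv feeding report_fatal_error, the usual pattern for unsupported-feature diagnostics in a backend (the message text and function name are illustrative):
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include <string>
[[noreturn]] static void failUnsupported(unsigned AddrSpace) {
  std::string Msg =
      llvm::formatv("unsupported address space {0}", AddrSpace).str();
  llvm::report_fatal_error(llvm::StringRef(Msg)); // never returns
}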
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit with the remainder zero (64 bit version).
Definition: MathExtras.h:273
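The bit utilities above (countr_zero, countr_one, isMask_64, isShiftedMask_64) are the usual ingredients for decoding a bit-field mask, e.g. when matching a bit-field extract. A hedged, self-contained sketch:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Hypothetical helper: for a contiguous run of ones such as 0x0000FF00,
// report its start bit (8) and length (8).
static bool decodeShiftedMask(uint64_t Mask, unsigned &Start, unsigned &Len) {
  if (!llvm::isShiftedMask_64(Mask))
    return false;                            // not a single run of ones
  Start = llvm::countr_zero(Mask);           // zeros below the run
  Len = llvm::countr_one(Mask >> Start);     // length of the run itself
  return true;
}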
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
AtomicOrdering
Atomic ordering for LLVM's memory model.
DWARFExpression::Operation Op
bool isKernelFunction(const Function &F)
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
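A small sketch of getUnderlyingObjects collecting into a SmallVector, here paired with all_of to test a property of every underlying object (the predicate is an illustrative assumption):
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
// Hypothetical check: does every underlying object of Ptr come from a
// function argument (looking through phi/select)?
static bool allObjectsAreArguments(const llvm::Value *Ptr) {
  llvm::SmallVector<const llvm::Value *, 4> Objs;
  llvm::getUnderlyingObjects(Ptr, Objs);
  return llvm::all_of(Objs, [](const llvm::Value *V) {
    return llvm::isa<llvm::Argument>(V);
  });
}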
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
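A tiny sketch combining the EVT queries above (hypothetical helper):
#include "llvm/CodeGen/ValueTypes.h"
// Number of lanes of a simple fixed vector type, e.g. 2 for v2f16.
static unsigned numLanes(llvm::EVT VT) {
  if (!VT.isSimple() || !VT.isVector())
    return 1;
  return VT.getVectorNumElements();
}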
NVPTXScopes()=default
NVPTX::Scope operator[](SyncScope::ID ID) const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.