1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTXUtilities.h"
18#include "llvm/IR/GlobalValue.h"
20#include "llvm/IR/IntrinsicsNVPTX.h"
27
28using namespace llvm;
29
30#define DEBUG_TYPE "nvptx-isel"
31#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
32
33static cl::opt<bool>
34 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
35 cl::desc("Enable reciprocal sqrt optimization"));
36
37/// createNVPTXISelDag - This pass converts a legalized DAG into a
38/// NVPTX-specific DAG, ready for instruction scheduling.
39FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
40 llvm::CodeGenOptLevel OptLevel) {
41 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
42}
43
44NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
45 CodeGenOptLevel OptLevel)
46 : SelectionDAGISelLegacy(
47 ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}
48
49char NVPTXDAGToDAGISelLegacy::ID = 0;
50
51INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
52
53NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
54 CodeGenOptLevel OptLevel)
55 : SelectionDAGISel(tm, OptLevel), TM(tm) {
56 doMulWide = (OptLevel > CodeGenOptLevel::None);
57}
58
59bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
60 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
61 Scopes = NVPTXScopes(MF.getFunction().getContext());
62 return SelectionDAGISel::runOnMachineFunction(MF);
63}
64
65int NVPTXDAGToDAGISel::getDivF32Level() const {
66 return Subtarget->getTargetLowering()->getDivF32Level();
67}
68
69bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
70 return Subtarget->getTargetLowering()->usePrecSqrtF32();
71}
72
73bool NVPTXDAGToDAGISel::useF32FTZ() const {
74 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
75}
76
77bool NVPTXDAGToDAGISel::allowFMA() const {
78 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
79 return TL->allowFMA(*MF, OptLevel);
80}
81
82bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
83 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
84 return TL->allowUnsafeFPMath(*MF);
85}
86
87bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
88
89/// Select - Select instructions not customized! Used for
90/// expanded, promoted and normal instructions.
91void NVPTXDAGToDAGISel::Select(SDNode *N) {
92
93 if (N->isMachineOpcode()) {
94 N->setNodeId(-1);
95 return; // Already selected.
96 }
97
98 switch (N->getOpcode()) {
99 case ISD::LOAD:
100 case ISD::ATOMIC_LOAD:
101 if (tryLoad(N))
102 return;
103 break;
104 case ISD::STORE:
105 case ISD::ATOMIC_STORE:
106 if (tryStore(N))
107 return;
108 break;
109 case ISD::ATOMIC_FENCE:
110 if (tryFence(N))
111 return;
112 break;
113 case ISD::EXTRACT_VECTOR_ELT:
114 if (tryEXTRACT_VECTOR_ELEMENT(N))
115 return;
116 break;
117 case NVPTXISD::SETP_F16X2:
118 SelectSETP_F16X2(N);
119 return;
120 case NVPTXISD::SETP_BF16X2:
121 SelectSETP_BF16X2(N);
122 return;
123 case NVPTXISD::LoadV2:
124 case NVPTXISD::LoadV4:
125 if (tryLoadVector(N))
126 return;
127 break;
128 case NVPTXISD::LDUV2:
129 case NVPTXISD::LDUV4:
130 if (tryLDGLDU(N))
131 return;
132 break;
133 case NVPTXISD::StoreV2:
134 case NVPTXISD::StoreV4:
135 if (tryStoreVector(N))
136 return;
137 break;
138 case NVPTXISD::LoadParam:
139 case NVPTXISD::LoadParamV2:
140 case NVPTXISD::LoadParamV4:
141 if (tryLoadParam(N))
142 return;
143 break;
144 case NVPTXISD::StoreRetval:
145 case NVPTXISD::StoreRetvalV2:
146 case NVPTXISD::StoreRetvalV4:
147 if (tryStoreRetval(N))
148 return;
149 break;
150 case NVPTXISD::StoreParam:
151 case NVPTXISD::StoreParamV2:
152 case NVPTXISD::StoreParamV4:
153 case NVPTXISD::StoreParamS32:
154 case NVPTXISD::StoreParamU32:
155 if (tryStoreParam(N))
156 return;
157 break;
158 case ISD::INTRINSIC_WO_CHAIN:
159 if (tryIntrinsicNoChain(N))
160 return;
161 break;
162 case ISD::INTRINSIC_W_CHAIN:
163 if (tryIntrinsicChain(N))
164 return;
165 break;
166 case ISD::INTRINSIC_VOID:
167 if (tryIntrinsicVoid(N))
168 return;
169 break;
170 case ISD::AND:
171 case ISD::SRA:
172 case ISD::SRL:
173 // Try to select BFE
174 if (tryBFE(N))
175 return;
176 break;
177 case ISD::ADDRSPACECAST:
178 SelectAddrSpaceCast(N);
179 return;
180 case ISD::CopyToReg: {
181 if (N->getOperand(1).getValueType() == MVT::i128) {
182 SelectV2I64toI128(N);
183 return;
184 }
185 break;
186 }
187 case ISD::CopyFromReg: {
188 if (N->getOperand(1).getValueType() == MVT::i128) {
189 SelectI128toV2I64(N);
190 return;
191 }
192 break;
193 }
194 default:
195 break;
196 }
197 SelectCode(N);
198}
199
200bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
201 unsigned IID = N->getConstantOperandVal(1);
202 switch (IID) {
203 default:
204 return false;
205 case Intrinsic::nvvm_ldu_global_f:
206 case Intrinsic::nvvm_ldu_global_i:
207 case Intrinsic::nvvm_ldu_global_p:
208 return tryLDGLDU(N);
209 }
210}
211
212// Map ISD:CONDCODE value to appropriate CmpMode expected by
213// NVPTXInstPrinter::printCmpMode()
214static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
215 using NVPTX::PTXCmpMode::CmpMode;
216 unsigned PTXCmpMode = [](ISD::CondCode CC) {
217 switch (CC) {
218 default:
219 llvm_unreachable("Unexpected condition code.");
220 case ISD::SETOEQ:
221 return CmpMode::EQ;
222 case ISD::SETOGT:
223 return CmpMode::GT;
224 case ISD::SETOGE:
225 return CmpMode::GE;
226 case ISD::SETOLT:
227 return CmpMode::LT;
228 case ISD::SETOLE:
229 return CmpMode::LE;
230 case ISD::SETONE:
231 return CmpMode::NE;
232 case ISD::SETO:
233 return CmpMode::NUM;
234 case ISD::SETUO:
235 return CmpMode::NotANumber;
236 case ISD::SETUEQ:
237 return CmpMode::EQU;
238 case ISD::SETUGT:
239 return CmpMode::GTU;
240 case ISD::SETUGE:
241 return CmpMode::GEU;
242 case ISD::SETULT:
243 return CmpMode::LTU;
244 case ISD::SETULE:
245 return CmpMode::LEU;
246 case ISD::SETUNE:
247 return CmpMode::NEU;
248 case ISD::SETEQ:
249 return CmpMode::EQ;
250 case ISD::SETGT:
251 return CmpMode::GT;
252 case ISD::SETGE:
253 return CmpMode::GE;
254 case ISD::SETLT:
255 return CmpMode::LT;
256 case ISD::SETLE:
257 return CmpMode::LE;
258 case ISD::SETNE:
259 return CmpMode::NE;
260 }
261 }(CondCode.get());
262
263 if (FTZ)
264 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
265
266 return PTXCmpMode;
267}
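// Illustrative note (assumes standard PTX setp syntax): an ordered compare
// such as ISD::SETOLT selects CmpMode::LT and an unordered ISD::SETULT
// selects CmpMode::LTU, which the instruction printer renders roughly as
// "setp.lt.f32" and "setp.ltu.f32"; when useF32FTZ() is true the FTZ_FLAG
// bit adds the ".ftz" qualifier, e.g. "setp.ltu.ftz.f32".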
268
269bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
270 unsigned PTXCmpMode =
271 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
272 SDLoc DL(N);
273 SDNode *SetP = CurDAG->getMachineNode(
274 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
275 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
276 ReplaceNode(N, SetP);
277 return true;
278}
279
280bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
281 unsigned PTXCmpMode =
282 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
283 SDLoc DL(N);
284 SDNode *SetP = CurDAG->getMachineNode(
285 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
286 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
287 ReplaceNode(N, SetP);
288 return true;
289}
290
291// Find all instances of extract_vector_elt that use this v2f16 vector
292// and coalesce them into a scattering move instruction.
293bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
294 SDValue Vector = N->getOperand(0);
295
296 // We only care about 16x2 as it's the only real vector type we
297 // need to deal with.
298 MVT VT = Vector.getSimpleValueType();
299 if (!Isv2x16VT(VT))
300 return false;
301 // Find and record all uses of this vector that extract element 0 or 1.
302 SmallVector<SDNode *, 4> E0, E1;
303 for (auto *U : Vector.getNode()->users()) {
304 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
305 continue;
306 if (U->getOperand(0) != Vector)
307 continue;
308 if (const ConstantSDNode *IdxConst =
309 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
310 if (IdxConst->getZExtValue() == 0)
311 E0.push_back(U);
312 else if (IdxConst->getZExtValue() == 1)
313 E1.push_back(U);
314 else
315 llvm_unreachable("Invalid vector index.");
316 }
317 }
318
319 // There's no point scattering f16x2 if we only ever access one
320 // element of it.
321 if (E0.empty() || E1.empty())
322 return false;
323
324 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
325 // into f16,f16 SplitF16x2(V)
326 MVT EltVT = VT.getVectorElementType();
327 SDNode *ScatterOp =
328 CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
329 for (auto *Node : E0)
330 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
331 for (auto *Node : E1)
332 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
333
334 return true;
335}
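// Illustrative DAG shape for the transformation above (node names and
// values are hypothetical):
//
//   t1: v2f16 = ...
//   t2: f16 = extract_vector_elt t1, 0
//   t3: f16 = extract_vector_elt t1, 1
//
// is rewritten so that both extracts read from one scattering node,
//
//   f16, f16 = I32toV2I16 t1
//
// with t2 replaced by result 0 and t3 by result 1.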
336
337static unsigned int getCodeAddrSpace(MemSDNode *N) {
338 const Value *Src = N->getMemOperand()->getValue();
339
340 if (!Src)
341 return NVPTX::AddressSpace::Generic;
342
343 if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
344 switch (PT->getAddressSpace()) {
345 case llvm::ADDRESS_SPACE_LOCAL:
346 return NVPTX::AddressSpace::Local;
347 case llvm::ADDRESS_SPACE_GLOBAL:
348 return NVPTX::AddressSpace::Global;
349 case llvm::ADDRESS_SPACE_SHARED:
350 return NVPTX::AddressSpace::Shared;
351 case llvm::ADDRESS_SPACE_GENERIC:
352 return NVPTX::AddressSpace::Generic;
353 case llvm::ADDRESS_SPACE_PARAM:
354 return NVPTX::AddressSpace::Param;
355 case llvm::ADDRESS_SPACE_CONST:
356 return NVPTX::AddressSpace::Const;
357 default: break;
358 }
359 }
360 return NVPTX::AddressSpace::Generic;
361}
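// Illustrative mapping (sketch based on the switch above): a pointer in IR
// addrspace(1) (ADDRESS_SPACE_GLOBAL) is classified as
// NVPTX::AddressSpace::Global and the access is later printed with the
// ".global" qualifier, addrspace(3) maps to Shared (".shared"), and a
// missing or unknown underlying value falls back to Generic.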
362
363namespace {
364
365struct OperationOrderings {
366 NVPTX::Ordering InstructionOrdering, FenceOrdering;
367 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
368 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
369 : InstructionOrdering(IO), FenceOrdering(FO) {}
370};
371
372static OperationOrderings
373getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
374 AtomicOrdering Ordering = N->getSuccessOrdering();
375 auto CodeAddrSpace = getCodeAddrSpace(N);
376
377 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
378 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
379
380 // clang-format off
381
382 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
383 // Note: uses of Relaxed in the Atomic column of this table refer
384 // to LLVM AtomicOrdering::Monotonic.
385 //
386 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
387 // |---------|----------|--------------------|------------|------------------------------|
388 // | No | No | All | plain | .weak |
389 // | No | Yes | Generic,Shared, | .volatile | .volatile |
390 // | | | Global [0] | | |
391 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
392 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
393 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
394 // | | | Global [0] | | |
395 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
396 // | | | Global [0] | | |
397 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
398 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
399 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
400 // | | | | | or .volatile (PTX 8.1-) |
401 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
402 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
403 // | | | / Global [0] | | |
404
405 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
406 // by following the ABI proven sound in:
407 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
408 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
409 //
410 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
411 // |------------------------------------------------------|-------------------------------|
412 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
413 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
414 // |------------------------------------------------------|-------------------------------|
415 // | cuda::atomic_load | fence.sc.<scope>; |
416 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
417 // |------------------------------------------------------|-------------------------------|
418 // | cuda::atomic_store | fence.sc.<scope>; |
419 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
420 // |------------------------------------------------------|-------------------------------|
421 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
422 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
423
424 // clang-format on
425
426 // [0]: volatile and atomics are only supported on global or shared
427 // memory locations, accessed via generic/shared/global pointers.
428 // MMIO is only supported on global memory locations,
429 // accessed via generic/global pointers.
430 // TODO: Implement MMIO access via generic pointer to global.
431 // Currently implemented for global pointers only.
432
433 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
434 // PTX instructions fails to preserve their C++ side-effects.
435 //
436 // Example (https://github.com/llvm/llvm-project/issues/62057):
437 //
438 // void example() {
439 // std::atomic<bool> True = true;
440 // while (True.load(std::memory_order_relaxed));
441 // }
442 //
443 // A C++ program that calls "example" is well-defined: the infinite loop
444 // performs an atomic operation. By lowering volatile/atomics to
445 // "weak" memory operations, we are transforming the above into:
446 //
447 // void undefined_behavior() {
448 // bool True = true;
449 // while (True);
450 // }
451 //
452 // which exhibits undefined behavior in both C++ and PTX.
453 //
454 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
455 // behavior due to lack of Independent Forward Progress. Lowering these
456 // to weak memory operations in sm_60- is therefore fine.
457 //
458 // TODO: lower atomic and volatile operations to memory locations
459 // in local, const, and param to two PTX instructions in sm_70+:
460 // - the "weak" memory instruction we are currently lowering to, and
461 // - some other instruction that preserves the side-effect, e.g.,
462 // a dead dummy volatile load.
463 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
464 CodeAddrSpace == NVPTX::AddressSpace::Const ||
465 CodeAddrSpace == NVPTX::AddressSpace::Param) {
466 return NVPTX::Ordering::NotAtomic;
467 }
468
469 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
470 // supported on sm_60 and older; this includes volatile atomics.
471 if (!(Ordering == AtomicOrdering::NotAtomic ||
472 Ordering == AtomicOrdering::Unordered ||
473 Ordering == AtomicOrdering::Monotonic) &&
474 !HasMemoryOrdering) {
475 report_fatal_error(
476 formatv("PTX does not support \"atomic\" for orderings different than"
477 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
478 "is: \"{}\".",
479 toIRString(Ordering)));
480 }
481
482 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
483 // the volatile semantics and preserve the atomic ones.
484
485 // PTX volatile and PTX atomics are not available for statespace that differ
486 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
487 // atomics is undefined if the generic address does not refer to a .global or
488 // .shared memory location.
489 bool AddrGenericOrGlobalOrShared =
490 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
491 CodeAddrSpace == NVPTX::AddressSpace::Global ||
492 CodeAddrSpace == NVPTX::AddressSpace::Shared);
493 if (!AddrGenericOrGlobalOrShared)
494 return NVPTX::Ordering::NotAtomic;
495
496 bool UseRelaxedMMIO =
497 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
498
499 switch (Ordering) {
500 case AtomicOrdering::NotAtomic:
501 return N->isVolatile() ? NVPTX::Ordering::Volatile
502 : NVPTX::Ordering::NotAtomic;
503 case AtomicOrdering::Unordered:
504 // We lower unordered in the exact same way as 'monotonic' to respect
505 // LLVM IR atomicity requirements.
506 case AtomicOrdering::Monotonic:
507 if (N->isVolatile())
508 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
509 : NVPTX::Ordering::Volatile;
510 else
511 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
512 : NVPTX::Ordering::Volatile;
513 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
514 // Acquire.
515 case AtomicOrdering::Acquire:
516 if (!N->readMem())
517 report_fatal_error(
518 formatv("PTX only supports Acquire Ordering on reads: {}",
519 N->getOperationName()));
520 return NVPTX::Ordering::Acquire;
521 case AtomicOrdering::Release:
522 if (!N->writeMem())
523 report_fatal_error(
524 formatv("PTX only supports Release Ordering on writes: {}",
525 N->getOperationName()));
526 return NVPTX::Ordering::Release;
527 case AtomicOrdering::AcquireRelease: {
528 report_fatal_error(
529 formatv("NVPTX does not support AcquireRelease Ordering on "
530 "read-modify-write "
531 "yet and PTX does not support it on loads or stores: {}",
532 N->getOperationName()));
533 }
534 case AtomicOrdering::SequentiallyConsistent: {
535 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
536 // sequence including a "fence.sc.sco" and the memory instruction with an
537 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
538 // whether the memory operation is a read, write, or read-modify-write.
539 //
540 // This sets the ordering of the fence to SequentiallyConsistent, and
541 // sets the corresponding ordering for the instruction.
542 NVPTX::Ordering InstrOrder;
543 if (N->readMem())
544 InstrOrder = NVPTX::Ordering::Acquire;
545 else if (N->writeMem())
546 InstrOrder = NVPTX::Ordering::Release;
547 else
548 report_fatal_error(
549 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
550 "read-modify-writes yet: {}",
551 N->getOperationName()));
552 return OperationOrderings(InstrOrder,
553 NVPTX::Ordering::SequentiallyConsistent);
554 }
555 }
556 report_fatal_error(
557 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
558 toIRString(Ordering)));
559}
560
561} // namespace
562
563NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
564 NVPTX::Ordering O) const {
565 switch (O) {
566 case NVPTX::Ordering::NotAtomic:
567 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
568 // NVPTX uses Thread scope as the scope of non-atomic operations.
569 return NVPTX::Scope::Thread;
570 case NVPTX::Ordering::RelaxedMMIO:
571 // RelaxedMMIO operations are always system scope.
572 // If a RelaxedMMIO order was generated from an atomic volatile operation
573 // with a smaller thread scope, we bump it here to system scope.
574 return NVPTX::Scope::System;
575 case NVPTX::Ordering::Relaxed:
576 case NVPTX::Ordering::Acquire:
577 case NVPTX::Ordering::Release:
578 case NVPTX::Ordering::AcquireRelease:
579 case NVPTX::Ordering::SequentiallyConsistent:
580 auto S = Scopes[N->getSyncScopeID()];
581
582 // Atomic operations must have a scope greater than thread.
583 if (S == NVPTX::Scope::Thread)
584 report_fatal_error(
585 formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
586
587 // If scope is cluster, clusters must be supported.
588 if (S == NVPTX::Scope::Cluster)
589 Subtarget->failIfClustersUnsupported("cluster scope");
590
591 // If operation is volatile, then its scope is system.
592 return N->isVolatile() ? NVPTX::Scope::System : S;
593 }
594 llvm_unreachable("unhandled ordering");
595}
596
597static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
598 unsigned CodeAddrSpace, MachineFunction *F) {
599 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
600 // space.
601 //
602 // We have two ways of identifying invariant loads: Loads may be explicitly
603 // marked as invariant, or we may infer them to be invariant.
604 //
605 // We currently infer invariance for loads from
606 // - constant global variables, and
607 // - kernel function pointer params that are noalias (i.e. __restrict) and
608 // never written to.
609 //
610 // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
611 // not during the SelectionDAG phase).
612 //
613 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
614 // explicitly invariant loads because these are how clang tells us to use ldg
615 // when the user uses a builtin.
616 if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
617 return false;
618
619 if (N->isInvariant())
620 return true;
621
622 bool IsKernelFn = isKernelFunction(F->getFunction());
623
624 // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
625 // because the former looks through phi nodes while the latter does not. We
626 // need to look through phi nodes to handle pointer induction variables.
627 SmallVector<const Value *, 8> Objs;
628 getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
629
630 return all_of(Objs, [&](const Value *V) {
631 if (auto *A = dyn_cast<const Argument>(V))
632 return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
633 if (auto *GV = dyn_cast<const GlobalVariable>(V))
634 return GV->isConstant();
635 return false;
636 });
637}
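// Illustrative CUDA-level example (kernel and parameter names are
// hypothetical):
//
//   __global__ void kernel(const float *__restrict__ in, float *out) {
//     out[threadIdx.x] = in[threadIdx.x];
//   }
//
// The read through "in" matches the inferred-invariance rules above (kernel
// argument that is noalias and never written), so the load can be emitted as
// ld.global.nc ("ldg") rather than a plain ld.global.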
638
639static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
640 NVPTXSubtarget const *T) {
641 if (S == NVPTX::Scope::Cluster)
642 T->failIfClustersUnsupported(".cluster scope fence");
643
644 switch (O) {
645 case NVPTX::Ordering::Acquire:
646 case NVPTX::Ordering::Release:
647 case NVPTX::Ordering::AcquireRelease: {
648 switch (S) {
649 case NVPTX::Scope::System:
650 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
651 : NVPTX::INT_MEMBAR_SYS;
652 case NVPTX::Scope::Block:
653 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
654 : NVPTX::INT_MEMBAR_CTA;
655 case NVPTX::Scope::Cluster:
656 return NVPTX::atomic_thread_fence_acq_rel_cluster;
657 case NVPTX::Scope::Device:
658 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
659 : NVPTX::INT_MEMBAR_GL;
660 case NVPTX::Scope::Thread:
661 report_fatal_error(
662 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
663 ScopeToString(S)));
664 }
665 break;
666 }
667 case NVPTX::Ordering::SequentiallyConsistent: {
668 switch (S) {
669 case NVPTX::Scope::System:
670 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
671 : NVPTX::INT_MEMBAR_SYS;
672 case NVPTX::Scope::Block:
673 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
674 : NVPTX::INT_MEMBAR_CTA;
675 case NVPTX::Scope::Cluster:
676 return NVPTX::atomic_thread_fence_seq_cst_cluster;
677 case NVPTX::Scope::Device:
678 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
679 : NVPTX::INT_MEMBAR_GL;
680 case NVPTX::Scope::Thread:
681 report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
682 ScopeToString(S)));
683 }
684 break;
685 }
686 case NVPTX::Ordering::NotAtomic:
687 case NVPTX::Ordering::Relaxed:
688 case NVPTX::Ordering::Volatile:
689 case NVPTX::Ordering::RelaxedMMIO:
690 report_fatal_error(
691 formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
692 OrderingToString(O), ScopeToString(S)));
693 }
694 llvm_unreachable("unhandled ordering");
695}
696
697// Returns Memory Order and Scope of a memory instruction, and
698// inserts any fence before the instruction that's required to
699// implement its memory ordering.
700std::pair<NVPTX::Ordering, NVPTX::Scope>
701NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
702 MemSDNode *N) {
703 auto [InstructionOrdering, FenceOrdering] =
704 getOperationOrderings(N, Subtarget);
705 auto Scope = getOperationScope(N, InstructionOrdering);
706
707 // If a fence is required before the operation, insert it:
708 switch (NVPTX::Ordering(FenceOrdering)) {
709 case NVPTX::Ordering::NotAtomic:
710 break;
711 case NVPTX::Ordering::SequentiallyConsistent: {
712 auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
713 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
714 break;
715 }
716 default:
717 report_fatal_error(
718 formatv("Unexpected fence ordering: \"{}\".",
719 OrderingToString(NVPTX::Ordering(FenceOrdering))));
720 }
721 return {InstructionOrdering, Scope};
722}
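// Worked example (illustrative): for an LLVM IR "load atomic seq_cst" from a
// generic pointer on sm_70+, getOperationOrderings() returns
// (Acquire, SequentiallyConsistent), so this function first emits the
// fence.sc machine node at the chosen scope and returns {Acquire, System}
// for a system-scope atomic; the caller folds those operands into the load,
// which prints roughly as "fence.sc.sys; ld.acquire.sys.u32 ...".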
723
724bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
725 unsigned IID = N->getConstantOperandVal(0);
726 switch (IID) {
727 default:
728 return false;
729 case Intrinsic::nvvm_texsurf_handle_internal:
730 SelectTexSurfHandle(N);
731 return true;
732 }
733}
734
735void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
736 // Op 0 is the intrinsic ID
737 SDValue Wrapper = N->getOperand(1);
738 SDValue GlobalVal = Wrapper.getOperand(0);
739 ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
740 MVT::i64, GlobalVal));
741}
742
743void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
744 SDValue Src = N->getOperand(0);
745 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
746 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
747 unsigned DstAddrSpace = CastN->getDestAddressSpace();
748 SDLoc DL(N);
749 assert(SrcAddrSpace != DstAddrSpace &&
750 "addrspacecast must be between different address spaces");
751
752 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
753 // Specific to generic
754
755 if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
756 SDValue CvtNone =
757 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
758 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
759 Src, CvtNone);
760 Src = SDValue(Cvt, 0);
761 }
762
763 unsigned Opc;
764 switch (SrcAddrSpace) {
765 default: report_fatal_error("Bad address space in addrspacecast");
766 case ADDRESS_SPACE_GLOBAL:
767 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
768 break;
769 case ADDRESS_SPACE_SHARED:
770 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
771 break;
772 case ADDRESS_SPACE_CONST:
773 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
774 break;
775 case ADDRESS_SPACE_LOCAL:
776 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
777 break;
778 }
779 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
780 return;
781 } else {
782 // Generic to specific
783 if (SrcAddrSpace != 0)
784 report_fatal_error("Cannot cast between two non-generic address spaces");
785 unsigned Opc;
786 switch (DstAddrSpace) {
787 default: report_fatal_error("Bad address space in addrspacecast");
788 case ADDRESS_SPACE_GLOBAL:
789 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
790 break;
791 case ADDRESS_SPACE_SHARED:
792 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
793 break;
794 case ADDRESS_SPACE_CONST:
795 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
796 break;
797 case ADDRESS_SPACE_LOCAL:
798 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
799 break;
800 case ADDRESS_SPACE_PARAM:
801 Opc = TM.is64Bit() ? NVPTX::IMOV64rr : NVPTX::IMOV32rr;
802 break;
803 }
804
805 SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
806 if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
807 SDValue CvtNone =
808 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
809 CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
810 SDValue(CVTA, 0), CvtNone);
811 }
812
813 ReplaceNode(N, CVTA);
814 return;
815 }
816}
817
818// Helper function template to reduce amount of boilerplate code for
819// opcode selection.
820static std::optional<unsigned>
821pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
822 unsigned Opcode_i16, unsigned Opcode_i32,
823 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
824 std::optional<unsigned> Opcode_f64) {
825 switch (VT) {
826 case MVT::i1:
827 case MVT::i8:
828 return Opcode_i8;
829 case MVT::i16:
830 return Opcode_i16;
831 case MVT::i32:
832 return Opcode_i32;
833 case MVT::i64:
834 return Opcode_i64;
835 case MVT::f16:
836 case MVT::bf16:
837 return Opcode_i16;
838 case MVT::v2f16:
839 case MVT::v2bf16:
840 case MVT::v2i16:
841 case MVT::v4i8:
842 return Opcode_i32;
843 case MVT::f32:
844 return Opcode_f32;
845 case MVT::f64:
846 return Opcode_f64;
847 default:
848 return std::nullopt;
849 }
850}
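// Usage note (illustrative; the LD_* names stand in for the real opcode
// enumerators used by the callers below): a call such as
//   pickOpcodeForVT(MVT::v2f16, LD_i8, LD_i16, LD_i32, LD_i64, LD_f32, LD_f64)
// yields the i32 opcode, because v2f16/v2bf16/v2i16/v4i8 are all moved
// through a single 32-bit register; the i64/f64 slots are optional so that
// v4 variants, which have no 64-bit element form, can pass std::nullopt.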
851
852static int getLdStRegType(EVT VT) {
853 if (VT.isFloatingPoint())
854 switch (VT.getSimpleVT().SimpleTy) {
855 case MVT::f16:
856 case MVT::bf16:
857 case MVT::v2f16:
858 case MVT::v2bf16:
859 return NVPTX::PTXLdStInstCode::Untyped;
860 default:
861 return NVPTX::PTXLdStInstCode::Float;
862 }
863 else
864 return NVPTX::PTXLdStInstCode::Unsigned;
865}
866
867bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
868 MemSDNode *LD = cast<MemSDNode>(N);
869 assert(LD->readMem() && "Expected load");
870
871 // do not support pre/post inc/dec
872 LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
873 if (PlainLoad && PlainLoad->isIndexed())
874 return false;
875
876 EVT LoadedVT = LD->getMemoryVT();
877 if (!LoadedVT.isSimple())
878 return false;
879
880 // Address Space Setting
881 unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
882 if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
883 return tryLDGLDU(N);
884 }
885 unsigned int PointerSize =
886 CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
887
888 SDLoc DL(N);
889 SDValue Chain = N->getOperand(0);
890 auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
891
892 // Type Setting: fromType + fromTypeWidth
893 //
894 // Sign : ISD::SEXTLOAD
895 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
896 // type is integer
897 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
898 MVT SimpleVT = LoadedVT.getSimpleVT();
899 MVT ScalarVT = SimpleVT.getScalarType();
900 // Read at least 8 bits (predicates are stored as 8-bit values)
901 unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
902 unsigned int FromType;
903
904 // Vector Setting
905 unsigned int VecType = NVPTX::PTXLdStInstCode::Scalar;
906 if (SimpleVT.isVector()) {
907 assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
908 "Unexpected vector type");
909 // v2f16/v2bf16/v2i16 is loaded using ld.b32
910 FromTypeWidth = 32;
911 }
912
913 if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
914 FromType = NVPTX::PTXLdStInstCode::Signed;
915 else
916 FromType = getLdStRegType(ScalarVT);
917
918 // Create the machine instruction DAG
919 SDValue N1 = N->getOperand(1);
920 SDValue Addr;
921 SDValue Offset, Base;
922 std::optional<unsigned> Opcode;
923 MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
924
925 SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
926 getI32Imm(CodeAddrSpace, DL),
927 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
928 getI32Imm(FromTypeWidth, DL)});
929
930 if (SelectDirectAddr(N1, Addr)) {
931 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
932 NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
933 NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
934 if (!Opcode)
935 return false;
936 Ops.append({Addr, Chain});
937 } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
938 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
939 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
940 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
941 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
942 if (!Opcode)
943 return false;
944 Ops.append({Base, Offset, Chain});
945 } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
946 : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
947 if (PointerSize == 64)
948 Opcode =
949 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
950 NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
951 NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
952 else
953 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
954 NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
955 NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
956 if (!Opcode)
957 return false;
958 Ops.append({Base, Offset, Chain});
959 } else {
960 if (PointerSize == 64)
961 Opcode =
962 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
963 NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
964 NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
965 else
966 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
967 NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
968 NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
969 if (!Opcode)
970 return false;
971 Ops.append({N1, Chain});
972 }
973
974 SDNode *NVPTXLD =
975 CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
976 if (!NVPTXLD)
977 return false;
978
979 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
980 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
981
982 ReplaceNode(N, NVPTXLD);
983 return true;
984}
985
986static bool isVectorElementTypeUpsized(EVT EltVT) {
987 // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
988 // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
989 // vectorized loads/stores with the actual element type for i8/i16 as that
990 // would require v8/v16 variants that do not exist.
991 // In order to load/store such vectors efficiently, in Type Legalization
992 // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
993 // lower to PTX as vectors of b32.
994 return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
995}
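// Illustrative example: a v8i16 load is split by type legalization into four
// v2i16 word-sized chunks, and each chunk is then treated as one b32 element,
// so the access is emitted as a single "ld.v4.b32" instead of an unsupported
// "ld.v8.u16".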
996
997bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
998 MemSDNode *MemSD = cast<MemSDNode>(N);
999 EVT LoadedVT = MemSD->getMemoryVT();
1000 if (!LoadedVT.isSimple())
1001 return false;
1002
1003 // Address Space Setting
1004 unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
1005 if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
1006 return tryLDGLDU(N);
1007 }
1008 unsigned int PointerSize =
1009 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1010
1011 SDLoc DL(N);
1012 SDValue Chain = N->getOperand(0);
1013 auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
1014
1015 // Vector Setting
1016 MVT SimpleVT = LoadedVT.getSimpleVT();
1017
1018 // Type Setting: fromType + fromTypeWidth
1019 //
1020 // Sign : ISD::SEXTLOAD
1021 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1022 // type is integer
1023 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1024 MVT ScalarVT = SimpleVT.getScalarType();
1025 // Read at least 8 bits (predicates are stored as 8-bit values)
1026 unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
1027 unsigned int FromType;
1028 // The last operand holds the original LoadSDNode::getExtensionType() value
1029 unsigned ExtensionType = cast<ConstantSDNode>(
1030 N->getOperand(N->getNumOperands() - 1))->getZExtValue();
1031 if (ExtensionType == ISD::SEXTLOAD)
1032 FromType = NVPTX::PTXLdStInstCode::Signed;
1033 else
1034 FromType = getLdStRegType(ScalarVT);
1035
1036 unsigned VecType;
1037
1038 switch (N->getOpcode()) {
1039 case NVPTXISD::LoadV2:
1040 VecType = NVPTX::PTXLdStInstCode::V2;
1041 break;
1042 case NVPTXISD::LoadV4:
1043 VecType = NVPTX::PTXLdStInstCode::V4;
1044 break;
1045 default:
1046 return false;
1047 }
1048
1049 EVT EltVT = N->getValueType(0);
1050
1051 if (isVectorElementTypeUpsized(EltVT)) {
1052 EltVT = MVT::i32;
1053 FromType = NVPTX::PTXLdStInstCode::Untyped;
1054 FromTypeWidth = 32;
1055 }
1056
1057 SDValue Op1 = N->getOperand(1);
1058 SDValue Addr, Offset, Base;
1059 std::optional<unsigned> Opcode;
1060 SDNode *LD;
1061
1062 SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1063 getI32Imm(CodeAddrSpace, DL),
1064 getI32Imm(VecType, DL), getI32Imm(FromType, DL),
1065 getI32Imm(FromTypeWidth, DL)});
1066
1067 if (SelectDirectAddr(Op1, Addr)) {
1068 switch (N->getOpcode()) {
1069 default:
1070 return false;
1071 case NVPTXISD::LoadV2:
1072 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1073 NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
1074 NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
1075 NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
1076 break;
1077 case NVPTXISD::LoadV4:
1078 Opcode =
1079 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
1080 NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
1081 std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
1082 break;
1083 }
1084 if (!Opcode)
1085 return false;
1086 Ops.append({Addr, Chain});
1087 } else if (PointerSize == 64
1088 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
1089 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
1090 switch (N->getOpcode()) {
1091 default:
1092 return false;
1093 case NVPTXISD::LoadV2:
1094 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1095 NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
1096 NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
1097 NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
1098 break;
1099 case NVPTXISD::LoadV4:
1100 Opcode =
1101 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
1102 NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
1103 std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
1104 break;
1105 }
1106 if (!Opcode)
1107 return false;
1108 Ops.append({Base, Offset, Chain});
1109 } else if (PointerSize == 64
1110 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1111 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1112 if (PointerSize == 64) {
1113 switch (N->getOpcode()) {
1114 default:
1115 return false;
1116 case NVPTXISD::LoadV2:
1117 Opcode =
1118 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1119 NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
1120 NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
1121 NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
1122 break;
1123 case NVPTXISD::LoadV4:
1124 Opcode = pickOpcodeForVT(
1125 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
1126 NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
1127 NVPTX::LDV_f32_v4_ari_64, std::nullopt);
1128 break;
1129 }
1130 } else {
1131 switch (N->getOpcode()) {
1132 default:
1133 return false;
1134 case NVPTXISD::LoadV2:
1135 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1136 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
1137 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
1138 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
1139 break;
1140 case NVPTXISD::LoadV4:
1141 Opcode =
1142 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
1143 NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
1144 std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
1145 break;
1146 }
1147 }
1148 if (!Opcode)
1149 return false;
1150 Ops.append({Base, Offset, Chain});
1151 } else {
1152 if (PointerSize == 64) {
1153 switch (N->getOpcode()) {
1154 default:
1155 return false;
1156 case NVPTXISD::LoadV2:
1157 Opcode = pickOpcodeForVT(
1158 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
1159 NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
1160 NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
1161 NVPTX::LDV_f64_v2_areg_64);
1162 break;
1163 case NVPTXISD::LoadV4:
1164 Opcode = pickOpcodeForVT(
1165 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
1166 NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
1167 NVPTX::LDV_f32_v4_areg_64, std::nullopt);
1168 break;
1169 }
1170 } else {
1171 switch (N->getOpcode()) {
1172 default:
1173 return false;
1174 case NVPTXISD::LoadV2:
1175 Opcode =
1176 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
1177 NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
1178 NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
1179 NVPTX::LDV_f64_v2_areg);
1180 break;
1181 case NVPTXISD::LoadV4:
1182 Opcode =
1183 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
1184 NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
1185 std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
1186 break;
1187 }
1188 }
1189 if (!Opcode)
1190 return false;
1191 Ops.append({Op1, Chain});
1192 }
1193 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
1194
1195 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1196 CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
1197
1198 ReplaceNode(N, LD);
1199 return true;
1200}
1201
1202bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
1203 auto *Mem = cast<MemSDNode>(N);
1204
1205 // If this is an LDG intrinsic, the address is the third operand. If its an
1206 // LDG/LDU SD node (from custom vector handling), then its the second operand
1207 SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1208
1209 EVT OrigType = N->getValueType(0);
1210 EVT EltVT = Mem->getMemoryVT();
1211 unsigned NumElts = 1;
1212 if (EltVT.isVector()) {
1213 NumElts = EltVT.getVectorNumElements();
1214 EltVT = EltVT.getVectorElementType();
1215 // vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16
1216 // elements.
1217 if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
1218 (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
1219 (EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
1220 (EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
1221 assert(NumElts % OrigType.getVectorNumElements() == 0 &&
1222 "NumElts must be divisible by the number of elts in subvectors");
1223 EltVT = OrigType;
1224 NumElts /= OrigType.getVectorNumElements();
1225 }
1226 }
1227
1228 // Build the "promoted" result VTList for the load. If we are really loading
1229 // i8s, then the return type will be promoted to i16 since we do not expose
1230 // 8-bit registers in NVPTX.
1231 EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
1232 SmallVector<EVT, 5> InstVTs;
1233 for (unsigned i = 0; i != NumElts; ++i) {
1234 InstVTs.push_back(NodeVT);
1235 }
1236 InstVTs.push_back(MVT::Other);
1237 SDVTList InstVTList = CurDAG->getVTList(InstVTs);
1238 SDValue Chain = N->getOperand(0);
1239
1240 std::optional<unsigned> Opcode;
1241 SDLoc DL(N);
1242 SDNode *LD;
1243 SDValue Base, Offset, Addr;
1244
1245 if (SelectDirectAddr(Op1, Addr)) {
1246 switch (N->getOpcode()) {
1247 default:
1248 return false;
1249 case ISD::LOAD:
1250 Opcode = pickOpcodeForVT(
1251 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1252 NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1253 NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1254 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1255 break;
1256 case ISD::INTRINSIC_W_CHAIN:
1257 Opcode = pickOpcodeForVT(
1258 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1259 NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1260 NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1261 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1262 break;
1263 case NVPTXISD::LoadV2:
1264 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1265 NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1266 NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1267 NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1268 NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1269 NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1270 NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1271 break;
1272 case NVPTXISD::LDUV2:
1273 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1274 NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1275 NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1276 NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1277 NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1278 NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1279 NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1280 break;
1281 case NVPTXISD::LoadV4:
1282 Opcode = pickOpcodeForVT(
1283 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1284 NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1285 NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1286 NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1287 break;
1288 case NVPTXISD::LDUV4:
1289 Opcode = pickOpcodeForVT(
1290 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1291 NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1292 NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1293 NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1294 break;
1295 }
1296 if (!Opcode)
1297 return false;
1298 SDValue Ops[] = { Addr, Chain };
1299 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1300 } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
1301 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
1302 if (TM.is64Bit()) {
1303 switch (N->getOpcode()) {
1304 default:
1305 return false;
1306 case ISD::LOAD:
1307 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1308 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
1309 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
1310 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
1311 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
1312 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
1313 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
1314 break;
1315 case ISD::INTRINSIC_W_CHAIN:
1316 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1317 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
1318 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
1319 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
1320 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
1321 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
1322 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
1323 break;
1324 case NVPTXISD::LoadV2:
1325 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1326 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
1327 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
1328 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
1329 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
1330 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
1331 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
1332 break;
1333 case NVPTXISD::LDUV2:
1334 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1335 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
1336 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
1337 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
1338 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
1339 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
1340 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
1341 break;
1342 case NVPTXISD::LoadV4:
1343 Opcode = pickOpcodeForVT(
1344 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
1345 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
1346 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
1347 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
1348 break;
1349 case NVPTXISD::LDUV4:
1350 Opcode = pickOpcodeForVT(
1351 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
1352 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
1353 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
1354 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
1355 break;
1356 }
1357 } else {
1358 switch (N->getOpcode()) {
1359 default:
1360 return false;
1361 case ISD::LOAD:
1362 Opcode = pickOpcodeForVT(
1363 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
1364 NVPTX::INT_PTX_LDG_GLOBAL_i16ari, NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
1365 NVPTX::INT_PTX_LDG_GLOBAL_i64ari, NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
1366 NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
1367 break;
1368 case ISD::INTRINSIC_W_CHAIN:
1369 Opcode = pickOpcodeForVT(
1370 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
1371 NVPTX::INT_PTX_LDU_GLOBAL_i16ari, NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
1372 NVPTX::INT_PTX_LDU_GLOBAL_i64ari, NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
1373 NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
1374 break;
1375 case NVPTXISD::LoadV2:
1376 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1377 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
1378 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
1379 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
1380 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
1381 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
1382 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
1383 break;
1384 case NVPTXISD::LDUV2:
1385 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1386 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
1387 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
1388 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
1389 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
1390 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
1391 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
1392 break;
1393 case NVPTXISD::LoadV4:
1394 Opcode = pickOpcodeForVT(
1395 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
1396 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
1397 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
1398 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
1399 break;
1400 case NVPTXISD::LDUV4:
1401 Opcode = pickOpcodeForVT(
1402 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
1403 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
1404 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
1405 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
1406 break;
1407 }
1408 }
1409 if (!Opcode)
1410 return false;
1411 SDValue Ops[] = {Base, Offset, Chain};
1412 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1413 } else {
1414 if (TM.is64Bit()) {
1415 switch (N->getOpcode()) {
1416 default:
1417 return false;
1418 case ISD::LOAD:
1419 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1420 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
1421 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
1422 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
1423 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
1424 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
1425 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
1426 break;
1427 case ISD::INTRINSIC_W_CHAIN:
1428 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1429 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
1430 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
1431 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
1432 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
1433 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
1434 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
1435 break;
1436 case NVPTXISD::LoadV2:
1437 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1438 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
1439 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
1440 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
1441 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
1442 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
1443 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
1444 break;
1445 case NVPTXISD::LDUV2:
1446 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1447 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
1448 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
1449 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
1450 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
1451 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
1452 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
1453 break;
1454 case NVPTXISD::LoadV4:
1455 Opcode = pickOpcodeForVT(
1456 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
1457 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
1458 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
1459 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
1460 break;
1461 case NVPTXISD::LDUV4:
1462 Opcode = pickOpcodeForVT(
1463 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
1464 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
1465 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
1466 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
1467 break;
1468 }
1469 } else {
1470 switch (N->getOpcode()) {
1471 default:
1472 return false;
1473 case ISD::LOAD:
1474 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1475 NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
1476 NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
1477 NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
1478 NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
1479 NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
1480 NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
1481 break;
1482 case ISD::INTRINSIC_W_CHAIN:
1483 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1484 NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
1485 NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
1486 NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
1487 NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
1488 NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
1489 NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
1490 break;
1491 case NVPTXISD::LoadV2:
1492 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1493 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
1494 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
1495 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
1496 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
1497 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
1498 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
1499 break;
1500 case NVPTXISD::LDUV2:
1501 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1502 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
1503 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
1504 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
1505 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
1506 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
1507 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
1508 break;
1509 case NVPTXISD::LoadV4:
1510 Opcode = pickOpcodeForVT(
1511 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
1512 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
1513 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
1514 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
1515 break;
1516 case NVPTXISD::LDUV4:
1517 Opcode = pickOpcodeForVT(
1518 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1519 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
1520 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
1521 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
1522 break;
1523 }
1524 }
1525 if (!Opcode)
1526 return false;
1527 SDValue Ops[] = { Op1, Chain };
1528 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
1529 }
1530
1531 // For automatic generation of LDG (through SelectLoad[Vector], not the
1532 // intrinsics), we may have an extending load like:
1533 //
1534 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1535 //
1536 // In this case, the matching logic above will select a load for the original
1537 // memory type (in this case, i8) and our types will not match (the node needs
1538 // to return an i32 in this case). Our LDG/LDU nodes do not support the
1539 // concept of sign-/zero-extension, so emulate it here by adding an explicit
1540 // CVT instruction. Ptxas should clean up any redundancies here.
1541
1542 LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
1543
1544 if (OrigType != EltVT &&
1545 (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
1546 // We have an extending-load. The instruction we selected operates on the
1547 // smaller type, but the SDNode we are replacing has the larger type. We
1548 // need to emit a CVT to make the types match.
1549 unsigned CvtOpc =
1550 GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
1551
1552 // For each output value, apply the manual sign/zero-extension and make sure
1553 // all users of the load go through that CVT.
1554 for (unsigned i = 0; i != NumElts; ++i) {
1555 SDValue Res(LD, i);
1556 SDValue OrigVal(N, i);
1557
1558 SDNode *CvtNode =
1559 CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
1560 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
1561 DL, MVT::i32));
1562 ReplaceUses(OrigVal, SDValue(CvtNode, 0));
1563 }
1564 }
1565
1566 ReplaceNode(N, LD);
1567 return true;
1568}
1569
1570bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1571 MemSDNode *ST = cast<MemSDNode>(N);
1572 assert(ST->writeMem() && "Expected store");
1573 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
1574 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
1575 assert((PlainStore || AtomicStore) && "Expected store");
1576
1577 // do not support pre/post inc/dec
1578 if (PlainStore && PlainStore->isIndexed())
1579 return false;
1580
1581 EVT StoreVT = ST->getMemoryVT();
1582 if (!StoreVT.isSimple())
1583 return false;
1584
1585 // Address Space Setting
1586 unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
1587 unsigned int PointerSize =
1588 CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
1589
1590 SDLoc DL(N);
1591 SDValue Chain = ST->getChain();
1592 auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1593
1594 // Vector Setting
1595 MVT SimpleVT = StoreVT.getSimpleVT();
1596 unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
1597
1598 // Type Setting: toType + toTypeWidth
1599 // - for integer type, always use 'u'
1600 MVT ScalarVT = SimpleVT.getScalarType();
1601 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1602 if (SimpleVT.isVector()) {
1603 assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
1604 "Unexpected vector type");
1605 // v2x16 is stored using st.b32
1606 ToTypeWidth = 32;
1607 }
1608
1609 unsigned int ToType = getLdStRegType(ScalarVT);
1610
1611 // Create the machine instruction DAG
1612 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1613 SDValue BasePtr = ST->getBasePtr();
1614 SDValue Addr;
1615 SDValue Offset, Base;
1616 std::optional<unsigned> Opcode;
1617 MVT::SimpleValueType SourceVT =
1618 Value.getNode()->getSimpleValueType(0).SimpleTy;
1619
1621 {Value, getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1622 getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
1623 getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
1624
1625 if (SelectDirectAddr(BasePtr, Addr)) {
1626 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
1627 NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
1628 NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
1629 if (!Opcode)
1630 return false;
1631 Ops.append({Addr, Chain});
1632 } else if (PointerSize == 64
1633 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
1634 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
1635 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
1636 NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
1637 NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
1638 if (!Opcode)
1639 return false;
1640 Ops.append({Base, Offset, Chain});
1641 } else if (PointerSize == 64
1642 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
1643 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
1644 if (PointerSize == 64)
1645 Opcode =
1646 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
1647 NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
1648 NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
1649 else
1650 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
1651 NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
1652 NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
1653 if (!Opcode)
1654 return false;
1655 Ops.append({Base, Offset, Chain});
1656 } else {
1657 if (PointerSize == 64)
1658 Opcode =
1659 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
1660 NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
1661 NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
1662 else
1663 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
1664 NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
1665 NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
1666 if (!Opcode)
1667 return false;
1668 Ops.append({BasePtr, Chain});
1669 }
1670
1671 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1672
1673 if (!NVPTXST)
1674 return false;
1675
1676 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1677 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1678 ReplaceNode(N, NVPTXST);
1679 return true;
1680}
1681
1682bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1683 SDValue Op1 = N->getOperand(1);
1684 SDValue Addr, Offset, Base;
1685 std::optional<unsigned> Opcode;
1686 SDNode *ST;
1687 EVT EltVT = Op1.getValueType();
1688 MemSDNode *MemSD = cast<MemSDNode>(N);
1689 EVT StoreVT = MemSD->getMemoryVT();
1690
1691 // Address Space Setting
1692 unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1693 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1694 report_fatal_error("Cannot store to pointer that points to constant "
1695 "memory space");
1696 }
1697 unsigned int PointerSize =
1698 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1699
1700 SDLoc DL(N);
1701 SDValue Chain = N->getOperand(0);
1702 auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
1703
1704 // Type Setting: toType + toTypeWidth
1705 // - for integer type, always use 'u'
1706 assert(StoreVT.isSimple() && "Store value is not simple");
1707 MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1708 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1709 unsigned ToType = getLdStRegType(ScalarVT);
1710
1711 SmallVector<SDValue, 12> Ops;
1712 SDValue N2;
1713 unsigned VecType;
1714
1715 switch (N->getOpcode()) {
1716 case NVPTXISD::StoreV2:
1717 VecType = NVPTX::PTXLdStInstCode::V2;
1718 Ops.append({N->getOperand(1), N->getOperand(2)});
1719 N2 = N->getOperand(3);
1720 break;
1721 case NVPTXISD::StoreV4:
1722 VecType = NVPTX::PTXLdStInstCode::V4;
1723 Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3),
1724 N->getOperand(4)});
1725 N2 = N->getOperand(5);
1726 break;
1727 default:
1728 return false;
1729 }
1730
1731 if (isVectorElementTypeUpsized(EltVT)) {
1732 EltVT = MVT::i32;
1733 ToType = NVPTX::PTXLdStInstCode::Untyped;
1734 ToTypeWidth = 32;
1735 }
1736
1737 Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1738 getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
1739 getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
1740
1741 if (SelectDirectAddr(N2, Addr)) {
1742 switch (N->getOpcode()) {
1743 default:
1744 return false;
1745 case NVPTXISD::StoreV2:
1746 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1747 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1748 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1749 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1750 break;
1751 case NVPTXISD::StoreV4:
1752 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1753 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1754 NVPTX::STV_i32_v4_avar, std::nullopt,
1755 NVPTX::STV_f32_v4_avar, std::nullopt);
1756 break;
1757 }
1758 Ops.push_back(Addr);
1759 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1760 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1761 switch (N->getOpcode()) {
1762 default:
1763 return false;
1764 case NVPTXISD::StoreV2:
1765 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1766 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1767 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1768 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1769 break;
1770 case NVPTXISD::StoreV4:
1771 Opcode =
1772 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1773 NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
1774 std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
1775 break;
1776 }
1777 Ops.append({Base, Offset});
1778 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1779 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1780 if (PointerSize == 64) {
1781 switch (N->getOpcode()) {
1782 default:
1783 return false;
1784 case NVPTXISD::StoreV2:
1785 Opcode =
1786 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1787 NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
1788 NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
1789 NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
1790 break;
1791 case NVPTXISD::StoreV4:
1792 Opcode = pickOpcodeForVT(
1793 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
1794 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
1795 NVPTX::STV_f32_v4_ari_64, std::nullopt);
1796 break;
1797 }
1798 } else {
1799 switch (N->getOpcode()) {
1800 default:
1801 return false;
1802 case NVPTXISD::StoreV2:
1803 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1804 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
1805 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
1806 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
1807 break;
1808 case NVPTXISD::StoreV4:
1809 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1810 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
1811 NVPTX::STV_i32_v4_ari, std::nullopt,
1812 NVPTX::STV_f32_v4_ari, std::nullopt);
1813 break;
1814 }
1815 }
1816 Ops.append({Base, Offset});
1817 } else {
1818 if (PointerSize == 64) {
1819 switch (N->getOpcode()) {
1820 default:
1821 return false;
1822 case NVPTXISD::StoreV2:
1823 Opcode = pickOpcodeForVT(
1824 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
1825 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
1826 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
1827 NVPTX::STV_f64_v2_areg_64);
1828 break;
1829 case NVPTXISD::StoreV4:
1830 Opcode = pickOpcodeForVT(
1831 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
1832 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
1833 NVPTX::STV_f32_v4_areg_64, std::nullopt);
1834 break;
1835 }
1836 } else {
1837 switch (N->getOpcode()) {
1838 default:
1839 return false;
1840 case NVPTXISD::StoreV2:
1841 Opcode =
1842 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
1843 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
1844 NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
1845 NVPTX::STV_f64_v2_areg);
1846 break;
1847 case NVPTXISD::StoreV4:
1848 Opcode =
1849 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
1850 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
1851 std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
1852 break;
1853 }
1854 }
1855 Ops.push_back(N2);
1856 }
1857
1858 if (!Opcode)
1859 return false;
1860
1861 Ops.push_back(Chain);
1862
1863 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1864
1865 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1866 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
1867
1868 ReplaceNode(N, ST);
1869 return true;
1870}
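// Illustrative sketch (not exhaustive): a StoreV2 of two f32 values through a
// plain 64-bit pointer register would be selected above as
// NVPTX::STV_f32_v2_areg_64 with operands roughly of the form
//   { val0, val1, ordering, scope, addrspace, vectype, totype, width,
//     addr-reg, chain }
// before the memory operand of the original node is transferred to it.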
1871
1872bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
1873 SDValue Chain = Node->getOperand(0);
1874 SDValue Offset = Node->getOperand(2);
1875 SDValue Glue = Node->getOperand(3);
1876 SDLoc DL(Node);
1877 MemSDNode *Mem = cast<MemSDNode>(Node);
1878
1879 unsigned VecSize;
1880 switch (Node->getOpcode()) {
1881 default:
1882 return false;
1883 case NVPTXISD::LoadParam:
1884 VecSize = 1;
1885 break;
1886 case NVPTXISD::LoadParamV2:
1887 VecSize = 2;
1888 break;
1889 case NVPTXISD::LoadParamV4:
1890 VecSize = 4;
1891 break;
1892 }
1893
1894 EVT EltVT = Node->getValueType(0);
1895 EVT MemVT = Mem->getMemoryVT();
1896
1897 std::optional<unsigned> Opcode;
1898
1899 switch (VecSize) {
1900 default:
1901 return false;
1902 case 1:
1903 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
1904 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
1905 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
1906 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
1907 break;
1908 case 2:
1909 Opcode =
1910 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
1911 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
1912 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
1913 NVPTX::LoadParamMemV2F64);
1914 break;
1915 case 4:
1916 Opcode =
1917 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
1918 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
1919 std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
1920 break;
1921 }
1922 if (!Opcode)
1923 return false;
1924
1925 SDVTList VTs;
1926 if (VecSize == 1) {
1927 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
1928 } else if (VecSize == 2) {
1929 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
1930 } else {
1931 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
1932 VTs = CurDAG->getVTList(EVTs);
1933 }
1934
1935 unsigned OffsetVal = Offset->getAsZExtVal();
1936
1937 SmallVector<SDValue, 2> Ops(
1938 {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
1939
1940 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
1941 return true;
1942}
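// For example, a LoadParamV2 producing two i32 results is selected as
// NVPTX::LoadParamMemV2I32 with result types { i32, i32, Other, Glue } and
// operands { offset, chain, glue }.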
1943
1944bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
1945 SDLoc DL(N);
1946 SDValue Chain = N->getOperand(0);
1947 SDValue Offset = N->getOperand(1);
1948 unsigned OffsetVal = Offset->getAsZExtVal();
1949 MemSDNode *Mem = cast<MemSDNode>(N);
1950
1951 // How many elements do we have?
1952 unsigned NumElts = 1;
1953 switch (N->getOpcode()) {
1954 default:
1955 return false;
1956 case NVPTXISD::StoreRetval:
1957 NumElts = 1;
1958 break;
1959 case NVPTXISD::StoreRetvalV2:
1960 NumElts = 2;
1961 break;
1962 case NVPTXISD::StoreRetvalV4:
1963 NumElts = 4;
1964 break;
1965 }
1966
1967 // Build vector of operands
1968 SmallVector<SDValue, 6> Ops;
1969 for (unsigned i = 0; i < NumElts; ++i)
1970 Ops.push_back(N->getOperand(i + 2));
1971 Ops.append({CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain});
1972
1973 // Determine target opcode
1974 // If we have an i1, use an 8-bit store. The lowering code in
1975 // NVPTXISelLowering will have already emitted an upcast.
1976 std::optional<unsigned> Opcode = 0;
1977 switch (NumElts) {
1978 default:
1979 return false;
1980 case 1:
1981 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
1982 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
1983 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
1984 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
1985 if (Opcode == NVPTX::StoreRetvalI8) {
1986 // Fine tune the opcode depending on the size of the operand.
1987 // This helps to avoid creating redundant COPY instructions in
1988 // InstrEmitter::AddRegisterOperand().
1989 switch (Ops[0].getSimpleValueType().SimpleTy) {
1990 default:
1991 break;
1992 case MVT::i32:
1993 Opcode = NVPTX::StoreRetvalI8TruncI32;
1994 break;
1995 case MVT::i64:
1996 Opcode = NVPTX::StoreRetvalI8TruncI64;
1997 break;
1998 }
1999 }
2000 break;
2001 case 2:
2002 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2003 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2004 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2005 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2006 break;
2007 case 4:
2008 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2009 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2010 NVPTX::StoreRetvalV4I32, std::nullopt,
2011 NVPTX::StoreRetvalV4F32, std::nullopt);
2012 break;
2013 }
2014 if (!Opcode)
2015 return false;
2016
2017 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2018 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2019 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2020
2021 ReplaceNode(N, Ret);
2022 return true;
2023}
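// For example, an i8 return value that is currently held in an i32 register
// first picks NVPTX::StoreRetvalI8 and is then refined to
// NVPTX::StoreRetvalI8TruncI32, so no extra COPY of the i32 source is needed.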
2024
2025// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
2026#define getOpcV2H(ty, opKind0, opKind1) \
2027 NVPTX::StoreParamV2##ty##_##opKind0##opKind1
2028
2029#define getOpcV2H1(ty, opKind0, isImm1) \
2030 (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
2031
2032#define getOpcodeForVectorStParamV2(ty, isimm) \
2033 (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
2034
2035#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
2036 NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
2037
2038#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
2039 (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
2040 : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
2041
2042#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
2043 (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
2044 : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
2045
2046#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
2047 (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
2048 : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
2049
2050#define getOpcodeForVectorStParamV4(ty, isimm) \
2051 (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
2052 : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
2053
2054#define getOpcodeForVectorStParam(n, ty, isimm) \
2055 (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
2056 : getOpcodeForVectorStParamV4(ty, isimm)
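// For example (mechanical expansion of the macros above): with NumElts == 2,
// ty == F32 and isimm == {true, false} the result is
// NVPTX::StoreParamV2F32_ir, and with NumElts == 4, ty == I32 and
// isimm == {false, false, true, true} it is NVPTX::StoreParamV4I32_rrii.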
2057
2058 static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
2059 unsigned NumElts,
2060 MVT::SimpleValueType MemTy,
2061 SelectionDAG *CurDAG, SDLoc DL) {
2062 // Determine which inputs are registers and which are immediates; for the
2063 // immediate inputs, build new operands holding the constant values.
2064 SmallVector<bool, 4> IsImm(NumElts, false);
2065 for (unsigned i = 0; i < NumElts; i++) {
2066 IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
2067 if (IsImm[i]) {
2068 SDValue Imm = Ops[i];
2069 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2070 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2071 const ConstantFP *CF = ConstImm->getConstantFPValue();
2072 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2073 } else {
2074 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2075 const ConstantInt *CI = ConstImm->getConstantIntValue();
2076 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2077 }
2078 Ops[i] = Imm;
2079 }
2080 }
2081
2082 // Get opcode for MemTy, size, and register/immediate operand ordering
2083 switch (MemTy) {
2084 case MVT::i8:
2085 return getOpcodeForVectorStParam(NumElts, I8, IsImm);
2086 case MVT::i16:
2087 return getOpcodeForVectorStParam(NumElts, I16, IsImm);
2088 case MVT::i32:
2089 return getOpcodeForVectorStParam(NumElts, I32, IsImm);
2090 case MVT::i64:
2091 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2092 return getOpcodeForVectorStParamV2(I64, IsImm);
2093 case MVT::f32:
2094 return getOpcodeForVectorStParam(NumElts, F32, IsImm);
2095 case MVT::f64:
2096 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2097 return getOpcodeForVectorStParamV2(F64, IsImm);
2098
2099 // These cases don't support immediates, just use the all register version
2100 // and generate moves.
2101 case MVT::i1:
2102 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
2103 : NVPTX::StoreParamV4I8_rrrr;
2104 case MVT::f16:
2105 case MVT::bf16:
2106 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
2107 : NVPTX::StoreParamV4I16_rrrr;
2108 case MVT::v2f16:
2109 case MVT::v2bf16:
2110 case MVT::v2i16:
2111 case MVT::v4i8:
2112 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
2113 : NVPTX::StoreParamV4I32_rrrr;
2114 default:
2115 llvm_unreachable("Cannot select st.param for unknown MemTy");
2116 }
2117}
2118
2119bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2120 SDLoc DL(N);
2121 SDValue Chain = N->getOperand(0);
2122 SDValue Param = N->getOperand(1);
2123 unsigned ParamVal = Param->getAsZExtVal();
2124 SDValue Offset = N->getOperand(2);
2125 unsigned OffsetVal = Offset->getAsZExtVal();
2126 MemSDNode *Mem = cast<MemSDNode>(N);
2127 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2128
2129 // How many elements do we have?
2130 unsigned NumElts;
2131 switch (N->getOpcode()) {
2132 default:
2133 llvm_unreachable("Unexpected opcode");
2134 case NVPTXISD::StoreParam:
2135 case NVPTXISD::StoreParamS32:
2136 case NVPTXISD::StoreParamU32:
2137 NumElts = 1;
2138 break;
2139 case NVPTXISD::StoreParamV2:
2140 NumElts = 2;
2141 break;
2142 case NVPTXISD::StoreParamV4:
2143 NumElts = 4;
2144 break;
2145 }
2146
2147 // Build vector of operands
2148 SmallVector<SDValue, 8> Ops;
2149 for (unsigned i = 0; i < NumElts; ++i)
2150 Ops.push_back(N->getOperand(i + 3));
2151 Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32),
2152 CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue});
2153
2154 // Determine target opcode
2155 // If we have an i1, use an 8-bit store. The lowering code in
2156 // NVPTXISelLowering will have already emitted an upcast.
2157 std::optional<unsigned> Opcode;
2158 switch (N->getOpcode()) {
2159 default:
2160 switch (NumElts) {
2161 default:
2162 llvm_unreachable("Unexpected NumElts");
2163 case 1: {
2164 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2165 SDValue Imm = Ops[0];
2166 if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
2167 (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
2168 // Convert immediate to target constant
2169 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2170 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2171 const ConstantFP *CF = ConstImm->getConstantFPValue();
2172 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2173 } else {
2174 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2175 const ConstantInt *CI = ConstImm->getConstantIntValue();
2176 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2177 }
2178 Ops[0] = Imm;
2179 // Use immediate version of store param
2180 Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
2181 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
2182 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
2183 NVPTX::StoreParamF64_i);
2184 } else
2185 Opcode =
2186 pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2187 NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
2188 NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
2189 NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
2190 if (Opcode == NVPTX::StoreParamI8_r) {
2191 // Fine tune the opcode depending on the size of the operand.
2192 // This helps to avoid creating redundant COPY instructions in
2193 // InstrEmitter::AddRegisterOperand().
2194 switch (Ops[0].getSimpleValueType().SimpleTy) {
2195 default:
2196 break;
2197 case MVT::i32:
2198 Opcode = NVPTX::StoreParamI8TruncI32_r;
2199 break;
2200 case MVT::i64:
2201 Opcode = NVPTX::StoreParamI8TruncI64_r;
2202 break;
2203 }
2204 }
2205 break;
2206 }
2207 case 2:
2208 case 4: {
2209 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2210 Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
2211 break;
2212 }
2213 }
2214 break;
2215 // Special case: if we have a sign-extend/zero-extend node, insert the
2216 // conversion instruction first, and use that as the value operand to
2217 // the selected StoreParam node.
2218 case NVPTXISD::StoreParamU32: {
2219 Opcode = NVPTX::StoreParamI32_r;
2220 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2221 MVT::i32);
2222 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2223 MVT::i32, Ops[0], CvtNone);
2224 Ops[0] = SDValue(Cvt, 0);
2225 break;
2226 }
2227 case NVPTXISD::StoreParamS32: {
2228 Opcode = NVPTX::StoreParamI32_r;
2229 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2230 MVT::i32);
2231 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2232 MVT::i32, Ops[0], CvtNone);
2233 Ops[0] = SDValue(Cvt, 0);
2234 break;
2235 }
2236 }
2237
2238 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2239 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2240 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2241 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2242
2243 ReplaceNode(N, Ret);
2244 return true;
2245}
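// For example, a StoreParamU32 whose value operand is i16 is selected as a
// CVT_u32_u16 (zero-extend) feeding NVPTX::StoreParamI32_r, while the
// StoreParamS32 form uses the sign-extending CVT_s32_s16 instead.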
2246
2247/// SelectBFE - Look for instruction sequences that can be made more efficient
2248/// by using the 'bfe' (bit-field extract) PTX instruction
2249bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
2250 SDLoc DL(N);
2251 SDValue LHS = N->getOperand(0);
2252 SDValue RHS = N->getOperand(1);
2253 SDValue Len;
2254 SDValue Start;
2255 SDValue Val;
2256 bool IsSigned = false;
2257
2258 if (N->getOpcode() == ISD::AND) {
2259 // Canonicalize the operands
2260 // We want 'and %val, %mask'
2261 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
2262 std::swap(LHS, RHS);
2263 }
2264
2265 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
2266 if (!Mask) {
2267 // We need a constant mask on the RHS of the AND
2268 return false;
2269 }
2270
2271 // Extract the mask bits
2272 uint64_t MaskVal = Mask->getZExtValue();
2273 if (!isMask_64(MaskVal)) {
2274 // We *could* handle shifted masks here, but doing so would require an
2275 // 'and' operation to fix up the low-order bits so we would trade
2276 // shr+and for bfe+and, which has the same throughput
2277 return false;
2278 }
2279
2280 // How many bits are in our mask?
2281 int64_t NumBits = countr_one(MaskVal);
2282 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
2283
2284 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
2285 // We have a 'srl/and' pair, extract the effective start bit and length
2286 Val = LHS.getNode()->getOperand(0);
2287 Start = LHS.getNode()->getOperand(1);
2288 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
2289 if (StartConst) {
2290 uint64_t StartVal = StartConst->getZExtValue();
2291 // How many "good" bits do we have left? "good" is defined here as bits
2292 // that exist in the original value, not shifted in.
2293 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
2294 if (NumBits > GoodBits) {
2295 // Do not handle the case where bits have been shifted in. In theory
2296 // we could handle this, but the cost is likely higher than just
2297 // emitting the srl/and pair.
2298 return false;
2299 }
2300 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
2301 } else {
2302 // Do not handle the case where the shift amount (can be zero if no srl
2303 // was found) is not constant. We could handle this case, but it would
2304 // require run-time logic that would be more expensive than just
2305 // emitting the srl/and pair.
2306 return false;
2307 }
2308 } else {
2309 // Do not handle the case where the LHS of the and is not a shift. While
2310 // it would be trivial to handle this case, it would just transform
2311 // 'and' -> 'bfe', but 'and' has higher throughput.
2312 return false;
2313 }
2314 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
2315 if (LHS->getOpcode() == ISD::AND) {
2316 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
2317 if (!ShiftCnst) {
2318 // Shift amount must be constant
2319 return false;
2320 }
2321
2322 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
2323
2324 SDValue AndLHS = LHS->getOperand(0);
2325 SDValue AndRHS = LHS->getOperand(1);
2326
2327 // Canonicalize the AND to have the mask on the RHS
2328 if (isa<ConstantSDNode>(AndLHS)) {
2329 std::swap(AndLHS, AndRHS);
2330 }
2331
2332 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
2333 if (!MaskCnst) {
2334 // Mask must be constant
2335 return false;
2336 }
2337
2338 uint64_t MaskVal = MaskCnst->getZExtValue();
2339 uint64_t NumZeros;
2340 uint64_t NumBits;
2341 if (isMask_64(MaskVal)) {
2342 NumZeros = 0;
2343 // The number of bits in the result bitfield will be the number of
2344 // trailing ones (the AND) minus the number of bits we shift off
2345 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
2346 } else if (isShiftedMask_64(MaskVal)) {
2347 NumZeros = llvm::countr_zero(MaskVal);
2348 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
2349 // The number of bits in the result bitfield will be the number of
2350 // trailing zeros plus the number of set bits in the mask minus the
2351 // number of bits we shift off
2352 NumBits = NumZeros + NumOnes - ShiftAmt;
2353 } else {
2354 // This is not a mask we can handle
2355 return false;
2356 }
2357
2358 if (ShiftAmt < NumZeros) {
2359 // Handling this case would require extra logic that would make this
2360 // transformation non-profitable
2361 return false;
2362 }
2363
2364 Val = AndLHS;
2365 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
2366 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
2367 } else if (LHS->getOpcode() == ISD::SHL) {
2368 // Here, we have a pattern like:
2369 //
2370 // (sra (shl val, NN), MM)
2371 // or
2372 // (srl (shl val, NN), MM)
2373 //
2374 // If MM >= NN, we can efficiently optimize this with bfe
2375 Val = LHS->getOperand(0);
2376
2377 SDValue ShlRHS = LHS->getOperand(1);
2378 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
2379 if (!ShlCnst) {
2380 // Shift amount must be constant
2381 return false;
2382 }
2383 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
2384
2385 SDValue ShrRHS = RHS;
2386 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
2387 if (!ShrCnst) {
2388 // Shift amount must be constant
2389 return false;
2390 }
2391 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
2392
2393 // To avoid extra codegen and be profitable, we need Outer >= Inner
2394 if (OuterShiftAmt < InnerShiftAmt) {
2395 return false;
2396 }
2397
2398 // If the outer shift is more than the type size, we have no bitfield to
2399 // extract (since we also check that the inner shift is <= the outer shift
2400 // then this also implies that the inner shift is < the type size)
2401 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
2402 return false;
2403 }
2404
2405 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
2406 MVT::i32);
2407 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
2408 DL, MVT::i32);
2409
2410 if (N->getOpcode() == ISD::SRA) {
2411 // If we have an arithmetic right shift, we need to use the signed bfe
2412 // variant
2413 IsSigned = true;
2414 }
2415 } else {
2416 // No can do...
2417 return false;
2418 }
2419 } else {
2420 // No can do...
2421 return false;
2422 }
2423
2424
2425 unsigned Opc;
2426 // For the BFE operations we form here from "and" and "srl", always use the
2427 // unsigned variants.
2428 if (Val.getValueType() == MVT::i32) {
2429 if (IsSigned) {
2430 Opc = NVPTX::BFE_S32rii;
2431 } else {
2432 Opc = NVPTX::BFE_U32rii;
2433 }
2434 } else if (Val.getValueType() == MVT::i64) {
2435 if (IsSigned) {
2436 Opc = NVPTX::BFE_S64rii;
2437 } else {
2438 Opc = NVPTX::BFE_U64rii;
2439 }
2440 } else {
2441 // We cannot handle this type
2442 return false;
2443 }
2444
2445 SDValue Ops[] = {
2446 Val, Start, Len
2447 };
2448
2449 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
2450 return true;
2451}
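// Worked examples of the i32 patterns handled above:
//   (and (srl %x, 8), 255)  -> BFE_U32rii %x, 8, 8  (start = 8, len = 8)
//   (sra (shl %x, 24), 28)  -> BFE_S32rii %x, 4, 4  (start = 28 - 24,
//                                                     len = 32 - 28)
// which correspond roughly to PTX bfe.u32 / bfe.s32.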
2452
2453static inline bool isAddLike(const SDValue V) {
2454 return V.getOpcode() == ISD::ADD ||
2455 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
2456}
2457
2458// SelectDirectAddr - Match a direct address for DAG.
2459// A direct address could be a globaladdress or externalsymbol.
2460bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
2461 // Return true if TGA or ES.
2462 if (N.getOpcode() == ISD::TargetGlobalAddress ||
2463 N.getOpcode() == ISD::TargetExternalSymbol) {
2464 Address = N;
2465 return true;
2466 }
2467 if (N.getOpcode() == NVPTXISD::Wrapper) {
2468 Address = N.getOperand(0);
2469 return true;
2470 }
2471 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
2472 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
2473 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
2474 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
2475 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
2476 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
2477 }
2478 return false;
2479}
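// For example, a bare TargetGlobalAddress, a NVPTXISD::Wrapper around one, or
// the param-space addrspacecast of a MoveParam'd argument symbol all match as
// direct addresses; an address computed with arithmetic does not.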
2480
2481// symbol+offset
2482bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
2483 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
2484 if (isAddLike(Addr)) {
2485 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
2486 SDValue base = Addr.getOperand(0);
2487 if (SelectDirectAddr(base, Base)) {
2488 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
2489 mvt);
2490 return true;
2491 }
2492 }
2493 }
2494 return false;
2495}
2496
2497// symbol+offset
2498bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
2499 SDValue &Base, SDValue &Offset) {
2500 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
2501}
2502
2503// symbol+offset
2504bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
2505 SDValue &Base, SDValue &Offset) {
2506 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
2507}
2508
2509// register+offset
2510bool NVPTXDAGToDAGISel::SelectADDRri_imp(
2511 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
2512 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
2513 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
2514 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
2515 return true;
2516 }
2517 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
2518 Addr.getOpcode() == ISD::TargetGlobalAddress)
2519 return false; // direct calls.
2520
2521 if (isAddLike(Addr)) {
2522 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
2523 return false;
2524 }
2525 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
2526 if (FrameIndexSDNode *FIN =
2527 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
2528 // Constant offset from frame ref.
2529 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
2530 else
2531 Base = Addr.getOperand(0);
2532
2533 // Offset must fit in a 32-bit signed int in PTX [register+offset] address
2534 // mode
2535 if (!CN->getAPIntValue().isSignedIntN(32))
2536 return false;
2537
2538 Offset = CurDAG->getSignedTargetConstant(CN->getSExtValue(),
2539 SDLoc(OpNode), MVT::i32);
2540 return true;
2541 }
2542 }
2543 return false;
2544}
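// For example, a frame index plus a constant that fits in a signed 32-bit
// immediate matches as [reg+offset]; an address whose base is itself a direct
// (symbolic) address is rejected here so that the symbol+offset forms above
// handle it instead.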
2545
2546// register+offset
2547bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
2548 SDValue &Base, SDValue &Offset) {
2549 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
2550}
2551
2552// register+offset
2553bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
2554 SDValue &Base, SDValue &Offset) {
2555 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
2556}
2557
2558bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
2559 unsigned int spN) const {
2560 const Value *Src = nullptr;
2561 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
2562 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
2563 return true;
2564 Src = mN->getMemOperand()->getValue();
2565 }
2566 if (!Src)
2567 return false;
2568 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
2569 return (PT->getAddressSpace() == spN);
2570 return false;
2571}
2572
2573/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
2574/// inline asm expressions.
2575 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
2576 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
2577 std::vector<SDValue> &OutOps) {
2578 SDValue Op0, Op1;
2579 switch (ConstraintID) {
2580 default:
2581 return true;
2582 case InlineAsm::ConstraintCode::m: // memory
2583 if (SelectDirectAddr(Op, Op0)) {
2584 OutOps.push_back(Op0);
2585 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
2586 return false;
2587 }
2588 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
2589 OutOps.push_back(Op0);
2590 OutOps.push_back(Op1);
2591 return false;
2592 }
2593 break;
2594 }
2595 return true;
2596}
2597
2598void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
2599 // Lower a CopyToReg with two 64-bit inputs
2600 // Dst:i128, lo:i64, hi:i64
2601 //
2602 // CopyToReg Dst, lo, hi;
2603 //
2604 // ==>
2605 //
2606 // tmp = V2I64toI128 {lo, hi};
2607 // CopyToReg Dst, tmp;
2608 SDValue Dst = N->getOperand(1);
2609 SDValue Lo = N->getOperand(2);
2610 SDValue Hi = N->getOperand(3);
2611
2612 SDLoc DL(N);
2613 SDNode *Mov =
2614 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
2615
2616 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
2617 NewOps[0] = N->getOperand(0);
2618 NewOps[1] = Dst;
2619 NewOps[2] = SDValue(Mov, 0);
2620 if (N->getNumOperands() == 5)
2621 NewOps[3] = N->getOperand(4);
2622 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
2623
2624 ReplaceNode(N, NewValue.getNode());
2625}
2626
2627void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
2628 // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
2629 // Dst:i128, Src:i128
2630 //
2631 // {lo, hi} = CopyFromReg Src
2632 //
2633 // ==>
2634 //
2635 // {lo, hi} = I128toV2I64 Src
2636 //
2637 SDValue Ch = N->getOperand(0);
2638 SDValue Src = N->getOperand(1);
2639 SDValue Glue = N->getOperand(2);
2640 SDLoc DL(N);
2641
2642 // Add Glue and Ch to the operands and results to avoid breaking the
2643 // execution order.
2644 SDNode *Mov = CurDAG->getMachineNode(
2645 NVPTX::I128toV2I64, DL,
2646 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
2647 {Src, Ch, Glue});
2648
2649 ReplaceNode(N, Mov);
2650}
2651
2652/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
2653/// conversion from \p SrcTy to \p DestTy.
2654unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
2655 LoadSDNode *LdNode) {
2656 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
2657 switch (SrcTy.SimpleTy) {
2658 default:
2659 llvm_unreachable("Unhandled source type");
2660 case MVT::i8:
2661 switch (DestTy.SimpleTy) {
2662 default:
2663 llvm_unreachable("Unhandled dest type");
2664 case MVT::i16:
2665 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
2666 case MVT::i32:
2667 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
2668 case MVT::i64:
2669 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
2670 }
2671 case MVT::i16:
2672 switch (DestTy.SimpleTy) {
2673 default:
2674 llvm_unreachable("Unhandled dest type");
2675 case MVT::i8:
2676 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
2677 case MVT::i32:
2678 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
2679 case MVT::i64:
2680 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
2681 }
2682 case MVT::i32:
2683 switch (DestTy.SimpleTy) {
2684 default:
2685 llvm_unreachable("Unhandled dest type");
2686 case MVT::i8:
2687 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
2688 case MVT::i16:
2689 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
2690 case MVT::i64:
2691 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
2692 }
2693 case MVT::i64:
2694 switch (DestTy.SimpleTy) {
2695 default:
2696 llvm_unreachable("Unhandled dest type");
2697 case MVT::i8:
2698 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
2699 case MVT::i16:
2700 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
2701 case MVT::i32:
2702 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
2703 }
2704 case MVT::f16:
2705 switch (DestTy.SimpleTy) {
2706 default:
2707 llvm_unreachable("Unhandled dest type");
2708 case MVT::f32:
2709 return NVPTX::CVT_f32_f16;
2710 case MVT::f64:
2711 return NVPTX::CVT_f64_f16;
2712 }
2713 }
2714}
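// For example, GetConvertOpcode(MVT::i32, MVT::i16, Ld) returns
// NVPTX::CVT_s32_s16 when Ld is a sign-extending load, and
// NVPTX::CVT_u32_u16 otherwise (including when no load node is given).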
2715
2716bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
2717 SDLoc DL(N);
2718 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
2719 unsigned int FenceOp =
2720 getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
2721 Scopes[N->getConstantOperandVal(2)], Subtarget);
2722 SDValue Chain = N->getOperand(0);
2723 SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
2724 ReplaceNode(N, FenceNode);
2725 return true;
2726}
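// For example, a seq_cst ATOMIC_FENCE with the "device" sync scope is lowered
// to whatever getFenceOp chooses for that (ordering, scope, subtarget)
// combination, e.g. a fence.sc.gpu-style instruction on targets with memory
// ordering support.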
2727
2728 NVPTXScopes::NVPTXScopes(LLVMContext &C) {
2729 Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
2730 Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
2731 Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
2732 Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
2733 Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
2734}
2735
2736 NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
2737 if (Scopes.empty())
2738 llvm_unreachable("NVPTX Scopes must be initialized before calling "
2739 "NVPTXScopes::operator[]");
2740
2741 auto S = Scopes.find(ID);
2742 if (S == Scopes.end()) {
2743 // TODO:
2744 // - Add API to LLVMContext to get the name of a single scope.
2745 // - Use that API here to print an error containing the name
2746 // of this Unknown ID.
2747 report_fatal_error(formatv("Could not find scope ID={}.", int(ID)));
2748 }
2749 return S->second;
2750}
2751
2752bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
2753
2754#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
2755 (is_s32 \
2756 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
2757 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
2758
2759#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
2760 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
2761 : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
2762
2763#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
2764 is_s32) \
2765 (is_reduce \
2766 ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
2767 : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
2768 is_s32)))
2769
2770#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
2771 [&]() -> auto { \
2772 if (is_mc && is_ch) \
2773 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
2774 if (is_ch) \
2775 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
2776 if (is_mc) \
2777 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
2778 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
2779 }()
2780
2781#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
2782 (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2783 : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
2784
2785static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
2786 bool IsCacheHint, bool IsIm2Col,
2787 bool IsReduce = false) {
2788 if (IsIm2Col) {
2789 switch (Dim) {
2790 case 3:
2791 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
2792 IsCacheHint, IsShared32);
2793 case 4:
2794 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
2795 IsCacheHint, IsShared32);
2796 case 5:
2797 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
2798 IsCacheHint, IsShared32);
2799 default:
2800 llvm_unreachable("Invalid Dimension in im2col mode for "
2801 "GetCpAsyncBulkTensorS2GOpcode.");
2802 }
2803 } else {
2804 switch (Dim) {
2805 case 1:
2806 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
2807 IsCacheHint, IsShared32);
2808 case 2:
2809 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
2810 IsCacheHint, IsShared32);
2811 case 3:
2812 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
2813 IsCacheHint, IsShared32);
2814 case 4:
2815 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
2816 IsCacheHint, IsShared32);
2817 case 5:
2818 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
2819 IsCacheHint, IsShared32);
2820 default:
2822 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
2823 }
2824 }
2825}
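// For example, Dim == 2 in tile mode with a cache hint expands (via the
// macros above) to NVPTX::CP_ASYNC_BULK_TENSOR_S2G_2D_TILE_CH, or to the
// corresponding ..._RED_2D_TILE_CH opcode when IsReduce is set.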
2826
2827static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
2828 bool IsMultiCast,
2829 bool IsCacheHint, bool IsIm2Col) {
2830 if (IsIm2Col) {
2831 switch (Dim) {
2832 case 3:
2833 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
2834 IsCacheHint, IsShared32);
2835 case 4:
2836 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
2837 IsCacheHint, IsShared32);
2838 case 5:
2839 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
2840 IsCacheHint, IsShared32);
2841 default:
2842 llvm_unreachable("Invalid Dimension in im2col mode for "
2843 "GetCpAsyncBulkTensorG2SOpcode.");
2844 }
2845 } else {
2846 switch (Dim) {
2847 case 1:
2848 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
2849 IsCacheHint, IsShared32);
2850 case 2:
2851 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
2852 IsCacheHint, IsShared32);
2853 case 3:
2854 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
2855 IsCacheHint, IsShared32);
2856 case 4:
2857 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
2858 IsCacheHint, IsShared32);
2859 case 5:
2860 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
2861 IsCacheHint, IsShared32);
2862 default:
2864 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
2865 }
2866 }
2867}
2868
2869static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
2870 bool IsIm2Col) {
2871 if (IsIm2Col) {
2872 switch (Dim) {
2873 case 3:
2874 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
2875 case 4:
2876 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
2877 case 5:
2878 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
2879 default:
2880 llvm_unreachable("Invalid Dimension in im2col mode for "
2881 "GetCpAsyncBulkTensorPrefetchOpcode.");
2882 }
2883 } else {
2884 switch (Dim) {
2885 case 1:
2886 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
2887 case 2:
2888 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
2889 case 3:
2890 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
2891 case 4:
2892 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
2893 case 5:
2894 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
2895 default:
2896 llvm_unreachable("Invalid Dimension in tile mode for "
2897 "GetCpAsyncBulkTensorPrefetchOpcode.");
2898 }
2899 }
2900}
2901
2902static size_t GetDimsFromIntrinsic(unsigned IID) {
2903 switch (IID) {
2904 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2905 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2906 return 3;
2907 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2908 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2909 return 4;
2910 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2911 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2912 return 5;
2913 default:
2914 llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
2915 }
2916}
2917
2918void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
2919 bool IsIm2Col) {
2920 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2921 // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
2922 // multicast, cache_hint,
2923 // multicast_flag, cache_hint_flag}
2924 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2925 // = {2} + {7 + dims + im2col_offsets}
2926 size_t NumOps = N->getNumOperands();
2927 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
2928 : (NumOps - 9);
2929 // Offsets is always 'NumDims - 2' and only for im2col mode
2930 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2931 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2932 bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
2933 size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
2934 size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
2935
2936 SDLoc DL(N);
2937 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
2938
2939 // Push MultiCast operand, if available
2940 if (IsMultiCast)
2941 Ops.push_back(N->getOperand(MultiCastIdx));
2942
2943 // Push CacheHint operand, if available
2944 if (IsCacheHint)
2945 Ops.push_back(N->getOperand(MultiCastIdx + 1));
2946
2947 // Finally, the chain operand
2948 Ops.push_back(N->getOperand(0));
2949
2950 bool IsShared32 =
2951 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2952 unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
2953 NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
2954 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2955}
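// For example, the 3D im2col G2S intrinsic carries 13 operands,
//   { chain, IID, dst, mbar, src, d0, d1, d2, off0,
//     multicast, cache_hint, multicast_flag, cache_hint_flag },
// giving NumDims = 3, NumOffsets = 1, NumBaseArgs = 7 and MultiCastIdx = 9.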
2956
2957void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
2958 bool IsIm2Col) {
2959 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2960 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2961 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2962 // = {2} + {4 + dims}
2963 size_t NumOps = N->getNumOperands();
2964 size_t NumDims = NumOps - 6;
2965 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2966 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
2967
2968 SDLoc DL(N);
2969 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
2970 Ops.push_back(N->getOperand(0)); // Chain operand
2971
2972 bool IsShared32 =
2973 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2974 unsigned Opcode =
2975 GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
2976 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2977}
2978
2979void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
2980 bool IsIm2Col) {
2981 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2982 // {src, dims{d0...dN}, im2col_offsets{dims-2}
2983 // cache_hint, cache_hint_flag}
2984 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2985 // = {2} + {3 + dims + im2col_offsets}
2986 size_t NumOps = N->getNumOperands();
2987 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
2988 : (NumOps - 5);
2989 // Offsets is always 'NumDims - 2' and only for im2col mode
2990 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2991 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2992 size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
2993
2994 SDLoc DL(N);
2995 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
2996 Ops.push_back(N->getOperand(0)); // Chain operand
2997
2998 unsigned Opcode =
2999 GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
3000 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
3001}
3002
3003void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
3004 unsigned RedOp,
3005 bool IsIm2Col) {
3006 // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
3007 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
3008 // NumOperands = {Chain, IID} + {Actual intrinsic args}
3009 // = {2} + {4 + dims}
3010 size_t NumOps = N->getNumOperands();
3011 size_t NumDims = NumOps - 6;
3012 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3013 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
3014
3015 SDLoc DL(N);
3016 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
3017 Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op
3018 Ops.push_back(N->getOperand(0)); // Chain operand
3019
3020 bool IsShared32 =
3021 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
3022 unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
3023 NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
3024 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
3025}
3026
3027bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
3028 unsigned IID = N->getConstantOperandVal(1);
3029 using TMARedTy = llvm::nvvm::TMAReductionOp;
3030 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
3031 switch (IID) {
3032 default:
3033 return false;
3034 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
3035 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
3036 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
3037 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
3038 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
3039 SelectCpAsyncBulkTensorS2GCommon(N);
3040 return true;
3041 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
3042 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
3043 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
3044 SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
3045 return true;
3046 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
3047 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
3048 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
3049 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
3050 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
3051 SelectCpAsyncBulkTensorG2SCommon(N);
3052 return true;
3053 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
3054 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
3055 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
3056 SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
3057 return true;
3058 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
3059 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
3060 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
3061 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
3062 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
3063 SelectCpAsyncBulkTensorPrefetchCommon(N);
3064 return true;
3065 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
3066 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
3067 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
3068 SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
3069 return true;
3070 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
3071 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
3072 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
3073 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
3074 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
3075 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
3076 return true;
3077 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
3078 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
3079 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
3080 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
3081 /*IsIm2Col=*/true);
3082 return true;
3083 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
3084 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
3085 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
3086 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
3087 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
3088 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
3089 return true;
3090 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
3091 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
3092 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
3093 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
3094 /*IsIm2Col=*/true);
3095 return true;
3096 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
3097 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
3098 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
3099 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
3100 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
3101 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
3102 return true;
3103 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
3104 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
3105 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
3106 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
3107 /*IsIm2Col=*/true);
3108 return true;
3109 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
3110 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
3111 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
3112 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
3113 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
3114 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
3115 return true;
3116 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
3117 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
3118 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
3119 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
3120 /*IsIm2Col=*/true);
3121 return true;
3122 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
3123 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
3124 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
3125 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
3126 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
3127 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
3128 return true;
3129 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
3130 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
3131 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
3132 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
3133 /*IsIm2Col=*/true);
3134 return true;
3135 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
3136 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
3137 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
3138 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
3139 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
3140 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
3141 return true;
3142 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
3143 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
3144 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
3145 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
3146 /*IsIm2Col=*/true);
3147 return true;
3148 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
3149 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
3150 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
3151 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
3152 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
3153 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
3154 return true;
3155 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
3156 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
3157 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
3158 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
3159 /*IsIm2Col=*/true);
3160 return true;
3161 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
3162 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
3163 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
3164 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
3165 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
3166 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
3167 return true;
3168 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
3169 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
3170 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
3171 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
3172 /*IsIm2Col=*/true);
3173 return true;
3174 }
3175}
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static const LLT F64
static const LLT F32
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
uint64_t Addr
#define DEBUG_TYPE
#define F(x, y, z)
Definition: MD5.cpp:55
#define getOpcodeForVectorStParam(n, ty, isimm)
static unsigned int getCodeAddrSpace(MemSDNode *N)
static bool isAddLike(const SDValue V)
static bool isVectorElementTypeUpsized(EVT EltVT)
static size_t GetDimsFromIntrinsic(unsigned IID)
static int getLdStRegType(EVT VT)
static unsigned pickOpcodeForVectorStParam(SmallVector< SDValue, 8 > &Ops, unsigned NumElts, MVT::SimpleValueType MemTy, SelectionDAG *CurDAG, SDLoc DL)
#define getOpcodeForVectorStParamV2(ty, isimm)
static cl::opt< bool > EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden, cl::desc("Enable reciprocal sqrt optimization"))
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch)
static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint, bool IsIm2Col)
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)
static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, bool IsMultiCast, bool IsCacheHint, bool IsIm2Col)
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ)
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, NVPTXSubtarget const *T)
static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, bool IsIm2Col, bool IsReduce=false)
static std::optional< unsigned > pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16, unsigned Opcode_i32, std::optional< unsigned > Opcode_i64, unsigned Opcode_f32, std::optional< unsigned > Opcode_f64)
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, is_s32)
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, unsigned CodeAddrSpace, MachineFunction *F)
This file contains the definitions of the enumerations and flags associated with NVVM Intrinsics,...
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:38
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
#define PASS_NAME
Value * RHS
Value * LHS
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
This is an SDNode representing atomic operations.
const SDValue & getVal() const
const ConstantFP * getConstantFPValue() const
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:271
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
const ConstantInt * getConstantIntValue() const
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:364
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
Record instruction ordering so we can query their relative positions within a function.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
This class is used to represent ISD::LOAD nodes.
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Machine Value Type.
SimpleValueType SimpleTy
bool isVector() const
Return true if this is a vector value type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
MVT getVectorElementType() const
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
A description of a memory reference used in the backend.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
EVT getMemoryVT() const
Return the type of the in-memory value.
NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
bool SelectInlineAsmMemoryOperand(const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector< SDValue > &OutOps) override
SelectInlineAsmMemoryOperand - Implement addressing mode selection for inline asm expressions.
const NVPTXSubtarget * Subtarget
void failIfClustersUnsupported(std::string const &FailureMessage) const
const NVPTXTargetLowering * getTargetLowering() const override
bool hasRelaxedMMIO() const
bool hasMemoryOrdering() const
bool useF32FTZ(const MachineFunction &MF) const
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool allowUnsafeFPMath(MachineFunction &MF) const
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const SDValue & getOperand(unsigned Num) const
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
MachineFunction * MF
CodeGenOptLevel OptLevel
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
virtual bool runOnMachineFunction(MachineFunction &mf)
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:228
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:495
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:753
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:710
SDValue getTargetConstantFP(double Val, const SDLoc &DL, EVT VT)
Definition: SelectionDAG.h:734
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:698
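Taken together, getVTList, getTargetConstant, getMachineNode and setNodeMemRefs are the usual ingredients of a custom selection routine. The sketch below is hedged: the free function, the use of ReplaceAllUsesWith in place of SelectionDAGISel::ReplaceNode, and the placeholder machine opcode are all assumptions, not code from this file:

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Re-emit a chained node N as a machine node with an extra immediate flag.
static void emitAsMachineNode(SelectionDAG &DAG, SDNode *N, unsigned MachineOpc) {
  SDLoc DL(N);
  // Results: the original value type plus a chain.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other);
  // Operands: chain, pointer, and a flag encoded as a target constant, which
  // the DAG will never try to legalize again.
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
                   DAG.getTargetConstant(/*Flag=*/1, DL, MVT::i32)};
  MachineSDNode *MN = DAG.getMachineNode(MachineOpc, DL, VTs, Ops);
  // Keep the memory-operand info visible to later passes.
  if (auto *Mem = dyn_cast<MemSDNode>(N))
    DAG.setNodeMemRefs(MN, {Mem->getMemOperand()});
  // Inside a SelectionDAGISel subclass this would simply be ReplaceNode(N, MN).
  DAG.ReplaceAllUsesWith(N, MN);
  DAG.RemoveDeadNode(N);
}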
bool empty() const
Definition: SmallVector.h:81
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
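A small, hedged example of the SmallVector pattern these entries describe, collecting SDValue operands before building a new node (the helper is illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include <cassert>
using namespace llvm;

// Gather N's operands into Ops: chain first, then the rest in one append() call.
static void collectOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
  assert(Ops.empty() && "expected a fresh operand list");
  Ops.push_back(N->getOperand(0));              // the chain of a chained node
  Ops.append(N->op_begin() + 1, N->op_end());   // remaining operands as a range
}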
This class is used to represent ISD::STORE nodes.
const SDValue & getValue() const
unsigned getPointerSizeInBits(unsigned AS) const
LLVM Value Representation.
Definition: Value.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:87
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1312
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1102
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1304
@ TargetExternalSymbol
Definition: ISDOpcodes.h:175
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1308
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ TargetGlobalAddress
TargetGlobalAddress - Like GlobalAddress, but the DAG does no folding or anything else with this node...
Definition: ISDOpcodes.h:170
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:735
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:550
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:709
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:958
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1606
std::string ScopeToString(Scope S)
@ Global
Definition: NVPTX.h:145
@ Shared
Definition: NVPTX.h:146
@ Generic
Definition: NVPTX.h:144
std::string OrderingToString(Ordering Order)
@ System
Definition: NVPTX.h:138
@ Cluster
Definition: NVPTX.h:136
@ Thread
Definition: NVPTX.h:134
@ Device
Definition: NVPTX.h:137
@ RelaxedMMIO
Definition: NVPTX.h:128
@ Acquire
Definition: NVPTX.h:122
@ Relaxed
Definition: NVPTX.h:120
@ AcquireRelease
Definition: NVPTX.h:124
@ NotAtomic
Definition: NVPTX.h:117
@ Volatile
Definition: NVPTX.h:127
@ Release
Definition: NVPTX.h:123
@ SequentiallyConsistent
Definition: NVPTX.h:125
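These Ordering and Scope enumerators are the backend's own vocabulary for atomic semantics; OrderingToString and ScopeToString exist mainly for diagnostics. Below is a hedged sketch of one plausible IR-to-NVPTX ordering mapping, for illustration only; the selector's real logic also weighs scope, volatility and the subtarget's hasMemoryOrdering()/hasRelaxedMMIO() capabilities:

#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/ErrorHandling.h"

// Hypothetical helper: name the NVPTX ordering an IR ordering would map to.
static const char *nvptxOrderingName(llvm::AtomicOrdering AO) {
  using llvm::AtomicOrdering;
  switch (AO) {
  case AtomicOrdering::NotAtomic:
    return "not-atomic";   // NVPTX::Ordering::NotAtomic
  case AtomicOrdering::Unordered:
  case AtomicOrdering::Monotonic:
    return "relaxed";      // NVPTX::Ordering::Relaxed
  case AtomicOrdering::Acquire:
    return "acquire";      // NVPTX::Ordering::Acquire
  case AtomicOrdering::Release:
    return "release";      // NVPTX::Ordering::Release
  case AtomicOrdering::AcquireRelease:
    return "acq_rel";      // NVPTX::Ordering::AcquireRelease
  case AtomicOrdering::SequentiallyConsistent:
    return "seq_cst";      // NVPTX::Ordering::SequentiallyConsistent
  }
  llvm_unreachable("unknown atomic ordering");
}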
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
bool Isv2x16VT(EVT VT)
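all_of composes naturally with Isv2x16VT when every result of a node must be a packed pair of 16-bit elements; a hedged one-function sketch (the helper name is mine, and Isv2x16VT is assumed to come from the NVPTX utility header this file already includes):

#include "NVPTXUtilities.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// True when every value produced by N is v2f16, v2bf16 or v2i16.
static bool allResultsArePacked16(const SDNode *N) {
  return all_of(N->values(), [](EVT VT) { return Isv2x16VT(VT); });
}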
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition: bit.h:307
FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOptLevel OptLevel)
createNVPTXISelDag - This pass converts a legalized DAG into a NVPTX-specific DAG,...
int countr_zero(T Val)
Count the number of 0's from the least significant bit upward, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
auto formatv(bool Validate, const char *Fmt, Ts &&...Vals)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:273
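These mask predicates and trailing-bit counters carry the arithmetic behind a BFE-style (bit-field extract) match: a single contiguous run of ones yields a start position from countr_zero and a length from the trailing ones of the shifted mask. A hedged, stand-alone sketch, not the exact test the selector performs:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <optional>
#include <utility>

// If Mask is one contiguous run of ones, return {start bit, length}.
static std::optional<std::pair<unsigned, unsigned>> maskToField(uint64_t Mask) {
  if (llvm::isMask_64(Mask))        // ones starting at bit 0 (and Mask != 0)
    return std::make_pair(0u, unsigned(llvm::countr_one(Mask)));
  if (llvm::isShiftedMask_64(Mask)) {
    unsigned Start = llvm::countr_zero(Mask);
    unsigned Len = llvm::countr_one(Mask >> Start);
    return std::make_pair(Start, Len);
  }
  return std::nullopt;              // zero, or more than one run of ones
}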
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
AtomicOrdering
Atomic ordering for LLVM's memory model.
DWARFExpression::Operation Op
bool isKernelFunction(const Function &F)
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
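getUnderlyingObjects is the IR-level walk (through phis and selects) that lets backend code reason about where a pointer may originate; below is a hedged sketch of such a query, with a hypothetical helper name:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Conservative check: does every object that V may point to trace back to
// either a global variable or a function argument?
static bool pointsOnlyToGlobalsOrArgs(const Value *V) {
  SmallVector<const Value *, 4> Objs;
  getUnderlyingObjects(V, Objs);      // default lookup depth, no LoopInfo
  if (Objs.empty())
    return false;
  for (const Value *Obj : Objs)
    if (!isa<GlobalVariable>(Obj) && !isa<Argument>(Obj))
      return false;
  return true;
}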
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
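A minimal sketch of the EVT queries above, classifying a type the way selection code often does before picking a vector opcode; the helper name and the f32 focus are assumptions for illustration:

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// True for simple 2- or 4-element vectors of f32 (e.g. v2f32, v4f32).
static bool isSmallF32Vector(EVT VT) {
  // Extended (non-simple) types carry no MVT and are usually rejected early.
  if (!VT.isSimple() || !VT.isVector())
    return false;
  EVT Elt = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  return Elt.isFloatingPoint() && Elt.getSimpleVT() == MVT::f32 &&
         (NumElts == 2 || NumElts == 4);
}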
NVPTXScopes()=default
NVPTX::Scope operator[](SyncScope::ID ID) const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.