LLVM 22.0.0git
NVPTXISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTX.h"
15#include "NVPTXUtilities.h"
16#include "llvm/ADT/APInt.h"
21#include "llvm/IR/GlobalValue.h"
23#include "llvm/IR/IntrinsicsNVPTX.h"
30#include <optional>
31
32using namespace llvm;
33
34#define DEBUG_TYPE "nvptx-isel"
35#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
36
// Command-line switch for the reciprocal-sqrt approximation optimization
// queried by NVPTXDAGToDAGISel::doRsqrtOpt(). Hidden option, on by default.
static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));
40
41// FIXME: This is a WAR to recover lost performance from #155024.
42// We still need to investigate the regression and find a more permanent
43// solution.
44static cl::opt<bool> EnableMADWide("nvptx-mad-wide-opt", cl::init(false),
46 cl::desc("Enable MAD wide optimization"));
47
48/// createNVPTXISelDag - This pass converts a legalized DAG into a
49/// NVPTX-specific DAG, ready for instruction scheduling.
54
59
61
63
67
69 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
70 Scopes = NVPTXScopes(MF.getFunction().getContext());
72}
73
75NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
77}
78
79bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
81}
82
83bool NVPTXDAGToDAGISel::useF32FTZ() const {
84 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
85}
86
87bool NVPTXDAGToDAGISel::allowFMA() const {
88 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
89 return TL->allowFMA(*MF, OptLevel);
90}
91
92bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
93
94bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
95
96/// Select - Select instructions not customized! Used for
97/// expanded, promoted and normal instructions.
98void NVPTXDAGToDAGISel::Select(SDNode *N) {
99
100 if (N->isMachineOpcode()) {
101 N->setNodeId(-1);
102 return; // Already selected.
103 }
104
105 switch (N->getOpcode()) {
106 case ISD::LOAD:
107 case ISD::ATOMIC_LOAD:
108 case NVPTXISD::MLoad:
109 if (tryLoad(N))
110 return;
111 break;
112 case ISD::STORE:
113 case ISD::ATOMIC_STORE:
114 if (tryStore(N))
115 return;
116 break;
117 case ISD::ATOMIC_FENCE:
118 if (tryFence(N))
119 return;
120 break;
122 tryUNPACK_VECTOR(N);
123 return;
125 if (tryEXTRACT_VECTOR_ELEMENT(N))
126 return;
127 break;
129 SelectSETP_F16X2(N);
130 return;
132 SelectSETP_BF16X2(N);
133 return;
134 case NVPTXISD::LoadV2:
135 case NVPTXISD::LoadV4:
136 case NVPTXISD::LoadV8:
137 if (tryLoadVector(N))
138 return;
139 break;
140 case NVPTXISD::LDUV2:
141 case NVPTXISD::LDUV4:
142 if (tryLDU(N))
143 return;
144 break;
148 if (tryStoreVector(N))
149 return;
150 break;
152 if (tryIntrinsicChain(N))
153 return;
154 break;
156 if (tryIntrinsicVoid(N))
157 return;
158 break;
159 case ISD::AND:
160 case ISD::SRA:
161 case ISD::SRL:
162 // Try to select BFE
163 if (tryBFE(N))
164 return;
165 break;
166 case ISD::ADDRSPACECAST:
167 SelectAddrSpaceCast(N);
168 return;
169 case ISD::CopyToReg: {
170 if (N->getOperand(1).getValueType() == MVT::i128) {
171 SelectV2I64toI128(N);
172 return;
173 }
174 break;
175 }
176 case ISD::CopyFromReg: {
177 if (N->getOperand(1).getValueType() == MVT::i128) {
178 SelectI128toV2I64(N);
179 return;
180 }
181 break;
182 }
185 selectAtomicSwap128(N);
186 return;
187 case ISD::FADD:
188 case ISD::FMUL:
189 case ISD::FSUB:
190 if (tryBF16ArithToFMA(N))
191 return;
192 break;
193 default:
194 break;
195 }
196 SelectCode(N);
197}
198
// Expands to the packed or unpacked TCGEN05_LD machine opcode for a given
// SHAPE/NUM pair; reads the `enablePack` flag from the enclosing scope.
#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)

// Map a tcgen05.ld intrinsic ID to its machine opcode, choosing the .pack
// variant when enablePack is set. Aborts on any other intrinsic ID.
static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
  switch (IID) {
  // 16x64b shape, x1..x128 replication.
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
    return TCGEN05_LD_OPCODE(16x64b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
    return TCGEN05_LD_OPCODE(16x64b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
    return TCGEN05_LD_OPCODE(16x64b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
    return TCGEN05_LD_OPCODE(16x64b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
    return TCGEN05_LD_OPCODE(16x64b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
    return TCGEN05_LD_OPCODE(16x64b, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
    return TCGEN05_LD_OPCODE(16x64b, x64);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
    return TCGEN05_LD_OPCODE(16x64b, x128);
  // 16x128b shape, x1..x64 replication.
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
    return TCGEN05_LD_OPCODE(16x128b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
    return TCGEN05_LD_OPCODE(16x128b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
    return TCGEN05_LD_OPCODE(16x128b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
    return TCGEN05_LD_OPCODE(16x128b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
    return TCGEN05_LD_OPCODE(16x128b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
    return TCGEN05_LD_OPCODE(16x128b, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
    return TCGEN05_LD_OPCODE(16x128b, x64);
  // 16x256b shape, x1..x32 replication.
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
    return TCGEN05_LD_OPCODE(16x256b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
    return TCGEN05_LD_OPCODE(16x256b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
    return TCGEN05_LD_OPCODE(16x256b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
    return TCGEN05_LD_OPCODE(16x256b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
    return TCGEN05_LD_OPCODE(16x256b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
    return TCGEN05_LD_OPCODE(16x256b, x32);
  // 16x32bx2 shape (offset form), x1..x128 replication.
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
    return TCGEN05_LD_OPCODE(16x32bx2, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
    return TCGEN05_LD_OPCODE(16x32bx2, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
    return TCGEN05_LD_OPCODE(16x32bx2, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
    return TCGEN05_LD_OPCODE(16x32bx2, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
    return TCGEN05_LD_OPCODE(16x32bx2, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
    return TCGEN05_LD_OPCODE(16x32bx2, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
    return TCGEN05_LD_OPCODE(16x32bx2, x64);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
    return TCGEN05_LD_OPCODE(16x32bx2, x128);
  // 32x32b shape, x1..x128 replication.
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
    return TCGEN05_LD_OPCODE(32x32b, x1);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
    return TCGEN05_LD_OPCODE(32x32b, x2);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
    return TCGEN05_LD_OPCODE(32x32b, x4);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
    return TCGEN05_LD_OPCODE(32x32b, x8);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
    return TCGEN05_LD_OPCODE(32x32b, x16);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
    return TCGEN05_LD_OPCODE(32x32b, x32);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
    return TCGEN05_LD_OPCODE(32x32b, x64);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
    return TCGEN05_LD_OPCODE(32x32b, x128);
  }
  llvm_unreachable("unhandled tcgen05.ld lowering");
}
282
283void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
284 if (!Subtarget->hasTcgen05InstSupport())
286 "tcgen05.ld is not supported on this architecture variant");
287
288 SDLoc DL(N);
289 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
290
291 if (hasOffset) {
292 bool enablePack = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
293 auto OffsetNode = CurDAG->getTargetConstant(
294 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL, MVT::i32);
295 ReplaceNode(N, CurDAG->getMachineNode(
296 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
297 {N->getOperand(2), OffsetNode, N->getOperand(0)}));
298 } else {
299 bool enablePack = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
300 ReplaceNode(N, CurDAG->getMachineNode(
301 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
302 {N->getOperand(2), N->getOperand(0)}));
303 }
304}
305
306bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
307 unsigned IID = N->getConstantOperandVal(1);
308 switch (IID) {
309 default:
310 return false;
311 case Intrinsic::nvvm_ldu_global_f:
312 case Intrinsic::nvvm_ldu_global_i:
313 case Intrinsic::nvvm_ldu_global_p:
314 return tryLDU(N);
315
316 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
317 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
318 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
319 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
320 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
321 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
322 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
323 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
324 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
325 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
326 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
327 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
328 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
329 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
330 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
331 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
332 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
333 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
334 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
335 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
336 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
337 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
338 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
339 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
340 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
341 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
342 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
343 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
344 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
345 SelectTcgen05Ld(N);
346 return true;
347 }
348
349 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
350 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
351 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
352 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
353 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
354 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
355 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
356 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
357 SelectTcgen05Ld(N, /* hasOffset */ true);
358 return true;
359 }
360 }
361}
362
363// Map ISD:CONDCODE value to appropriate CmpMode expected by
364// NVPTXInstPrinter::printCmpMode()
365SDValue NVPTXDAGToDAGISel::getPTXCmpMode(const CondCodeSDNode &CondCode) {
367 const unsigned PTXCmpMode = [](ISD::CondCode CC) {
368 switch (CC) {
369 default:
370 llvm_unreachable("Unexpected condition code.");
371 case ISD::SETOEQ:
372 case ISD::SETEQ:
373 return CmpMode::EQ;
374 case ISD::SETOGT:
375 case ISD::SETGT:
376 return CmpMode::GT;
377 case ISD::SETOGE:
378 case ISD::SETGE:
379 return CmpMode::GE;
380 case ISD::SETOLT:
381 case ISD::SETLT:
382 return CmpMode::LT;
383 case ISD::SETOLE:
384 case ISD::SETLE:
385 return CmpMode::LE;
386 case ISD::SETONE:
387 case ISD::SETNE:
388 return CmpMode::NE;
389 case ISD::SETO:
390 return CmpMode::NUM;
391 case ISD::SETUO:
392 return CmpMode::NotANumber;
393 case ISD::SETUEQ:
394 return CmpMode::EQU;
395 case ISD::SETUGT:
396 return CmpMode::GTU;
397 case ISD::SETUGE:
398 return CmpMode::GEU;
399 case ISD::SETULT:
400 return CmpMode::LTU;
401 case ISD::SETULE:
402 return CmpMode::LEU;
403 case ISD::SETUNE:
404 return CmpMode::NEU;
405 }
406 }(CondCode.get());
407 return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
408}
409
410bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
411 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
412 SDLoc DL(N);
413 SDNode *SetP = CurDAG->getMachineNode(
414 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
415 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
416 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
417 ReplaceNode(N, SetP);
418 return true;
419}
420
421bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
422 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
423 SDLoc DL(N);
424 SDNode *SetP = CurDAG->getMachineNode(
425 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
426 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
427 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
428 ReplaceNode(N, SetP);
429 return true;
430}
431
432bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
433 SDValue Vector = N->getOperand(0);
434 MVT EltVT = N->getSimpleValueType(0);
435
436 MachineSDNode *N2 =
437 CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
438
439 ReplaceNode(N, N2);
440 return true;
441}
442
443// Find all instances of extract_vector_elt that use this v2f16 vector
444// and coalesce them into a scattering move instruction.
445bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
446 SDValue Vector = N->getOperand(0);
447
448 MVT VT = Vector.getSimpleValueType();
449 if (!(NVPTX::isPackedVectorTy(VT) && VT.getVectorNumElements() == 2))
450 return false;
451
452 unsigned Opcode;
453 if (VT.is32BitVector())
454 Opcode = NVPTX::I32toV2I16;
455 else if (VT.is64BitVector())
456 Opcode = NVPTX::I64toV2I32;
457 else
458 llvm_unreachable("Unhandled packed type");
459
460 // Find and record all uses of this vector that extract element 0 or 1.
462 for (auto *U : Vector.getNode()->users()) {
463 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
464 continue;
465 if (U->getOperand(0) != Vector)
466 continue;
467 if (const ConstantSDNode *IdxConst =
468 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
469 if (IdxConst->getZExtValue() == 0)
470 E0.push_back(U);
471 else if (IdxConst->getZExtValue() == 1)
472 E1.push_back(U);
473 else
474 llvm_unreachable("Invalid vector index.");
475 }
476 }
477
478 // There's no point scattering f16x2 if we only ever access one
479 // element of it.
480 if (E0.empty() || E1.empty())
481 return false;
482
483 // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1))
484 // into EltTy,EltTy Split[EltTy]x2(V)
485 MVT EltVT = VT.getVectorElementType();
486 SDNode *ScatterOp =
487 CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
488 for (auto *Node : E0)
489 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
490 for (auto *Node : E1)
491 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
492
493 return true;
494}
495
496static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
497 switch (AS) {
512 default:
513 return std::nullopt;
514 }
515}
516
518 return convertAS(N->getMemOperand()->getAddrSpace())
520}
521
522NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
523 // No "sem" orderings for SM/PTX versions which do not support memory ordering
526 auto Ordering = N->getMergedOrdering();
527 switch (Ordering) {
541 }
542 llvm_unreachable("Invalid atomic ordering");
543}
544
545NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
546 // No "scope" modifier for SM/PTX versions which do not support scoped atomics
547 // Functionally, these atomics are at device scope
548 if (!Subtarget->hasAtomScope())
550 return Scopes[N->getSyncScopeID()];
551}
552
553namespace {
554
// Pair of orderings computed for one memory operation: the ordering to encode
// on the instruction itself, plus the ordering of any fence that must precede
// it (NotAtomic means no fence is required).
struct OperationOrderings {
  NVPTX::Ordering InstructionOrdering, FenceOrdering;
  OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
                     NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
      : InstructionOrdering(IO), FenceOrdering(FO) {}
};
561
562static OperationOrderings
563getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
564 AtomicOrdering Ordering = N->getSuccessOrdering();
565 auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
566
567 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
568 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
569
570 // clang-format off
571
572 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
573 // Note: uses of Relaxed in the Atomic column of this table refer
574 // to LLVM AtomicOrdering::Monotonic.
575 //
576 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
577 // |---------|----------|--------------------|------------|------------------------------|
578 // | No | No | All | plain | .weak |
579 // | No | Yes | Generic,Shared, | .volatile | .volatile |
580 // | | | Global [0] | | |
581 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
582 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
583 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
584 // | | | Global [0] | | |
585 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
586 // | | | Global [0] | | |
587 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
588 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
589 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
590 // | | | | | or .volatile (PTX 8.1-) |
591 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
592 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
593 // | | | / Global [0] | | |
594
595 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
596 // by following the ABI proven sound in:
597 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
598 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
599 //
600 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
601 // |------------------------------------------------------|-------------------------------|
602 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
603 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
604 // |------------------------------------------------------|-------------------------------|
605 // | cuda::atomic_load | fence.sc.<scope>; |
606 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
607 // |------------------------------------------------------|-------------------------------|
608 // | cuda::atomic_store | fence.sc.<scope>; |
609 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
610 // |------------------------------------------------------|-------------------------------|
611 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
612 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
613
614 // clang-format on
615
616 // [0]: volatile and atomics are only supported on global or shared
617 // memory locations, accessed via generic/shared/global pointers.
618 // MMIO is only supported on global memory locations,
619 // accessed via generic/global pointers.
620 // TODO: Implement MMIO access via generic pointer to global.
621 // Currently implemented for global pointers only.
622
623 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
624 // PTX instructions fails to preserve their C++ side-effects.
625 //
626 // Example (https://github.com/llvm/llvm-project/issues/62057):
627 //
628 // void example() {
629 // std::atomic<bool> True = true;
630 // while (True.load(std::memory_order_relaxed));
631 // }
632 //
633 // A C++ program that calls "example" is well-defined: the infinite loop
634 // performs an atomic operation. By lowering volatile/atomics to
635 // "weak" memory operations, we are transforming the above into:
636 //
637 // void undefined_behavior() {
638 // bool True = true;
639 // while (True);
640 // }
641 //
642 // which exhibits undefined behavior in both C++ and PTX.
643 //
644 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
645 // behavior due to lack of Independent Forward Progress. Lowering these
646 // to weak memory operations in sm_60- is therefore fine.
647 //
648 // TODO: lower atomic and volatile operations to memory locations
649 // in local, const, and param to two PTX instructions in sm_70+:
650 // - the "weak" memory instruction we are currently lowering to, and
651 // - some other instruction that preserves the side-effect, e.g.,
652 // a dead dummy volatile load.
653 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
654 CodeAddrSpace == NVPTX::AddressSpace::Const ||
655 CodeAddrSpace == NVPTX::AddressSpace::Param) {
657 }
658
659 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
660 // supported on sm_60 and older; this includes volatile atomics.
661 if (!(Ordering == AtomicOrdering::NotAtomic ||
662 Ordering == AtomicOrdering::Unordered ||
663 Ordering == AtomicOrdering::Monotonic) &&
664 !HasMemoryOrdering) {
666 formatv("PTX does not support \"atomic\" for orderings different than"
667 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
668 "is: \"{}\".",
669 toIRString(Ordering)));
670 }
671
672 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
673 // the volatile semantics and preserve the atomic ones.
674
675 // PTX volatile and PTX atomics are not available for statespace that differ
676 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
677 // atomics is undefined if the generic address does not refer to a .global or
678 // .shared memory location.
679 bool AddrGenericOrGlobalOrShared =
680 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
681 CodeAddrSpace == NVPTX::AddressSpace::Global ||
682 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
683 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
684 if (!AddrGenericOrGlobalOrShared)
686
687 bool UseRelaxedMMIO =
688 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
689
690 switch (Ordering) {
692 return N->isVolatile() ? NVPTX::Ordering::Volatile
695 // We lower unordered in the exact same way as 'monotonic' to respect
696 // LLVM IR atomicity requirements.
698 if (N->isVolatile())
699 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
701 else
702 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
704 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
705 // Acquire.
707 if (!N->readMem())
709 formatv("PTX only supports Acquire Ordering on reads: {}",
710 N->getOperationName()));
713 if (!N->writeMem())
715 formatv("PTX only supports Release Ordering on writes: {}",
716 N->getOperationName()));
720 formatv("NVPTX does not support AcquireRelease Ordering on "
721 "read-modify-write "
722 "yet and PTX does not support it on loads or stores: {}",
723 N->getOperationName()));
724 }
726 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
727 // sequence including a "fence.sc.sco" and the memory instruction with an
728 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
729 // whether the memory operation is a read, write, or read-modify-write.
730 //
731 // This sets the ordering of the fence to SequentiallyConsistent, and
732 // sets the corresponding ordering for the instruction.
733 NVPTX::Ordering InstrOrder;
734 if (N->readMem())
735 InstrOrder = NVPTX::Ordering::Acquire;
736 else if (N->writeMem())
737 InstrOrder = NVPTX::Ordering::Release;
738 else
740 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
741 "read-modify-writes yet: {}",
742 N->getOperationName()));
743 return OperationOrderings(InstrOrder,
745 }
746 }
748 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
749 toIRString(Ordering)));
750}
751
752} // namespace
753
754NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
755 NVPTX::Ordering O) const {
756 switch (O) {
758 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
759 // NVPTX uses Thread scope as the scope of non-atomic operations.
762 // RelaxedMMIO operations are always system scope.
763 // If a RelaxedMMIO order was generated from an atomic volatile operation
764 // with a smaller thread scope, we bump it here to system scope.
771 auto S = Scopes[N->getSyncScopeID()];
772
773 // Atomic operations must have a scope greater than thread.
774 if (S == NVPTX::Scope::Thread)
776 formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
777
778 // If scope is cluster, clusters must be supported.
779 if (S == NVPTX::Scope::Cluster)
780 Subtarget->failIfClustersUnsupported("cluster scope");
781
782 // If operation is volatile, then its scope is system.
783 return N->isVolatile() ? NVPTX::Scope::System : S;
784 }
785 llvm_unreachable("unhandled ordering");
786}
787
788static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
789 NVPTX::AddressSpace CodeAddrSpace) {
790 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
791 // space.
792 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
793 N.isInvariant();
794}
795
796static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
797 NVPTXSubtarget const *T) {
798 if (S == NVPTX::Scope::Cluster)
799 T->failIfClustersUnsupported(".cluster scope fence");
800
801 // Fall back to .acq_rel if .acquire, .release is not supported.
802 if (!T->hasSplitAcquireAndReleaseFences() &&
805
806 switch (O) {
808 switch (S) {
810 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
811 : NVPTX::INT_MEMBAR_SYS;
813 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
814 : NVPTX::INT_MEMBAR_CTA;
816 return NVPTX::atomic_thread_fence_acquire_cluster;
818 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
819 : NVPTX::INT_MEMBAR_GL;
823 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
824 ScopeToString(S)));
825 }
826 break;
828 switch (S) {
830 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
831 : NVPTX::INT_MEMBAR_SYS;
833 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
834 : NVPTX::INT_MEMBAR_CTA;
836 return NVPTX::atomic_thread_fence_release_cluster;
838 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
839 : NVPTX::INT_MEMBAR_GL;
843 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
844 ScopeToString(S)));
845 }
846 break;
848 switch (S) {
850 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
851 : NVPTX::INT_MEMBAR_SYS;
853 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
854 : NVPTX::INT_MEMBAR_CTA;
856 return NVPTX::atomic_thread_fence_acq_rel_cluster;
858 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
859 : NVPTX::INT_MEMBAR_GL;
863 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
864 ScopeToString(S)));
865 }
866 break;
867 }
869 switch (S) {
871 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
872 : NVPTX::INT_MEMBAR_SYS;
874 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
875 : NVPTX::INT_MEMBAR_CTA;
877 return NVPTX::atomic_thread_fence_seq_cst_cluster;
879 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
880 : NVPTX::INT_MEMBAR_GL;
883 report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
884 ScopeToString(S)));
885 }
886 break;
887 }
893 formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
894 OrderingToString(O), ScopeToString(S)));
895 }
896 llvm_unreachable("unhandled ordering");
897}
898
899// Returns Memory Order and Scope of a memory instruction, and
900// inserts any fence before the instruction that's required to
901// implement its memory ordering.
902std::pair<NVPTX::Ordering, NVPTX::Scope>
903NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
904 MemSDNode *N) {
905 auto [InstructionOrdering, FenceOrdering] =
906 getOperationOrderings(N, Subtarget);
907 auto Scope = getOperationScope(N, InstructionOrdering);
908
909 // If a fence is required before the operation, insert it:
910 switch (NVPTX::Ordering(FenceOrdering)) {
912 break;
914 auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
915 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
916 break;
917 }
918 default:
920 formatv("Unexpected fence ordering: \"{}\".",
921 OrderingToString(NVPTX::Ordering(FenceOrdering))));
922 }
923 return {InstructionOrdering, Scope};
924}
925
926void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
927 SDValue Src = N->getOperand(0);
928 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
929 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
930 unsigned DstAddrSpace = CastN->getDestAddressSpace();
931 SDLoc DL(N);
932 assert(SrcAddrSpace != DstAddrSpace &&
933 "addrspacecast must be between different address spaces");
934
935 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
936 // Specific to generic
937
938 if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
939 SDValue CvtNone =
940 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
941 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
942 Src, CvtNone);
943 Src = SDValue(Cvt, 0);
944 }
945
946 unsigned Opc;
947 switch (SrcAddrSpace) {
948 default: report_fatal_error("Bad address space in addrspacecast");
950 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
951 break;
953 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
954 break;
956 if (!TM.is64Bit())
958 "Shared cluster address space is only supported in 64-bit mode");
959 Opc = NVPTX::cvta_shared_cluster_64;
960 break;
962 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
963 break;
965 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
966 break;
968 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
969 break;
970 }
971 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
972 return;
973 } else {
974 // Generic to specific
975 if (SrcAddrSpace != 0)
976 report_fatal_error("Cannot cast between two non-generic address spaces");
977 unsigned Opc;
978 switch (DstAddrSpace) {
979 default: report_fatal_error("Bad address space in addrspacecast");
981 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
982 break;
984 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
985 break;
987 if (!TM.is64Bit())
989 "Shared cluster address space is only supported in 64-bit mode");
990 Opc = NVPTX::cvta_to_shared_cluster_64;
991 break;
993 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
994 break;
996 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
997 break;
999 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
1000 break;
1001 }
1002
1003 SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
1004 if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
1005 SDValue CvtNone =
1006 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
1007 CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
1008 SDValue(CVTA, 0), CvtNone);
1009 }
1010
1011 ReplaceNode(N, CVTA);
1012 return;
1013 }
1014}
1015
1016// Helper function template to reduce amount of boilerplate code for
1017// opcode selection.
1018static std::optional<unsigned>
1019pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
1020 std::optional<unsigned> Opcode_i32,
1021 std::optional<unsigned> Opcode_i64) {
1022 switch (VT) {
1023 case MVT::f16:
1024 case MVT::i16:
1025 case MVT::bf16:
1026 return Opcode_i16;
1027 case MVT::v2f16:
1028 case MVT::v2bf16:
1029 case MVT::v2i16:
1030 case MVT::v4i8:
1031 case MVT::i32:
1032 case MVT::f32:
1033 return Opcode_i32;
1034 case MVT::v2f32:
1035 case MVT::v2i32:
1036 case MVT::i64:
1037 case MVT::f64:
1038 return Opcode_i64;
1039 default:
1040 return std::nullopt;
1041 }
1042}
1043
1044static inline bool isAddLike(const SDValue V) {
1045 return V.getOpcode() == ISD::ADD ||
1046 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1047}
1048
1050 if (N.getOpcode() == ISD::AssertAlign)
1051 N = N.getOperand(0);
1052 return N;
1053}
1054
1055// selectBaseADDR - Match a dag node which will serve as the base address for an
1056// ADDR operand pair.
1058 N = stripAssertAlign(N);
1059 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
1060 return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
1061 GA->getValueType(0), GA->getOffset(),
1062 GA->getTargetFlags());
1063 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
1064 return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
1065 ES->getTargetFlags());
1066 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N))
1067 return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0));
1068
1069 return N;
1070}
1071
1073 Addr = stripAssertAlign(Addr);
1074 APInt AccumulatedOffset(64u, 0);
1075 while (isAddLike(Addr)) {
1076 const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1077 if (!CN)
1078 break;
1079
1080 const APInt CI = CN->getAPIntValue().sext(64);
1081 if (!(CI + AccumulatedOffset).isSignedIntN(32))
1082 break;
1083
1084 AccumulatedOffset += CI;
1085 Addr = stripAssertAlign(Addr->getOperand(0));
1086 }
1087 return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
1088 MVT::i32);
1089}
1090
1091static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) {
1092 SDValue Offset = accumulateOffset(Addr, SDLoc(Addr), DAG);
1093 SDValue Base = selectBaseADDR(Addr, DAG);
1094 return {Base, Offset};
1095}
1096
1097// Select a pair of operands which represent a valid PTX address, this could be
1098// one of the following things:
1099// - [var] - Offset is simply set to 0
1100// - [reg] - Offset is simply set to 0
1101// - [reg+immOff]
1102// - [var+immOff]
1103// Note that immOff must fit into a 32-bit signed integer.
1104bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
1105 SDValue &Offset) {
1106 std::tie(Base, Offset) = selectADDR(Addr, CurDAG);
1107 return true;
1108}
1109
1110bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1111 MemSDNode *LD = cast<MemSDNode>(N);
1112 assert(LD->readMem() && "Expected load");
1113
1114 // do not support pre/post inc/dec
1115 const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
1116 if (PlainLoad && PlainLoad->isIndexed())
1117 return false;
1118
1119 // Address Space Setting
1120 const auto CodeAddrSpace = getAddrSpace(LD);
1121 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1122 return tryLDG(LD);
1123
1124 SDLoc DL(LD);
1125 SDValue Chain = N->getOperand(0);
1126 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1127
1128 const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
1129
1130 // Vector Setting
1131 const unsigned FromType =
1132 (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1135
1136 uint32_t UsedBytesMask;
1137 switch (N->getOpcode()) {
1138 case ISD::LOAD:
1139 case ISD::ATOMIC_LOAD:
1140 UsedBytesMask = UINT32_MAX;
1141 break;
1142 case NVPTXISD::MLoad:
1143 UsedBytesMask = N->getConstantOperandVal(3);
1144 break;
1145 default:
1146 llvm_unreachable("Unexpected opcode");
1147 }
1148
1149 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1150 FromTypeWidth <= 128 && "Invalid width for load");
1151
1152 // Create the machine instruction DAG
1153 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1154 SDValue Ops[] = {getI32Imm(Ordering, DL),
1155 getI32Imm(Scope, DL),
1156 getI32Imm(CodeAddrSpace, DL),
1157 getI32Imm(FromType, DL),
1158 getI32Imm(FromTypeWidth, DL),
1159 getI32Imm(UsedBytesMask, DL),
1160 Base,
1161 Offset,
1162 Chain};
1163
1164 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1165 const std::optional<unsigned> Opcode =
1166 pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
1167 if (!Opcode)
1168 return false;
1169
1170 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1171 if (!NVPTXLD)
1172 return false;
1173
1174 MachineMemOperand *MemRef = LD->getMemOperand();
1175 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1176
1177 ReplaceNode(LD, NVPTXLD);
1178 return true;
1179}
1180
1181static unsigned getStoreVectorNumElts(SDNode *N) {
1182 switch (N->getOpcode()) {
1183 case NVPTXISD::StoreV2:
1184 return 2;
1185 case NVPTXISD::StoreV4:
1186 return 4;
1187 case NVPTXISD::StoreV8:
1188 return 8;
1189 default:
1190 llvm_unreachable("Unexpected opcode");
1191 }
1192}
1193
1194bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1195 MemSDNode *LD = cast<MemSDNode>(N);
1196
1197 // Address Space Setting
1198 const auto CodeAddrSpace = getAddrSpace(LD);
1199 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1200 return tryLDG(LD);
1201
1202 const MVT EltVT = LD->getSimpleValueType(0);
1203 SDLoc DL(LD);
1204 SDValue Chain = LD->getChain();
1205 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1206
1207 // Type Setting: fromType + fromTypeWidth
1208 //
1209 // Sign : ISD::SEXTLOAD
1210 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1211 // type is integer
1212 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1213 // Read at least 8 bits (predicates are stored as 8-bit values)
1214 // Get the original LoadSDNode::getExtensionType() value
1215 const unsigned ExtensionType = N->getConstantOperandVal(4);
1216 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1218 : NVPTX::PTXLdStInstCode::Untyped;
1219
1220 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1221 const uint32_t UsedBytesMask = N->getConstantOperandVal(3);
1222
1223 assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
1224
1225 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1226 SDValue Ops[] = {getI32Imm(Ordering, DL),
1227 getI32Imm(Scope, DL),
1228 getI32Imm(CodeAddrSpace, DL),
1229 getI32Imm(FromType, DL),
1230 getI32Imm(FromTypeWidth, DL),
1231 getI32Imm(UsedBytesMask, DL),
1232 Base,
1233 Offset,
1234 Chain};
1235
1236 std::optional<unsigned> Opcode;
1237 switch (N->getOpcode()) {
1238 default:
1239 llvm_unreachable("Unexpected opcode");
1240 case NVPTXISD::LoadV2:
1241 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v2,
1242 NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
1243 break;
1244 case NVPTXISD::LoadV4:
1245 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v4,
1246 NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
1247 break;
1248 case NVPTXISD::LoadV8:
1249 Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i16 */},
1250 NVPTX::LDV_i32_v8, {/* no v8i64 */});
1251 break;
1252 }
1253 if (!Opcode)
1254 return false;
1255
1256 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1257
1258 MachineMemOperand *MemRef = LD->getMemOperand();
1259 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1260
1261 ReplaceNode(LD, NVPTXLD);
1262 return true;
1263}
1264
1265bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
1266 SDLoc DL(LD);
1267
1268 unsigned ExtensionType;
1269 uint32_t UsedBytesMask;
1270 if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
1271 ExtensionType = Load->getExtensionType();
1272 UsedBytesMask = UINT32_MAX;
1273 } else {
1274 ExtensionType = LD->getConstantOperandVal(4);
1275 UsedBytesMask = LD->getConstantOperandVal(3);
1276 }
1277 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1279 : NVPTX::PTXLdStInstCode::Untyped;
1280
1281 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1282
1283 assert(!(LD->getSimpleValueType(0).isVector() &&
1284 ExtensionType != ISD::NON_EXTLOAD));
1285
1286 const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG);
1287 SDValue Ops[] = {getI32Imm(FromType, DL),
1288 getI32Imm(FromTypeWidth, DL),
1289 getI32Imm(UsedBytesMask, DL),
1290 Base,
1291 Offset,
1292 LD->getChain()};
1293
1294 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1295 std::optional<unsigned> Opcode;
1296 switch (LD->getOpcode()) {
1297 default:
1298 llvm_unreachable("Unexpected opcode");
1299 case ISD::LOAD:
1300 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i16,
1301 NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
1302 break;
1303 case NVPTXISD::MLoad:
1304 Opcode = pickOpcodeForVT(TargetVT, std::nullopt, NVPTX::LD_GLOBAL_NC_i32,
1305 NVPTX::LD_GLOBAL_NC_i64);
1306 break;
1307 case NVPTXISD::LoadV2:
1308 Opcode =
1309 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v2i16,
1310 NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
1311 break;
1312 case NVPTXISD::LoadV4:
1313 Opcode =
1314 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v4i16,
1315 NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
1316 break;
1317 case NVPTXISD::LoadV8:
1318 Opcode = pickOpcodeForVT(TargetVT, {/* no v8i16 */},
1319 NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
1320 break;
1321 }
1322 if (!Opcode)
1323 return false;
1324
1325 SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1326
1327 ReplaceNode(LD, NVPTXLDG);
1328 return true;
1329}
1330
1332 auto TotalWidth = Mem->getMemoryVT().getSizeInBits();
1333 auto NumElts = Mem->getNumValues() - 1;
1334 auto ElementBitWidth = TotalWidth / NumElts;
1335 assert(isPowerOf2_32(ElementBitWidth) && ElementBitWidth >= 8 &&
1336 ElementBitWidth <= 128 && TotalWidth <= 256 &&
1337 "Invalid width for load");
1338 return ElementBitWidth;
1339}
1340
1341bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
1342 auto *LD = cast<MemSDNode>(N);
1343
1344 SDLoc DL(N);
1345 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1346 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1347
1348 // If this is an LDU intrinsic, the address is the third operand. If its an
1349 // LDU SD node (from custom vector handling), then its the second operand
1350 SDValue Addr =
1351 LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1352
1353 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1354 SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()};
1355
1356 std::optional<unsigned> Opcode;
1357 switch (N->getOpcode()) {
1358 default:
1359 llvm_unreachable("Unexpected opcode");
1361 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_i16,
1362 NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
1363 break;
1364 case NVPTXISD::LDUV2:
1365 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v2i16,
1366 NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
1367 break;
1368 case NVPTXISD::LDUV4:
1369 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v4i16,
1370 NVPTX::LDU_GLOBAL_v4i32, {/* no v4i64 */});
1371 break;
1372 }
1373 if (!Opcode)
1374 return false;
1375
1376 SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1377
1378 ReplaceNode(LD, NVPTXLDU);
1379 return true;
1380}
1381
1382bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1383 MemSDNode *ST = cast<MemSDNode>(N);
1384 assert(ST->writeMem() && "Expected store");
1385 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
1386 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
1387 assert((PlainStore || AtomicStore) && "Expected store");
1388
1389 // do not support pre/post inc/dec
1390 if (PlainStore && PlainStore->isIndexed())
1391 return false;
1392
1393 // Address Space Setting
1394 const auto CodeAddrSpace = getAddrSpace(ST);
1395
1396 SDLoc DL(ST);
1397 SDValue Chain = ST->getChain();
1398 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1399
1400 // Vector Setting
1401 const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
1402
1403 // Create the machine instruction DAG
1404 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1405
1406 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1407 "Invalid width for store");
1408
1409 const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
1410 SDValue Ops[] = {selectPossiblyImm(Value),
1411 getI32Imm(Ordering, DL),
1412 getI32Imm(Scope, DL),
1413 getI32Imm(CodeAddrSpace, DL),
1414 getI32Imm(ToTypeWidth, DL),
1415 Base,
1416 Offset,
1417 Chain};
1418
1419 const std::optional<unsigned> Opcode =
1420 pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i16,
1421 NVPTX::ST_i32, NVPTX::ST_i64);
1422 if (!Opcode)
1423 return false;
1424
1425 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1426
1427 if (!NVPTXST)
1428 return false;
1429
1430 MachineMemOperand *MemRef = ST->getMemOperand();
1431 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1432 ReplaceNode(ST, NVPTXST);
1433 return true;
1434}
1435
1436bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1437 MemSDNode *ST = cast<MemSDNode>(N);
1438 const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
1439
1440 // Address Space Setting
1441 const auto CodeAddrSpace = getAddrSpace(ST);
1442 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1443 report_fatal_error("Cannot store to pointer that points to constant "
1444 "memory space");
1445 }
1446
1447 SDLoc DL(ST);
1448 SDValue Chain = ST->getChain();
1449 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1450
1451 const unsigned NumElts = getStoreVectorNumElts(ST);
1452
1454 for (auto &V : ST->ops().slice(1, NumElts))
1455 Ops.push_back(selectPossiblyImm(V));
1456 SDValue Addr = N->getOperand(NumElts + 1);
1457 const unsigned ToTypeWidth = TotalWidth / NumElts;
1458
1459 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1460 TotalWidth <= 256 && "Invalid width for store");
1461
1462 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1463 Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1464 getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
1465 Offset, Chain});
1466
1467 const MVT::SimpleValueType EltVT =
1468 ST->getOperand(1).getSimpleValueType().SimpleTy;
1469 std::optional<unsigned> Opcode;
1470 switch (ST->getOpcode()) {
1471 default:
1472 return false;
1473 case NVPTXISD::StoreV2:
1474 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
1475 NVPTX::STV_i64_v2);
1476 break;
1477 case NVPTXISD::StoreV4:
1478 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
1479 NVPTX::STV_i64_v4);
1480 break;
1481 case NVPTXISD::StoreV8:
1482 Opcode = pickOpcodeForVT(EltVT, {/* no v8i16 */}, NVPTX::STV_i32_v8,
1483 {/* no v8i64 */});
1484 break;
1485 }
1486
1487 if (!Opcode)
1488 return false;
1489
1490 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1491
1492 MachineMemOperand *MemRef = ST->getMemOperand();
1493 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1494
1495 ReplaceNode(ST, NVPTXST);
1496 return true;
1497}
1498
1499/// SelectBFE - Look for instruction sequences that can be made more efficient
1500/// by using the 'bfe' (bit-field extract) PTX instruction
1501bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
1502 SDLoc DL(N);
1503 SDValue LHS = N->getOperand(0);
1504 SDValue RHS = N->getOperand(1);
1505 SDValue Len;
1506 SDValue Start;
1507 SDValue Val;
1508 bool IsSigned = false;
1509
1510 if (N->getOpcode() == ISD::AND) {
1511 // Canonicalize the operands
1512 // We want 'and %val, %mask'
1514 std::swap(LHS, RHS);
1515 }
1516
1517 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
1518 if (!Mask) {
1519 // We need a constant mask on the RHS of the AND
1520 return false;
1521 }
1522
1523 // Extract the mask bits
1524 uint64_t MaskVal = Mask->getZExtValue();
1525 if (!isMask_64(MaskVal)) {
1526 // We *could* handle shifted masks here, but doing so would require an
1527 // 'and' operation to fix up the low-order bits so we would trade
1528 // shr+and for bfe+and, which has the same throughput
1529 return false;
1530 }
1531
1532 // How many bits are in our mask?
1533 int64_t NumBits = countr_one(MaskVal);
1534 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1535
1536 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
1537 // We have a 'srl/and' pair, extract the effective start bit and length
1538 Val = LHS.getNode()->getOperand(0);
1539 Start = LHS.getNode()->getOperand(1);
1540 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
1541 if (StartConst) {
1542 uint64_t StartVal = StartConst->getZExtValue();
1543 // How many "good" bits do we have left? "good" is defined here as bits
1544 // that exist in the original value, not shifted in.
1545 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
1546 if (NumBits > GoodBits) {
1547 // Do not handle the case where bits have been shifted in. In theory
1548 // we could handle this, but the cost is likely higher than just
1549 // emitting the srl/and pair.
1550 return false;
1551 }
1552 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
1553 } else {
1554 // Do not handle the case where the shift amount (can be zero if no srl
1555 // was found) is not constant. We could handle this case, but it would
1556 // require run-time logic that would be more expensive than just
1557 // emitting the srl/and pair.
1558 return false;
1559 }
1560 } else {
1561 // Do not handle the case where the LHS of the and is not a shift. While
1562 // it would be trivial to handle this case, it would just transform
1563 // 'and' -> 'bfe', but 'and' has higher-throughput.
1564 return false;
1565 }
1566 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
1567 if (LHS->getOpcode() == ISD::AND) {
1568 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
1569 if (!ShiftCnst) {
1570 // Shift amount must be constant
1571 return false;
1572 }
1573
1574 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
1575
1576 SDValue AndLHS = LHS->getOperand(0);
1577 SDValue AndRHS = LHS->getOperand(1);
1578
1579 // Canonicalize the AND to have the mask on the RHS
1580 if (isa<ConstantSDNode>(AndLHS)) {
1581 std::swap(AndLHS, AndRHS);
1582 }
1583
1584 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
1585 if (!MaskCnst) {
1586 // Mask must be constant
1587 return false;
1588 }
1589
1590 uint64_t MaskVal = MaskCnst->getZExtValue();
1591 uint64_t NumZeros;
1592 uint64_t NumBits;
1593 if (isMask_64(MaskVal)) {
1594 NumZeros = 0;
1595 // The number of bits in the result bitfield will be the number of
1596 // trailing ones (the AND) minus the number of bits we shift off
1597 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
1598 } else if (isShiftedMask_64(MaskVal)) {
1599 NumZeros = llvm::countr_zero(MaskVal);
1600 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
1601 // The number of bits in the result bitfield will be the number of
1602 // trailing zeros plus the number of set bits in the mask minus the
1603 // number of bits we shift off
1604 NumBits = NumZeros + NumOnes - ShiftAmt;
1605 } else {
1606 // This is not a mask we can handle
1607 return false;
1608 }
1609
1610 if (ShiftAmt < NumZeros) {
1611 // Handling this case would require extra logic that would make this
1612 // transformation non-profitable
1613 return false;
1614 }
1615
1616 Val = AndLHS;
1617 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
1618 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1619
1620 // If pre-shift AND includes the sign bit in the bitfield, we must use
1621 // signed BFE to replicate that bit during bitfield extraction. If the
1622 // sign bit is not part of the mask, unsigned BFE will zero out upper bits
1623 // of the result
1624 if (N->getOpcode() == ISD::SRA)
1625 IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
1626 } else if (LHS->getOpcode() == ISD::SHL) {
1627 // Here, we have a pattern like:
1628 //
1629 // (sra (shl val, NN), MM)
1630 // or
1631 // (srl (shl val, NN), MM)
1632 //
1633 // If MM >= NN, we can efficiently optimize this with bfe
1634 Val = LHS->getOperand(0);
1635
1636 SDValue ShlRHS = LHS->getOperand(1);
1637 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
1638 if (!ShlCnst) {
1639 // Shift amount must be constant
1640 return false;
1641 }
1642 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
1643
1644 SDValue ShrRHS = RHS;
1645 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
1646 if (!ShrCnst) {
1647 // Shift amount must be constant
1648 return false;
1649 }
1650 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
1651
1652 // To avoid extra codegen and be profitable, we need Outer >= Inner
1653 if (OuterShiftAmt < InnerShiftAmt) {
1654 return false;
1655 }
1656
1657 // If the outer shift is more than the type size, we have no bitfield to
1658 // extract (since we also check that the inner shift is <= the outer shift
1659 // then this also implies that the inner shift is < the type size)
1660 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
1661 return false;
1662 }
1663
1664 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
1665 MVT::i32);
1666 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
1667 DL, MVT::i32);
1668
1669 if (N->getOpcode() == ISD::SRA) {
1670 // If we have a arithmetic right shift, we need to use the signed bfe
1671 // variant
1672 IsSigned = true;
1673 }
1674 } else {
1675 // No can do...
1676 return false;
1677 }
1678 } else {
1679 // No can do...
1680 return false;
1681 }
1682
1683
1684 unsigned Opc;
1685 // For the BFE operations we form here from "and" and "srl", always use the
1686 // unsigned variants.
1687 if (Val.getValueType() == MVT::i32) {
1688 if (IsSigned) {
1689 Opc = NVPTX::BFE_S32rii;
1690 } else {
1691 Opc = NVPTX::BFE_U32rii;
1692 }
1693 } else if (Val.getValueType() == MVT::i64) {
1694 if (IsSigned) {
1695 Opc = NVPTX::BFE_S64rii;
1696 } else {
1697 Opc = NVPTX::BFE_U64rii;
1698 }
1699 } else {
1700 // We cannot handle this type
1701 return false;
1702 }
1703
1704 SDValue Ops[] = {
1705 Val, Start, Len
1706 };
1707
1708 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
1709 return true;
1710}
1711
1712// Select bf16/bf16v2 FADD, FSUB, FMUL as fma on targets with only fma
1713bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
1714 EVT VT = SDValue(N, 0).getValueType();
1715 if (VT.getScalarType() != MVT::bf16)
1716 return false;
1717
1718 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1719 if (STI->hasNativeBF16Support(N->getOpcode()))
1720 return false;
1721
1722 const bool IsVec = VT.isVector();
1723 assert(!IsVec || VT.getVectorNumElements() == 2);
1724 SDLoc DL(N);
1725 SDValue N0 = N->getOperand(0);
1726 SDValue N1 = N->getOperand(1);
1727 SmallVector<SDValue, 3> Operands;
1728 auto GetConstant = [&](float Value) -> SDValue {
1729 // BF16 immediates must be legalized to integer register values
1730 APFloat APF(Value);
1731 bool LosesInfo;
1732 APF.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &LosesInfo);
1733 assert(!LosesInfo);
1734 if (IsVec) {
1735 auto API = APF.bitcastToAPInt();
1736 API = API.concat(API);
1737 auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
1738 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
1739 0);
1740 }
1741 auto Const = CurDAG->getTargetConstantFP(APF, DL, VT);
1742 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);
1743 };
1744
1745 switch (N->getOpcode()) {
1746 case ISD::FADD:
1747 // add(a, b) -> fma(a, 1.0, b)
1748 Operands = {N0, GetConstant(1.0), N1};
1749 break;
1750 case ISD::FSUB:
1751 // sub(a, b) -> fma(b, -1.0, a)
1752 Operands = {N1, GetConstant(-1.0), N0};
1753 break;
1754 case ISD::FMUL:
1755 // mul(a, b) -> fma(a, b, -0.0)
1756 // NOTE: The identity is -0, not 0, because -0 + 0 == 0 for floats
1757 Operands = {N0, N1, GetConstant(-0.0)};
1758 break;
1759 default:
1760 llvm_unreachable("Unexpected opcode");
1761 };
1762
1763 int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
1764 MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);
1765 ReplaceNode(N, FMA);
1766 return true;
1767}
1768
1769SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
1770 if (V.getOpcode() == ISD::BITCAST)
1771 V = V.getOperand(0);
1772
1773 if (auto *CN = dyn_cast<ConstantSDNode>(V))
1774 return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
1775 V.getValueType());
1776 if (auto *CN = dyn_cast<ConstantFPSDNode>(V))
1777 return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
1778 V.getValueType());
1779 return V;
1780}
1781
1782/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
1783/// inline asm expressions.
1785 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
1786 std::vector<SDValue> &OutOps) {
1787 switch (ConstraintID) {
1788 default:
1789 return true;
1790 case InlineAsm::ConstraintCode::m: { // memory
1791 const auto [Base, Offset] = selectADDR(Op, CurDAG);
1792 OutOps.push_back(Base);
1793 OutOps.push_back(Offset);
1794 return false;
1795 }
1796 }
1797 return true;
1798}
1799
1800void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
1801 // Lower a CopyToReg with two 64-bit inputs
1802 // Dst:i128, lo:i64, hi:i64
1803 //
1804 // CopyToReg Dst, lo, hi;
1805 //
1806 // ==>
1807 //
1808 // tmp = V2I64toI128 {lo, hi};
1809 // CopyToReg Dst, tmp;
1810 SDValue Dst = N->getOperand(1);
1811 SDValue Lo = N->getOperand(2);
1812 SDValue Hi = N->getOperand(3);
1813
1814 SDLoc DL(N);
1815 SDNode *Mov =
1816 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
1817
1818 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
1819 NewOps[0] = N->getOperand(0);
1820 NewOps[1] = Dst;
1821 NewOps[2] = SDValue(Mov, 0);
1822 if (N->getNumOperands() == 5)
1823 NewOps[3] = N->getOperand(4);
1824 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
1825
1826 ReplaceNode(N, NewValue.getNode());
1827}
1828
1829void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
1830 // Lower CopyFromReg from a 128-bit regs to two 64-bit regs
1831 // Dst:i128, Src:i128
1832 //
1833 // {lo, hi} = CopyFromReg Src
1834 //
1835 // ==>
1836 //
1837 // {lo, hi} = I128toV2I64 Src
1838 //
1839 SDValue Ch = N->getOperand(0);
1840 SDValue Src = N->getOperand(1);
1841 SDValue Glue = N->getOperand(2);
1842 SDLoc DL(N);
1843
1844 // Add Glue and Ch to the operands and results to avoid break the execution
1845 // order
1846 SDNode *Mov = CurDAG->getMachineNode(
1847 NVPTX::I128toV2I64, DL,
1848 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
1849 {Src, Ch, Glue});
1850
1851 ReplaceNode(N, Mov);
1852}
1853
1854bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
1855 SDLoc DL(N);
1856 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
1857 unsigned int FenceOp =
1858 getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
1859 Scopes[N->getConstantOperandVal(2)], Subtarget);
1860 SDValue Chain = N->getOperand(0);
1861 SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
1862 ReplaceNode(N, FenceNode);
1863 return true;
1864}
1865
1867 Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
1868 Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
1869 Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
1870 Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
1871 Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
1872}
1873
1875 if (Scopes.empty())
1876 llvm_unreachable("NVPTX Scopes must be initialized before calling "
1877 "NVPTXScopes::operator[]");
1878
1879 auto S = Scopes.find(ID);
1880 if (S == Scopes.end()) {
1881 auto scopeName = Context->getSyncScopeName(ID);
1882 assert(scopeName.has_value() && "Scope name must exist.");
1883
1884 // Build list of supported syncscopes programmatically
1885 SmallVector<StringRef> supportedScopes;
1886 for (const auto &Entry : Scopes) {
1887 if (auto name = Context->getSyncScopeName(Entry.first))
1888 supportedScopes.push_back(name->empty() ? "<empty string>" : *name);
1889 }
1890
1892 formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n"
1893 "Supported syncscopes are: {2}.",
1894 scopeName.value(), int(ID),
1895 make_range(supportedScopes.begin(), supportedScopes.end())));
1896 }
1897 return S->second;
1898}
1899
1900bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
1901
// Compose a bulk-tensor copy opcode name from its direction (e.g. RED),
// dimensionality (1D..5D), addressing mode (TILE/IM2COL) and optional suffix;
// is_s32 selects the 32-bit shared-pointer (SHARED32) variant.
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

// Shared-to-global reduction (RED) variant: is_ch appends the cache-hint
// (_CH) suffix when a cache-hint operand is present.
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
1910
1912 bool IsShared32,
1913 bool IsCacheHint,
1914 bool IsIm2Col) {
1915 if (IsIm2Col) {
1916 switch (Dim) {
1917 case 3:
1918 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
1919 IsShared32);
1920 case 4:
1921 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
1922 IsShared32);
1923 case 5:
1924 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
1925 IsShared32);
1926 default:
1927 llvm_unreachable("Invalid Dimension in im2col mode for "
1928 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1929 }
1930 } else {
1931 switch (Dim) {
1932 case 1:
1933 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
1934 IsShared32);
1935 case 2:
1936 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
1937 IsShared32);
1938 case 3:
1939 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
1940 IsShared32);
1941 case 4:
1942 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
1943 IsShared32);
1944 case 5:
1945 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
1946 IsShared32);
1947 default:
1948 llvm_unreachable("Invalid Dimension in tile mode for "
1949 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1950 }
1951 }
1952}
1953
1954void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
1955 unsigned RedOp,
1956 bool IsIm2Col) {
1957 // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
1958 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
1959 // NumOperands = {Chain, IID} + {Actual intrinsic args}
1960 // = {2} + {4 + dims}
1961 size_t NumOps = N->getNumOperands();
1962 size_t NumDims = NumOps - 6;
1963 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
1964 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
1965
1966 SDLoc DL(N);
1967 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
1968 Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op
1969 Ops.push_back(N->getOperand(0)); // Chain operand
1970
1971 bool IsShared32 =
1972 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
1974 NumDims, IsShared32, IsCacheHint, IsIm2Col);
1975 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
1976}
1977
// Select the packed or UNPACK flavor of a tcgen05.st opcode for a given
// shape/replication factor; relies on an `enableUnpack` bool being in scope
// at the expansion site.
#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
1981
1982static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
1983 switch (IID) {
1984 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
1985 return TCGEN05_ST_OPCODE(16x64b, x1);
1986 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
1987 return TCGEN05_ST_OPCODE(16x64b, x2);
1988 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
1989 return TCGEN05_ST_OPCODE(16x64b, x4);
1990 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
1991 return TCGEN05_ST_OPCODE(16x64b, x8);
1992 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
1993 return TCGEN05_ST_OPCODE(16x64b, x16);
1994 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
1995 return TCGEN05_ST_OPCODE(16x64b, x32);
1996 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
1997 return TCGEN05_ST_OPCODE(16x64b, x64);
1998 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
1999 return TCGEN05_ST_OPCODE(16x64b, x128);
2000 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2001 return TCGEN05_ST_OPCODE(16x128b, x1);
2002 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2003 return TCGEN05_ST_OPCODE(16x128b, x2);
2004 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2005 return TCGEN05_ST_OPCODE(16x128b, x4);
2006 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2007 return TCGEN05_ST_OPCODE(16x128b, x8);
2008 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2009 return TCGEN05_ST_OPCODE(16x128b, x16);
2010 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2011 return TCGEN05_ST_OPCODE(16x128b, x32);
2012 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2013 return TCGEN05_ST_OPCODE(16x128b, x64);
2014 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2015 return TCGEN05_ST_OPCODE(16x256b, x1);
2016 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2017 return TCGEN05_ST_OPCODE(16x256b, x2);
2018 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2019 return TCGEN05_ST_OPCODE(16x256b, x4);
2020 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2021 return TCGEN05_ST_OPCODE(16x256b, x8);
2022 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2023 return TCGEN05_ST_OPCODE(16x256b, x16);
2024 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2025 return TCGEN05_ST_OPCODE(16x256b, x32);
2026 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2027 return TCGEN05_ST_OPCODE(16x32bx2, x1);
2028 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2029 return TCGEN05_ST_OPCODE(16x32bx2, x2);
2030 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2031 return TCGEN05_ST_OPCODE(16x32bx2, x4);
2032 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2033 return TCGEN05_ST_OPCODE(16x32bx2, x8);
2034 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2035 return TCGEN05_ST_OPCODE(16x32bx2, x16);
2036 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2037 return TCGEN05_ST_OPCODE(16x32bx2, x32);
2038 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2039 return TCGEN05_ST_OPCODE(16x32bx2, x64);
2040 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2041 return TCGEN05_ST_OPCODE(16x32bx2, x128);
2042 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2043 return TCGEN05_ST_OPCODE(32x32b, x1);
2044 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2045 return TCGEN05_ST_OPCODE(32x32b, x2);
2046 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2047 return TCGEN05_ST_OPCODE(32x32b, x4);
2048 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2049 return TCGEN05_ST_OPCODE(32x32b, x8);
2050 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2051 return TCGEN05_ST_OPCODE(32x32b, x16);
2052 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2053 return TCGEN05_ST_OPCODE(32x32b, x32);
2054 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2055 return TCGEN05_ST_OPCODE(32x32b, x64);
2056 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2057 return TCGEN05_ST_OPCODE(32x32b, x128);
2058 }
2059 llvm_unreachable("unhandled tcgen05.st lowering");
2060}
2061
2062void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2063 if (!Subtarget->hasTcgen05InstSupport())
2065 "tcgen05.st is not supported on this architecture variant");
2066
2067 SDLoc DL(N);
2068 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2069
2070 SmallVector<SDValue, 128> Operands = {
2071 N->getOperand(2) // taddr
2072 };
2073
2074 if (hasOffset)
2075 Operands.push_back(CurDAG->getTargetConstant(
2076 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL,
2077 MVT::i32)); // Offset
2078
2079 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2080 Operands.push_back(N->getOperand(I));
2081
2082 bool enableUnpack =
2083 cast<ConstantSDNode>(N->getOperand(N->getNumOperands() - 1))
2084 ->getZExtValue();
2085
2086 Operands.push_back(N->getOperand(0)); // Chain
2087 ReplaceNode(N, CurDAG->getMachineNode(getTcgen05StOpcode(IID, enableUnpack),
2088 DL, N->getVTList(), Operands));
2089}
2090
2091bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2092 unsigned IID = N->getConstantOperandVal(1);
2093 using TMARedTy = llvm::nvvm::TMAReductionOp;
2094 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
2095 switch (IID) {
2096 default:
2097 return false;
2098 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2099 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2100 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2101 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2102 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2103 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
2104 return true;
2105 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2106 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2107 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2108 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
2109 /*IsIm2Col=*/true);
2110 return true;
2111 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2112 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2113 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2114 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2115 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2116 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
2117 return true;
2118 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2119 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2120 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2121 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
2122 /*IsIm2Col=*/true);
2123 return true;
2124 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2125 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2126 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2127 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2128 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2129 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
2130 return true;
2131 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2132 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2133 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2134 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
2135 /*IsIm2Col=*/true);
2136 return true;
2137 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2138 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2139 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2140 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2141 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2142 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
2143 return true;
2144 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2145 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2146 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2147 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
2148 /*IsIm2Col=*/true);
2149 return true;
2150 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2151 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2152 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2153 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2154 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2155 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
2156 return true;
2157 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2158 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2159 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2160 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
2161 /*IsIm2Col=*/true);
2162 return true;
2163 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2164 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2165 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2166 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2167 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2168 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
2169 return true;
2170 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2171 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2172 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2173 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
2174 /*IsIm2Col=*/true);
2175 return true;
2176 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2177 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2178 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2179 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2180 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2181 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
2182 return true;
2183 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2184 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2185 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2186 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
2187 /*IsIm2Col=*/true);
2188 return true;
2189 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2190 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2191 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2192 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2193 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2194 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
2195 return true;
2196 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2197 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2198 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2199 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
2200 /*IsIm2Col=*/true);
2201 return true;
2202
2203 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2204 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2205 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2206 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2207 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2208 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2209 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2210 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2211 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2212 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2213 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2214 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2215 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2216 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2217 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2218 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2219 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2220 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2221 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2222 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2223 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2224 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2225 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2226 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2227 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2228 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2229 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2230 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2231 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2232 SelectTcgen05St(N);
2233 return true;
2234 }
2235
2236 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2237 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2238 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2239 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2240 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2241 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2242 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2243 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2244 SelectTcgen05St(N, /* hasOffset */ true);
2245 return true;
2246 }
2247 }
2248}
2249
2250void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
2251 MemSDNode *AN = cast<MemSDNode>(N);
2252 SDLoc dl(N);
2253
2254 const SDValue Chain = N->getOperand(0);
2255 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
2257 Ops.append(N->op_begin() + 2, N->op_end());
2258 Ops.append({
2259 getI32Imm(getMemOrder(AN), dl),
2260 getI32Imm(getAtomicScope(AN), dl),
2261 getI32Imm(getAddrSpace(AN), dl),
2262 Chain,
2263 });
2264
2265 assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
2266 N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
2267 unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
2268 ? NVPTX::ATOM_EXCH_B128
2269 : NVPTX::ATOM_CAS_B128;
2270
2271 auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
2272 CurDAG->setNodeMemRefs(ATOM, AN->getMemOperand());
2273
2274 ReplaceNode(N, ATOM);
2275}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define DEBUG_TYPE
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
#define T
static unsigned getStoreVectorNumElts(SDNode *N)
static bool isAddLike(const SDValue V)
static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG)
static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG)
static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack)
static std::optional< unsigned > pickOpcodeForVT(MVT::SimpleValueType VT, std::optional< unsigned > Opcode_i16, std::optional< unsigned > Opcode_i32, std::optional< unsigned > Opcode_i64)
static cl::opt< bool > EnableMADWide("nvptx-mad-wide-opt", cl::init(false), cl::Hidden, cl::desc("Enable MAD wide optimization"))
static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, bool IsIm2Col)
#define TCGEN05_LD_OPCODE(SHAPE, NUM)
static SDValue stripAssertAlign(SDValue N)
static cl::opt< bool > EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden, cl::desc("Enable reciprocal sqrt optimization"))
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, NVPTXSubtarget const *T)
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)
#define TCGEN05_ST_OPCODE(SHAPE, NUM)
static std::optional< NVPTX::AddressSpace > convertAS(unsigned AS)
static std::pair< SDValue, SDValue > selectADDR(SDValue Addr, SelectionDAG *DAG)
static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack)
static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget, NVPTX::AddressSpace CodeAddrSpace)
This file contains the definitions of the enumerations and flags associated with NVVM Intrinsics,...
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static const char * name
#define PASS_NAME
Value * RHS
Value * LHS
static const fltSemantics & BFloat()
Definition APFloat.h:295
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
const SDValue & getVal() const
uint64_t getZExtValue() const
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT getVectorElementType() const
bool is64BitVector() const
Return true if this is a 64-bit vector type.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
static NVPTX::AddressSpace getAddrSpace(const MemSDNode *N)
bool SelectInlineAsmMemoryOperand(const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector< SDValue > &OutOps) override
SelectInlineAsmMemoryOperand - Implement addressing mode selection for inline asm expressions.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
const NVPTXSubtarget * Subtarget
const NVPTXTargetLowering * getTargetLowering() const override
bool hasNativeBF16Support(int Opcode) const
bool hasRelaxedMMIO() const
bool hasMemoryOrdering() const
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getTargetFrameIndex(int FI, EVT VT)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
const SDValue & getValue() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ AssertAlign
AssertAlign - These nodes record if a register contains a value that has a known alignment and the tr...
Definition ISDOpcodes.h:69
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
std::string ScopeToString(Scope S)
@ SharedCluster
Definition NVPTX.h:186
std::string OrderingToString(Ordering Order)
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:251
@ DefaultDevice
Definition NVPTX.h:175
@ RelaxedMMIO
Definition NVPTX.h:165
@ AcquireRelease
Definition NVPTX.h:161
@ NotAtomic
Definition NVPTX.h:154
@ SequentiallyConsistent
Definition NVPTX.h:162
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:293
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOptLevel OptLevel)
createNVPTXISelDag - This pass converts a legalized DAG into a NVPTX-specific DAG,...
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
auto formatv(bool Validate, const char *Fmt, Ts &&...Vals)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:261
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:867
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
NVPTXScopes()=default
NVPTX::Scope operator[](SyncScope::ID ID) const