NVPTXISelDAGToDAG.cpp
1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTX.h"
15#include "NVPTXUtilities.h"
16#include "llvm/ADT/APInt.h"
21#include "llvm/IR/GlobalValue.h"
23#include "llvm/IR/IntrinsicsNVPTX.h"
30#include <optional>
31
32using namespace llvm;
33
34#define DEBUG_TYPE "nvptx-isel"
35#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
36
37static cl::opt<bool>
38 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
39 cl::desc("Enable reciprocal sqrt optimization"));
40
41// FIXME: This is a WAR to recover lost performance from #155024.
42// We still need to investigate the regression and find a more permanent
43// solution.
44static cl::opt<bool> EnableMADWide("nvptx-mad-wide-opt", cl::init(false),
46 cl::desc("Enable MAD wide optimization"));
47
48/// createNVPTXISelDag - This pass converts a legalized DAG into a
49/// NVPTX-specific DAG, ready for instruction scheduling.
54
59
61
63
67
68bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
69 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
70 Scopes = NVPTXScopes(MF.getFunction().getContext());
71 return SelectionDAGISel::runOnMachineFunction(MF);
72}
73
74NVPTX::DivPrecisionLevel
75NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
76 return Subtarget->getTargetLowering()->getDivF32Level(*MF, *N);
77}
78
79bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
80 return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N);
81}
82
83bool NVPTXDAGToDAGISel::useF32FTZ() const {
84 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
85}
86
87bool NVPTXDAGToDAGISel::allowFMA() const {
88 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
89 return TL->allowFMA(*MF, OptLevel);
90}
91
92bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
93
94bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
95
96/// Select - Select instructions not customized! Used for
97/// expanded, promoted and normal instructions.
98void NVPTXDAGToDAGISel::Select(SDNode *N) {
99
100 if (N->isMachineOpcode()) {
101 N->setNodeId(-1);
102 return; // Already selected.
103 }
104
105 switch (N->getOpcode()) {
106 case ISD::LOAD:
107 case ISD::ATOMIC_LOAD:
108 if (tryLoad(N))
109 return;
110 break;
111 case ISD::STORE:
112 case ISD::ATOMIC_STORE:
113 if (tryStore(N))
114 return;
115 break;
116 case ISD::ATOMIC_FENCE:
117 if (tryFence(N))
118 return;
119 break;
120 case NVPTXISD::UNPACK_VECTOR:
121 tryUNPACK_VECTOR(N);
122 return;
123 case ISD::EXTRACT_VECTOR_ELT:
124 if (tryEXTRACT_VECTOR_ELEMENT(N))
125 return;
126 break;
127 case NVPTXISD::SETP_F16X2:
128 SelectSETP_F16X2(N);
129 return;
130 case NVPTXISD::SETP_BF16X2:
131 SelectSETP_BF16X2(N);
132 return;
133 case NVPTXISD::LoadV2:
134 case NVPTXISD::LoadV4:
135 case NVPTXISD::LoadV8:
136 if (tryLoadVector(N))
137 return;
138 break;
139 case NVPTXISD::LDUV2:
140 case NVPTXISD::LDUV4:
141 if (tryLDU(N))
142 return;
143 break;
144 case NVPTXISD::StoreV2:
145 case NVPTXISD::StoreV4:
146 case NVPTXISD::StoreV8:
147 if (tryStoreVector(N))
148 return;
149 break;
150 case ISD::INTRINSIC_W_CHAIN:
151 if (tryIntrinsicChain(N))
152 return;
153 break;
154 case ISD::INTRINSIC_VOID:
155 if (tryIntrinsicVoid(N))
156 return;
157 break;
158 case ISD::AND:
159 case ISD::SRA:
160 case ISD::SRL:
161 // Try to select BFE
162 if (tryBFE(N))
163 return;
164 break;
165 case ISD::ADDRSPACECAST:
166 SelectAddrSpaceCast(N);
167 return;
168 case ISD::CopyToReg: {
169 if (N->getOperand(1).getValueType() == MVT::i128) {
170 SelectV2I64toI128(N);
171 return;
172 }
173 break;
174 }
175 case ISD::CopyFromReg: {
176 if (N->getOperand(1).getValueType() == MVT::i128) {
177 SelectI128toV2I64(N);
178 return;
179 }
180 break;
181 }
184 selectAtomicSwap128(N);
185 return;
186 case ISD::FADD:
187 case ISD::FMUL:
188 case ISD::FSUB:
189 if (tryBF16ArithToFMA(N))
190 return;
191 break;
192 default:
193 break;
194 }
195 SelectCode(N);
196}
197
198#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
199 (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
200 : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
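// For illustration (following the macro above): TCGEN05_LD_OPCODE(16x64b, x2)
// yields NVPTX::TCGEN05_LD_16x64b_x2_PACK when enablePack is true and
// NVPTX::TCGEN05_LD_16x64b_x2 otherwise, so the switch below only has to name
// the shape/num pair for each intrinsic.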
201
202static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
203 switch (IID) {
204 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
205 return TCGEN05_LD_OPCODE(16x64b, x1);
206 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
207 return TCGEN05_LD_OPCODE(16x64b, x2);
208 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
209 return TCGEN05_LD_OPCODE(16x64b, x4);
210 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
211 return TCGEN05_LD_OPCODE(16x64b, x8);
212 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
213 return TCGEN05_LD_OPCODE(16x64b, x16);
214 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
215 return TCGEN05_LD_OPCODE(16x64b, x32);
216 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
217 return TCGEN05_LD_OPCODE(16x64b, x64);
218 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
219 return TCGEN05_LD_OPCODE(16x64b, x128);
220 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
221 return TCGEN05_LD_OPCODE(16x128b, x1);
222 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
223 return TCGEN05_LD_OPCODE(16x128b, x2);
224 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
225 return TCGEN05_LD_OPCODE(16x128b, x4);
226 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
227 return TCGEN05_LD_OPCODE(16x128b, x8);
228 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
229 return TCGEN05_LD_OPCODE(16x128b, x16);
230 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
231 return TCGEN05_LD_OPCODE(16x128b, x32);
232 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
233 return TCGEN05_LD_OPCODE(16x128b, x64);
234 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
235 return TCGEN05_LD_OPCODE(16x256b, x1);
236 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
237 return TCGEN05_LD_OPCODE(16x256b, x2);
238 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
239 return TCGEN05_LD_OPCODE(16x256b, x4);
240 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
241 return TCGEN05_LD_OPCODE(16x256b, x8);
242 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
243 return TCGEN05_LD_OPCODE(16x256b, x16);
244 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
245 return TCGEN05_LD_OPCODE(16x256b, x32);
246 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
247 return TCGEN05_LD_OPCODE(16x32bx2, x1);
248 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
249 return TCGEN05_LD_OPCODE(16x32bx2, x2);
250 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
251 return TCGEN05_LD_OPCODE(16x32bx2, x4);
252 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
253 return TCGEN05_LD_OPCODE(16x32bx2, x8);
254 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
255 return TCGEN05_LD_OPCODE(16x32bx2, x16);
256 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
257 return TCGEN05_LD_OPCODE(16x32bx2, x32);
258 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
259 return TCGEN05_LD_OPCODE(16x32bx2, x64);
260 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
261 return TCGEN05_LD_OPCODE(16x32bx2, x128);
262 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
263 return TCGEN05_LD_OPCODE(32x32b, x1);
264 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
265 return TCGEN05_LD_OPCODE(32x32b, x2);
266 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
267 return TCGEN05_LD_OPCODE(32x32b, x4);
268 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
269 return TCGEN05_LD_OPCODE(32x32b, x8);
270 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
271 return TCGEN05_LD_OPCODE(32x32b, x16);
272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
273 return TCGEN05_LD_OPCODE(32x32b, x32);
274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
275 return TCGEN05_LD_OPCODE(32x32b, x64);
276 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
277 return TCGEN05_LD_OPCODE(32x32b, x128);
278 }
279 llvm_unreachable("unhandled tcgen05.ld lowering");
280}
281
282void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
283 if (!Subtarget->hasTcgen05InstSupport())
285 "tcgen05.ld is not supported on this architecture variant");
286
287 SDLoc DL(N);
288 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
289
290 if (hasOffset) {
291 bool enablePack = cast<ConstantSDNode>(N->getOperand(4))->getZExtValue();
292 auto OffsetNode = CurDAG->getTargetConstant(
293 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL, MVT::i32);
294 ReplaceNode(N, CurDAG->getMachineNode(
295 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
296 {N->getOperand(2), OffsetNode, N->getOperand(0)}));
297 } else {
298 bool enablePack = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
299 ReplaceNode(N, CurDAG->getMachineNode(
300 getTcgen05LdOpcode(IID, enablePack), DL, N->getVTList(),
301 {N->getOperand(2), N->getOperand(0)}));
302 }
303}
304
305bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
306 unsigned IID = N->getConstantOperandVal(1);
307 switch (IID) {
308 default:
309 return false;
310 case Intrinsic::nvvm_ldu_global_f:
311 case Intrinsic::nvvm_ldu_global_i:
312 case Intrinsic::nvvm_ldu_global_p:
313 return tryLDU(N);
314
315 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
316 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
317 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
318 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
319 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
320 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
321 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
322 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
323 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
324 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
325 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
326 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
327 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
328 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
329 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
330 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
331 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
332 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
333 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
334 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
335 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
336 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
337 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
338 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
339 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
340 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
341 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
342 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
343 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
344 SelectTcgen05Ld(N);
345 return true;
346 }
347
348 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
349 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
350 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
351 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
352 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
353 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
354 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
355 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
356 SelectTcgen05Ld(N, /* hasOffset */ true);
357 return true;
358 }
359 }
360}
361
362// Map an ISD::CONDCODE value to the corresponding CmpMode expected by
363// NVPTXInstPrinter::printCmpMode().
364SDValue NVPTXDAGToDAGISel::getPTXCmpMode(const CondCodeSDNode &CondCode) {
365 using NVPTX::PTXCmpMode::CmpMode;
366 const unsigned PTXCmpMode = [](ISD::CondCode CC) {
367 switch (CC) {
368 default:
369 llvm_unreachable("Unexpected condition code.");
370 case ISD::SETOEQ:
371 case ISD::SETEQ:
372 return CmpMode::EQ;
373 case ISD::SETOGT:
374 case ISD::SETGT:
375 return CmpMode::GT;
376 case ISD::SETOGE:
377 case ISD::SETGE:
378 return CmpMode::GE;
379 case ISD::SETOLT:
380 case ISD::SETLT:
381 return CmpMode::LT;
382 case ISD::SETOLE:
383 case ISD::SETLE:
384 return CmpMode::LE;
385 case ISD::SETONE:
386 case ISD::SETNE:
387 return CmpMode::NE;
388 case ISD::SETO:
389 return CmpMode::NUM;
390 case ISD::SETUO:
391 return CmpMode::NotANumber;
392 case ISD::SETUEQ:
393 return CmpMode::EQU;
394 case ISD::SETUGT:
395 return CmpMode::GTU;
396 case ISD::SETUGE:
397 return CmpMode::GEU;
398 case ISD::SETULT:
399 return CmpMode::LTU;
400 case ISD::SETULE:
401 return CmpMode::LEU;
402 case ISD::SETUNE:
403 return CmpMode::NEU;
404 }
405 }(CondCode.get());
406 return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
407}
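// For example, an ordered less-than (ISD::SETOLT / ISD::SETLT) maps to
// CmpMode::LT while the unordered ISD::SETULT maps to CmpMode::LTU; the
// returned target constant is later printed by NVPTXInstPrinter::printCmpMode()
// as the PTX comparison suffix of the setp instruction (e.g. "lt" vs. "ltu").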
408
409bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
410 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
411 SDLoc DL(N);
412 SDNode *SetP = CurDAG->getMachineNode(
413 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
414 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
415 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
416 ReplaceNode(N, SetP);
417 return true;
418}
419
420bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
421 SDValue PTXCmpMode = getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)));
422 SDLoc DL(N);
423 SDNode *SetP = CurDAG->getMachineNode(
424 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
425 {N->getOperand(0), N->getOperand(1), PTXCmpMode,
426 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
427 ReplaceNode(N, SetP);
428 return true;
429}
430
431bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
432 SDValue Vector = N->getOperand(0);
433 MVT EltVT = N->getSimpleValueType(0);
434
435 MachineSDNode *N2 =
436 CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
437
438 ReplaceNode(N, N2);
439 return true;
440}
441
442// Find all instances of extract_vector_elt that use this v2f16 vector
443// and coalesce them into a scattering move instruction.
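// For example, if both halves of a v2f16 value %v are extracted, the two
// extract_vector_elt nodes are rewritten to use the two results of a single
// I32toV2I16 machine node (a "mov.b32 {%h0, %h1}, %r"-style split) instead of
// two separate extractions.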
444bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
445 SDValue Vector = N->getOperand(0);
446
447 MVT VT = Vector.getSimpleValueType();
448 if (!(NVPTX::isPackedVectorTy(VT) && VT.getVectorNumElements() == 2))
449 return false;
450
451 unsigned Opcode;
452 if (VT.is32BitVector())
453 Opcode = NVPTX::I32toV2I16;
454 else if (VT.is64BitVector())
455 Opcode = NVPTX::I64toV2I32;
456 else
457 llvm_unreachable("Unhandled packed type");
458
459 // Find and record all uses of this vector that extract element 0 or 1.
460 SmallVector<SDNode *, 4> E0, E1;
461 for (auto *U : Vector.getNode()->users()) {
462 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
463 continue;
464 if (U->getOperand(0) != Vector)
465 continue;
466 if (const ConstantSDNode *IdxConst =
467 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
468 if (IdxConst->getZExtValue() == 0)
469 E0.push_back(U);
470 else if (IdxConst->getZExtValue() == 1)
471 E1.push_back(U);
472 else
473 llvm_unreachable("Invalid vector index.");
474 }
475 }
476
477 // There's no point scattering f16x2 if we only ever access one
478 // element of it.
479 if (E0.empty() || E1.empty())
480 return false;
481
482 // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1))
483 // into EltTy,EltTy Split[EltTy]x2(V)
484 MVT EltVT = VT.getVectorElementType();
485 SDNode *ScatterOp =
486 CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
487 for (auto *Node : E0)
488 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
489 for (auto *Node : E1)
490 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
491
492 return true;
493}
494
495static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
496 switch (AS) {
511 default:
512 return std::nullopt;
513 }
514}
515
516NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) {
517 return convertAS(N->getMemOperand()->getAddrSpace())
518 .value_or(NVPTX::AddressSpace::Generic);
519}
520
521NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
522 // No "sem" orderings for SM/PTX versions which do not support memory ordering
525 auto Ordering = N->getMergedOrdering();
526 switch (Ordering) {
540 }
541 llvm_unreachable("Invalid atomic ordering");
542}
543
544NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
545 // No "scope" modifier for SM/PTX versions which do not support scoped atomics
546 // Functionally, these atomics are at device scope
547 if (!Subtarget->hasAtomScope())
549 return Scopes[N->getSyncScopeID()];
550}
551
552namespace {
553
554struct OperationOrderings {
555 NVPTX::Ordering InstructionOrdering, FenceOrdering;
556 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
557 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
558 : InstructionOrdering(IO), FenceOrdering(FO) {}
559};
560
561static OperationOrderings
562getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
563 AtomicOrdering Ordering = N->getSuccessOrdering();
564 auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
565
566 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
567 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
568
569 // clang-format off
570
571 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
572 // Note: uses of Relaxed in the Atomic column of this table refer
573 // to LLVM AtomicOrdering::Monotonic.
574 //
575 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
576 // |---------|----------|--------------------|------------|------------------------------|
577 // | No | No | All | plain | .weak |
578 // | No | Yes | Generic,Shared, | .volatile | .volatile |
579 // | | | Global [0] | | |
580 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
581 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
582 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
583 // | | | Global [0] | | |
584 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
585 // | | | Global [0] | | |
586 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
587 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
588 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
589 // | | | | | or .volatile (PTX 8.1-) |
590 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
591 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
592 // | | | / Global [0] | | |
593
594 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
595 // by following the ABI proven sound in:
596 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
597 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
598 //
599 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
600 // |------------------------------------------------------|-------------------------------|
601 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
602 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
603 // |------------------------------------------------------|-------------------------------|
604 // | cuda::atomic_load | fence.sc.<scope>; |
605 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
606 // |------------------------------------------------------|-------------------------------|
607 // | cuda::atomic_store | fence.sc.<scope>; |
608 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
609 // |------------------------------------------------------|-------------------------------|
610 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
611 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
612
613 // clang-format on
614
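 // Worked example (following the tables above, assuming the default system
 // sync scope): an LLVM IR
 //   %v = load atomic i32, ptr addrspace(1) %p seq_cst, align 4
 // is selected as a fence.sc.sys followed by an ld.acquire.sys load, while the
 // same load with monotonic ordering on sm_70+ becomes a single relaxed load
 // (ld.relaxed.sys).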
615 // [0]: volatile and atomics are only supported on global or shared
616 // memory locations, accessed via generic/shared/global pointers.
617 // MMIO is only supported on global memory locations,
618 // accessed via generic/global pointers.
619 // TODO: Implement MMIO access via generic pointer to global.
620 // Currently implemented for global pointers only.
621
622 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
623 // PTX instructions fails to preserve their C++ side-effects.
624 //
625 // Example (https://github.com/llvm/llvm-project/issues/62057):
626 //
627 // void example() {
628 // std::atomic<bool> True = true;
629 // while (True.load(std::memory_order_relaxed));
630 // }
631 //
632 // A C++ program that calls "example" is well-defined: the infinite loop
633 // performs an atomic operation. By lowering volatile/atomics to
634 // "weak" memory operations, we are transforming the above into:
635 //
636 // void undefined_behavior() {
637 // bool True = true;
638 // while (True);
639 // }
640 //
641 // which exhibits undefined behavior in both C++ and PTX.
642 //
643 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
644 // behavior due to lack of Independent Forward Progress. Lowering these
645 // to weak memory operations in sm_60- is therefore fine.
646 //
647 // TODO: lower atomic and volatile operations to memory locations
648 // in local, const, and param to two PTX instructions in sm_70+:
649 // - the "weak" memory instruction we are currently lowering to, and
650 // - some other instruction that preserves the side-effect, e.g.,
651 // a dead dummy volatile load.
652 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
653 CodeAddrSpace == NVPTX::AddressSpace::Const ||
654 CodeAddrSpace == NVPTX::AddressSpace::Param) {
656 }
657
658 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
659 // supported on sm_60 and older; this includes volatile atomics.
660 if (!(Ordering == AtomicOrdering::NotAtomic ||
661 Ordering == AtomicOrdering::Unordered ||
662 Ordering == AtomicOrdering::Monotonic) &&
663 !HasMemoryOrdering) {
665 formatv("PTX does not support \"atomic\" for orderings different than"
666 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
667 "is: \"{}\".",
668 toIRString(Ordering)));
669 }
670
671 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
672 // the volatile semantics and preserve the atomic ones.
673
674 // PTX volatile and PTX atomics are not available for statespace that differ
675 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
676 // atomics is undefined if the generic address does not refer to a .global or
677 // .shared memory location.
678 bool AddrGenericOrGlobalOrShared =
679 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
680 CodeAddrSpace == NVPTX::AddressSpace::Global ||
681 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
682 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
683 if (!AddrGenericOrGlobalOrShared)
684 return NVPTX::Ordering::NotAtomic;
685
686 bool UseRelaxedMMIO =
687 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
688
689 switch (Ordering) {
691 return N->isVolatile() ? NVPTX::Ordering::Volatile
694 // We lower unordered in the exact same way as 'monotonic' to respect
695 // LLVM IR atomicity requirements.
697 if (N->isVolatile())
698 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
699 : NVPTX::Ordering::Volatile;
700 else
701 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
702 : NVPTX::Ordering::Volatile;
703 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
704 // Acquire.
706 if (!N->readMem())
708 formatv("PTX only supports Acquire Ordering on reads: {}",
709 N->getOperationName()));
712 if (!N->writeMem())
714 formatv("PTX only supports Release Ordering on writes: {}",
715 N->getOperationName()));
719 formatv("NVPTX does not support AcquireRelease Ordering on "
720 "read-modify-write "
721 "yet and PTX does not support it on loads or stores: {}",
722 N->getOperationName()));
723 }
724 case AtomicOrdering::SequentiallyConsistent: {
725 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
726 // sequence including a "fence.sc.sco" and the memory instruction with an
727 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
728 // whether the memory operation is a read, write, or read-modify-write.
729 //
730 // This sets the ordering of the fence to SequentiallyConsistent, and
731 // sets the corresponding ordering for the instruction.
732 NVPTX::Ordering InstrOrder;
733 if (N->readMem())
734 InstrOrder = NVPTX::Ordering::Acquire;
735 else if (N->writeMem())
736 InstrOrder = NVPTX::Ordering::Release;
737 else
739 formatv("NVPTX does not support SequentiallyConsistent Ordering on "
740 "read-modify-writes yet: {}",
741 N->getOperationName()));
742 return OperationOrderings(InstrOrder,
743 NVPTX::Ordering::SequentiallyConsistent);
744 }
745 }
747 formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
748 toIRString(Ordering)));
749}
750
751} // namespace
752
753NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
754 NVPTX::Ordering O) const {
755 switch (O) {
756 case NVPTX::Ordering::NotAtomic:
757 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
758 // NVPTX uses Thread scope as the scope of non-atomic operations.
759 return NVPTX::Scope::Thread;
760 case NVPTX::Ordering::RelaxedMMIO:
761 // RelaxedMMIO operations are always system scope.
762 // If a RelaxedMMIO order was generated from an atomic volatile operation
763 // with a smaller thread scope, we bump it here to system scope.
764 return NVPTX::Scope::System;
765 case NVPTX::Ordering::Relaxed:
766 case NVPTX::Ordering::Acquire:
767 case NVPTX::Ordering::Release:
768 case NVPTX::Ordering::AcquireRelease:
769 case NVPTX::Ordering::SequentiallyConsistent: {
770 auto S = Scopes[N->getSyncScopeID()];
771
772 // Atomic operations must have a scope greater than thread.
773 if (S == NVPTX::Scope::Thread)
775 formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
776
777 // If scope is cluster, clusters must be supported.
778 if (S == NVPTX::Scope::Cluster)
779 Subtarget->failIfClustersUnsupported("cluster scope");
780
781 // If operation is volatile, then its scope is system.
782 return N->isVolatile() ? NVPTX::Scope::System : S;
783 }
784 llvm_unreachable("unhandled ordering");
785}
786
787static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
788 NVPTX::AddressSpace CodeAddrSpace) {
789 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
790 // space.
791 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
792 N.isInvariant();
793}
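// For example, a load from addrspace(1) carrying !invariant.load metadata
// reaches this point as an invariant MemSDNode and is selected through tryLDG
// into one of the LD_GLOBAL_NC_* opcodes, i.e. a PTX ld.global.nc, instead of
// an ordinary ld.global.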
794
795static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
796 NVPTXSubtarget const *T) {
797 if (S == NVPTX::Scope::Cluster)
798 T->failIfClustersUnsupported(".cluster scope fence");
799
800 // Fall back to .acq_rel if .acquire, .release is not supported.
801 if (!T->hasSplitAcquireAndReleaseFences() &&
802 (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
803 O = NVPTX::Ordering::AcquireRelease;
804
805 switch (O) {
806 case NVPTX::Ordering::Acquire:
807 switch (S) {
808 case NVPTX::Scope::System:
809 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
810 : NVPTX::INT_MEMBAR_SYS;
811 case NVPTX::Scope::Block:
812 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
813 : NVPTX::INT_MEMBAR_CTA;
814 case NVPTX::Scope::Cluster:
815 return NVPTX::atomic_thread_fence_acquire_cluster;
816 case NVPTX::Scope::Device:
817 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
818 : NVPTX::INT_MEMBAR_GL;
822 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
823 ScopeToString(S)));
824 }
825 break;
826 case NVPTX::Ordering::Release:
827 switch (S) {
828 case NVPTX::Scope::System:
829 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
830 : NVPTX::INT_MEMBAR_SYS;
831 case NVPTX::Scope::Block:
832 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
833 : NVPTX::INT_MEMBAR_CTA;
834 case NVPTX::Scope::Cluster:
835 return NVPTX::atomic_thread_fence_release_cluster;
836 case NVPTX::Scope::Device:
837 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
838 : NVPTX::INT_MEMBAR_GL;
842 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
843 ScopeToString(S)));
844 }
845 break;
846 case NVPTX::Ordering::AcquireRelease: {
847 switch (S) {
848 case NVPTX::Scope::System:
849 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
850 : NVPTX::INT_MEMBAR_SYS;
851 case NVPTX::Scope::Block:
852 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
853 : NVPTX::INT_MEMBAR_CTA;
854 case NVPTX::Scope::Cluster:
855 return NVPTX::atomic_thread_fence_acq_rel_cluster;
856 case NVPTX::Scope::Device:
857 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
858 : NVPTX::INT_MEMBAR_GL;
862 formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
863 ScopeToString(S)));
864 }
865 break;
866 }
867 case NVPTX::Ordering::SequentiallyConsistent: {
868 switch (S) {
869 case NVPTX::Scope::System:
870 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
871 : NVPTX::INT_MEMBAR_SYS;
872 case NVPTX::Scope::Block:
873 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
874 : NVPTX::INT_MEMBAR_CTA;
875 case NVPTX::Scope::Cluster:
876 return NVPTX::atomic_thread_fence_seq_cst_cluster;
877 case NVPTX::Scope::Device:
878 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
879 : NVPTX::INT_MEMBAR_GL;
882 report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
883 ScopeToString(S)));
884 }
885 break;
886 }
892 formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
893 OrderingToString(O), ScopeToString(S)));
894 }
895 llvm_unreachable("unhandled ordering");
896}
897
898// Returns Memory Order and Scope of a memory instruction, and
899// inserts any fence before the instruction that's required to
900// implement its memory ordering.
901std::pair<NVPTX::Ordering, NVPTX::Scope>
902NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
903 MemSDNode *N) {
904 auto [InstructionOrdering, FenceOrdering] =
905 getOperationOrderings(N, Subtarget);
906 auto Scope = getOperationScope(N, InstructionOrdering);
907
908 // If a fence is required before the operation, insert it:
909 switch (NVPTX::Ordering(FenceOrdering)) {
910 case NVPTX::Ordering::NotAtomic:
911 break;
912 case NVPTX::Ordering::SequentiallyConsistent: {
913 auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
914 Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
915 break;
916 }
917 default:
919 formatv("Unexpected fence ordering: \"{}\".",
920 OrderingToString(NVPTX::Ordering(FenceOrdering))));
921 }
922 return {InstructionOrdering, Scope};
923}
924
925void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
926 SDValue Src = N->getOperand(0);
927 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
928 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
929 unsigned DstAddrSpace = CastN->getDestAddressSpace();
930 SDLoc DL(N);
931 assert(SrcAddrSpace != DstAddrSpace &&
932 "addrspacecast must be between different address spaces");
933
934 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
935 // Specific to generic
936
937 if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
938 SDValue CvtNone =
939 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
940 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
941 Src, CvtNone);
942 Src = SDValue(Cvt, 0);
943 }
944
945 unsigned Opc;
946 switch (SrcAddrSpace) {
947 default: report_fatal_error("Bad address space in addrspacecast");
948 case ADDRESS_SPACE_GLOBAL:
949 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
950 break;
951 case ADDRESS_SPACE_SHARED:
952 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
953 break;
954 case ADDRESS_SPACE_SHARED_CLUSTER:
955 if (!TM.is64Bit())
956 report_fatal_error(
957 "Shared cluster address space is only supported in 64-bit mode");
958 Opc = NVPTX::cvta_shared_cluster_64;
959 break;
960 case ADDRESS_SPACE_CONST:
961 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
962 break;
963 case ADDRESS_SPACE_LOCAL:
964 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
965 break;
966 case ADDRESS_SPACE_PARAM:
967 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
968 break;
969 }
970 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src));
971 return;
972 } else {
973 // Generic to specific
974 if (SrcAddrSpace != 0)
975 report_fatal_error("Cannot cast between two non-generic address spaces");
976 unsigned Opc;
977 switch (DstAddrSpace) {
978 default: report_fatal_error("Bad address space in addrspacecast");
979 case ADDRESS_SPACE_GLOBAL:
980 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
981 break;
982 case ADDRESS_SPACE_SHARED:
983 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
984 break;
985 case ADDRESS_SPACE_SHARED_CLUSTER:
986 if (!TM.is64Bit())
987 report_fatal_error(
988 "Shared cluster address space is only supported in 64-bit mode");
989 Opc = NVPTX::cvta_to_shared_cluster_64;
990 break;
991 case ADDRESS_SPACE_CONST:
992 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
993 break;
994 case ADDRESS_SPACE_LOCAL:
995 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
996 break;
997 case ADDRESS_SPACE_PARAM:
998 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
999 break;
1000 }
1001
1002 SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
1003 if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
1004 SDValue CvtNone =
1005 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
1006 CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
1007 SDValue(CVTA, 0), CvtNone);
1008 }
1009
1010 ReplaceNode(N, CVTA);
1011 return;
1012 }
1013}
1014
1015// Helper function template to reduce amount of boilerplate code for
1016// opcode selection.
1017static std::optional<unsigned>
1018pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
1019 std::optional<unsigned> Opcode_i32,
1020 std::optional<unsigned> Opcode_i64) {
1021 switch (VT) {
1022 case MVT::f16:
1023 case MVT::i16:
1024 case MVT::bf16:
1025 return Opcode_i16;
1026 case MVT::v2f16:
1027 case MVT::v2bf16:
1028 case MVT::v2i16:
1029 case MVT::v4i8:
1030 case MVT::i32:
1031 case MVT::f32:
1032 return Opcode_i32;
1033 case MVT::v2f32:
1034 case MVT::v2i32:
1035 case MVT::i64:
1036 case MVT::f64:
1037 return Opcode_i64;
1038 default:
1039 return std::nullopt;
1040 }
1041}
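// Note that the selection above is by width only: f32 and v2i16 both use the
// "i32" opcode variant, and f64/v2f32 the "i64" one, since the load/store
// instructions chosen here treat the value as an untyped bit container of the
// given size.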
1042
1043static inline bool isAddLike(const SDValue V) {
1044 return V.getOpcode() == ISD::ADD ||
1045 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1046}
1047
1048static SDValue stripAssertAlign(SDValue N) {
1049 if (N.getOpcode() == ISD::AssertAlign)
1050 N = N.getOperand(0);
1051 return N;
1052}
1053
1054// selectBaseADDR - Match a dag node which will serve as the base address for an
1055// ADDR operand pair.
1056static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
1057 N = stripAssertAlign(N);
1058 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
1059 return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
1060 GA->getValueType(0), GA->getOffset(),
1061 GA->getTargetFlags());
1062 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(N))
1063 return DAG->getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
1064 ES->getTargetFlags());
1065 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(N))
1066 return DAG->getTargetFrameIndex(FIN->getIndex(), FIN->getValueType(0));
1067
1068 return N;
1069}
1070
1071static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
1072 Addr = stripAssertAlign(Addr);
1073 APInt AccumulatedOffset(64u, 0);
1074 while (isAddLike(Addr)) {
1075 const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1076 if (!CN)
1077 break;
1078
1079 const APInt CI = CN->getAPIntValue().sext(64);
1080 if (!(CI + AccumulatedOffset).isSignedIntN(32))
1081 break;
1082
1083 AccumulatedOffset += CI;
1084 Addr = stripAssertAlign(Addr->getOperand(0));
1085 }
1086 return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
1087 MVT::i32);
1088}
1089
1090static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) {
1091 SDValue Offset = accumulateOffset(Addr, SDLoc(Addr), DAG);
1092 SDValue Base = selectBaseADDR(Addr, DAG);
1093 return {Base, Offset};
1094}
1095
1096// Select a pair of operands which represent a valid PTX address, this could be
1097// one of the following things:
1098// - [var] - Offset is simply set to 0
1099// - [reg] - Offset is simply set to 0
1100// - [reg+immOff]
1101// - [var+immOff]
1102// Note that immOff must fit into a 32-bit signed integer.
1103bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
1104 SDValue &Offset) {
1105 std::tie(Base, Offset) = selectADDR(Addr, CurDAG);
1106 return true;
1107}
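// For example, with %ptr in a register, an address expression of the form
// (add (add %ptr, 8), 4) is split into Base = %ptr and Offset = 12, which
// ultimately prints as a PTX operand like [%rd1+12]; a plain global such as
// @gvar becomes Base = @gvar with a zero Offset.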
1108
1109bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1110 MemSDNode *LD = cast<MemSDNode>(N);
1111 assert(LD->readMem() && "Expected load");
1112
1113 // do not support pre/post inc/dec
1114 const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
1115 if (PlainLoad && PlainLoad->isIndexed())
1116 return false;
1117
1118 // Address Space Setting
1119 const auto CodeAddrSpace = getAddrSpace(LD);
1120 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1121 return tryLDG(LD);
1122
1123 SDLoc DL(LD);
1124 SDValue Chain = N->getOperand(0);
1125 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1126
1127 const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
1128
1129 // Vector Setting
1130 const unsigned FromType =
1131 (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1132 ? NVPTX::PTXLdStInstCode::Signed
1133 : NVPTX::PTXLdStInstCode::Untyped;
1134
1135 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1136 FromTypeWidth <= 128 && "Invalid width for load");
1137
1138 // Create the machine instruction DAG
1139 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1140 SDValue Ops[] = {getI32Imm(Ordering, DL),
1141 getI32Imm(Scope, DL),
1142 getI32Imm(CodeAddrSpace, DL),
1143 getI32Imm(FromType, DL),
1144 getI32Imm(FromTypeWidth, DL),
1145 Base,
1146 Offset,
1147 Chain};
1148
1149 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1150 const std::optional<unsigned> Opcode =
1151 pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
1152 if (!Opcode)
1153 return false;
1154
1155 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1156 if (!NVPTXLD)
1157 return false;
1158
1159 MachineMemOperand *MemRef = LD->getMemOperand();
1160 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1161
1162 ReplaceNode(LD, NVPTXLD);
1163 return true;
1164}
1165
1166static unsigned getStoreVectorNumElts(SDNode *N) {
1167 switch (N->getOpcode()) {
1168 case NVPTXISD::StoreV2:
1169 return 2;
1170 case NVPTXISD::StoreV4:
1171 return 4;
1172 case NVPTXISD::StoreV8:
1173 return 8;
1174 default:
1175 llvm_unreachable("Unexpected opcode");
1176 }
1177}
1178
1179bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1180 MemSDNode *LD = cast<MemSDNode>(N);
1181
1182 // Address Space Setting
1183 const auto CodeAddrSpace = getAddrSpace(LD);
1184 if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
1185 return tryLDG(LD);
1186
1187 const MVT EltVT = LD->getSimpleValueType(0);
1188 SDLoc DL(LD);
1189 SDValue Chain = LD->getChain();
1190 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
1191
1192 // Type Setting: fromType + fromTypeWidth
1193 //
1194 // Sign : ISD::SEXTLOAD
1195 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1196 // type is integer
1197 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1198 // Read at least 8 bits (predicates are stored as 8-bit values)
1199 // The last operand holds the original LoadSDNode::getExtensionType() value
1200 const unsigned ExtensionType =
1201 N->getConstantOperandVal(N->getNumOperands() - 1);
1202 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1203 ? NVPTX::PTXLdStInstCode::Signed
1204 : NVPTX::PTXLdStInstCode::Untyped;
1205
1206 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1207
1208 assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
1209
1210 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
1211 SDValue Ops[] = {getI32Imm(Ordering, DL),
1212 getI32Imm(Scope, DL),
1213 getI32Imm(CodeAddrSpace, DL),
1214 getI32Imm(FromType, DL),
1215 getI32Imm(FromTypeWidth, DL),
1216 Base,
1217 Offset,
1218 Chain};
1219
1220 std::optional<unsigned> Opcode;
1221 switch (N->getOpcode()) {
1222 default:
1223 llvm_unreachable("Unexpected opcode");
1224 case NVPTXISD::LoadV2:
1225 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v2,
1226 NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
1227 break;
1228 case NVPTXISD::LoadV4:
1229 Opcode = pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i16_v4,
1230 NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
1231 break;
1232 case NVPTXISD::LoadV8:
1233 Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i16 */},
1234 NVPTX::LDV_i32_v8, {/* no v8i64 */});
1235 break;
1236 }
1237 if (!Opcode)
1238 return false;
1239
1240 SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1241
1242 MachineMemOperand *MemRef = LD->getMemOperand();
1243 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
1244
1245 ReplaceNode(LD, NVPTXLD);
1246 return true;
1247}
1248
1249bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
1250 SDLoc DL(LD);
1251
1252 unsigned ExtensionType;
1253 if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
1254 ExtensionType = Load->getExtensionType();
1255 } else {
1256 ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
1257 }
1258 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1259 ? NVPTX::PTXLdStInstCode::Signed
1260 : NVPTX::PTXLdStInstCode::Untyped;
1261
1262 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1263
1264 assert(!(LD->getSimpleValueType(0).isVector() &&
1265 ExtensionType != ISD::NON_EXTLOAD));
1266
1267 const auto [Base, Offset] = selectADDR(LD->getOperand(1), CurDAG);
1268 SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,
1269 Offset, LD->getChain()};
1270
1271 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1272 std::optional<unsigned> Opcode;
1273 switch (LD->getOpcode()) {
1274 default:
1275 llvm_unreachable("Unexpected opcode");
1276 case ISD::LOAD:
1277 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i16,
1278 NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
1279 break;
1280 case NVPTXISD::LoadV2:
1281 Opcode =
1282 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v2i16,
1283 NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
1284 break;
1285 case NVPTXISD::LoadV4:
1286 Opcode =
1287 pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_v4i16,
1288 NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
1289 break;
1290 case NVPTXISD::LoadV8:
1291 Opcode = pickOpcodeForVT(TargetVT, {/* no v8i16 */},
1292 NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
1293 break;
1294 }
1295 if (!Opcode)
1296 return false;
1297
1298 SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1299
1300 ReplaceNode(LD, NVPTXLDG);
1301 return true;
1302}
1303
1304static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem) {
1305 auto TotalWidth = Mem->getMemoryVT().getSizeInBits();
1306 auto NumElts = Mem->getNumValues() - 1;
1307 auto ElementBitWidth = TotalWidth / NumElts;
1308 assert(isPowerOf2_32(ElementBitWidth) && ElementBitWidth >= 8 &&
1309 ElementBitWidth <= 128 && TotalWidth <= 256 &&
1310 "Invalid width for load");
1311 return ElementBitWidth;
1312}
1313
1314bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
1315 auto *LD = cast<MemSDNode>(N);
1316
1317 SDLoc DL(N);
1318 const unsigned FromTypeWidth = getFromTypeWidthForLoad(LD);
1319 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
1320
1321 // If this is an LDU intrinsic, the address is the third operand. If it's an
1322 // LDU SD node (from custom vector handling), then it's the second operand.
1323 SDValue Addr =
1324 LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1325
1326 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1327 SDValue Ops[] = {getI32Imm(FromTypeWidth, DL), Base, Offset, LD->getChain()};
1328
1329 std::optional<unsigned> Opcode;
1330 switch (N->getOpcode()) {
1331 default:
1332 llvm_unreachable("Unexpected opcode");
1333 case ISD::INTRINSIC_W_CHAIN:
1334 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_i16,
1335 NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
1336 break;
1337 case NVPTXISD::LDUV2:
1338 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v2i16,
1339 NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
1340 break;
1341 case NVPTXISD::LDUV4:
1342 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LDU_GLOBAL_v4i16,
1343 NVPTX::LDU_GLOBAL_v4i32, {/* no v4i64 */});
1344 break;
1345 }
1346 if (!Opcode)
1347 return false;
1348
1349 SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
1350
1351 ReplaceNode(LD, NVPTXLDU);
1352 return true;
1353}
1354
1355bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1356 MemSDNode *ST = cast<MemSDNode>(N);
1357 assert(ST->writeMem() && "Expected store");
1358 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
1359 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
1360 assert((PlainStore || AtomicStore) && "Expected store");
1361
1362 // do not support pre/post inc/dec
1363 if (PlainStore && PlainStore->isIndexed())
1364 return false;
1365
1366 // Address Space Setting
1367 const auto CodeAddrSpace = getAddrSpace(ST);
1368
1369 SDLoc DL(ST);
1370 SDValue Chain = ST->getChain();
1371 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1372
1373 // Vector Setting
1374 const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
1375
1376 // Create the machine instruction DAG
1377 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1378
1379 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1380 "Invalid width for store");
1381
1382 const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
1383 SDValue Ops[] = {selectPossiblyImm(Value),
1384 getI32Imm(Ordering, DL),
1385 getI32Imm(Scope, DL),
1386 getI32Imm(CodeAddrSpace, DL),
1387 getI32Imm(ToTypeWidth, DL),
1388 Base,
1389 Offset,
1390 Chain};
1391
1392 const std::optional<unsigned> Opcode =
1393 pickOpcodeForVT(Value.getSimpleValueType().SimpleTy, NVPTX::ST_i16,
1394 NVPTX::ST_i32, NVPTX::ST_i64);
1395 if (!Opcode)
1396 return false;
1397
1398 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1399
1400 if (!NVPTXST)
1401 return false;
1402
1403 MachineMemOperand *MemRef = ST->getMemOperand();
1404 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1405 ReplaceNode(ST, NVPTXST);
1406 return true;
1407}
1408
1409bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1410 MemSDNode *ST = cast<MemSDNode>(N);
1411 const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
1412
1413 // Address Space Setting
1414 const auto CodeAddrSpace = getAddrSpace(ST);
1415 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1416 report_fatal_error("Cannot store to pointer that points to constant "
1417 "memory space");
1418 }
1419
1420 SDLoc DL(ST);
1421 SDValue Chain = ST->getChain();
1422 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
1423
1424 const unsigned NumElts = getStoreVectorNumElts(ST);
1425
1427 for (auto &V : ST->ops().slice(1, NumElts))
1428 Ops.push_back(selectPossiblyImm(V));
1429 SDValue Addr = N->getOperand(NumElts + 1);
1430 const unsigned ToTypeWidth = TotalWidth / NumElts;
1431
1432 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1433 TotalWidth <= 256 && "Invalid width for store");
1434
1435 const auto [Base, Offset] = selectADDR(Addr, CurDAG);
1436 Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
1437 getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
1438 Offset, Chain});
1439
1440 const MVT::SimpleValueType EltVT =
1441 ST->getOperand(1).getSimpleValueType().SimpleTy;
1442 std::optional<unsigned> Opcode;
1443 switch (ST->getOpcode()) {
1444 default:
1445 return false;
1446 case NVPTXISD::StoreV2:
1447 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
1448 NVPTX::STV_i64_v2);
1449 break;
1450 case NVPTXISD::StoreV4:
1451 Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
1452 NVPTX::STV_i64_v4);
1453 break;
1454 case NVPTXISD::StoreV8:
1455 Opcode = pickOpcodeForVT(EltVT, {/* no v8i16 */}, NVPTX::STV_i32_v8,
1456 {/* no v8i64 */});
1457 break;
1458 }
1459
1460 if (!Opcode)
1461 return false;
1462
1463 SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
1464
1465 MachineMemOperand *MemRef = ST->getMemOperand();
1466 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
1467
1468 ReplaceNode(ST, NVPTXST);
1469 return true;
1470}
1471
1472/// SelectBFE - Look for instruction sequences that can be made more efficient
1473/// by using the 'bfe' (bit-field extract) PTX instruction
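/// For example, (and (srl %val, 8), 255) extracts an 8-bit field starting at
/// bit 8 and can be selected as "bfe.u32 %r, %val, 8, 8"; the unsigned form is
/// used because the and/srl pattern never requires sign replication.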
1474bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
1475 SDLoc DL(N);
1476 SDValue LHS = N->getOperand(0);
1477 SDValue RHS = N->getOperand(1);
1478 SDValue Len;
1479 SDValue Start;
1480 SDValue Val;
1481 bool IsSigned = false;
1482
1483 if (N->getOpcode() == ISD::AND) {
1484 // Canonicalize the operands
1485 // We want 'and %val, %mask'
1486 if (isa<ConstantSDNode>(LHS)) {
1487 std::swap(LHS, RHS);
1488 }
1489
1490 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
1491 if (!Mask) {
1492 // We need a constant mask on the RHS of the AND
1493 return false;
1494 }
1495
1496 // Extract the mask bits
1497 uint64_t MaskVal = Mask->getZExtValue();
1498 if (!isMask_64(MaskVal)) {
1499 // We *could* handle shifted masks here, but doing so would require an
1500 // 'and' operation to fix up the low-order bits so we would trade
1501 // shr+and for bfe+and, which has the same throughput
1502 return false;
1503 }
1504
1505 // How many bits are in our mask?
1506 int64_t NumBits = countr_one(MaskVal);
1507 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1508
1509 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
1510 // We have a 'srl/and' pair, extract the effective start bit and length
1511 Val = LHS.getNode()->getOperand(0);
1512 Start = LHS.getNode()->getOperand(1);
1513 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
1514 if (StartConst) {
1515 uint64_t StartVal = StartConst->getZExtValue();
1516 // How many "good" bits do we have left? "good" is defined here as bits
1517 // that exist in the original value, not shifted in.
1518 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
1519 if (NumBits > GoodBits) {
1520 // Do not handle the case where bits have been shifted in. In theory
1521 // we could handle this, but the cost is likely higher than just
1522 // emitting the srl/and pair.
1523 return false;
1524 }
1525 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
1526 } else {
1527 // Do not handle the case where the shift amount (can be zero if no srl
1528 // was found) is not constant. We could handle this case, but it would
1529 // require run-time logic that would be more expensive than just
1530 // emitting the srl/and pair.
1531 return false;
1532 }
1533 } else {
1534 // Do not handle the case where the LHS of the and is not a shift. While
1535 // it would be trivial to handle this case, it would just transform
1536 // 'and' -> 'bfe', but 'and' has higher-throughput.
1537 return false;
1538 }
1539 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
1540 if (LHS->getOpcode() == ISD::AND) {
1541 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
1542 if (!ShiftCnst) {
1543 // Shift amount must be constant
1544 return false;
1545 }
1546
1547 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
1548
1549 SDValue AndLHS = LHS->getOperand(0);
1550 SDValue AndRHS = LHS->getOperand(1);
1551
1552 // Canonicalize the AND to have the mask on the RHS
1553 if (isa<ConstantSDNode>(AndLHS)) {
1554 std::swap(AndLHS, AndRHS);
1555 }
1556
1557 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
1558 if (!MaskCnst) {
1559 // Mask must be constant
1560 return false;
1561 }
1562
1563 uint64_t MaskVal = MaskCnst->getZExtValue();
1564 uint64_t NumZeros;
1565 uint64_t NumBits;
1566 if (isMask_64(MaskVal)) {
1567 NumZeros = 0;
1568 // The number of bits in the result bitfield will be the number of
1569 // trailing ones (the AND) minus the number of bits we shift off
1570 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
1571 } else if (isShiftedMask_64(MaskVal)) {
1572 NumZeros = llvm::countr_zero(MaskVal);
1573 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
1574 // The number of bits in the result bitfield will be the number of
1575 // trailing zeros plus the number of set bits in the mask minus the
1576 // number of bits we shift off
1577 NumBits = NumZeros + NumOnes - ShiftAmt;
1578 } else {
1579 // This is not a mask we can handle
1580 return false;
1581 }
1582
1583 if (ShiftAmt < NumZeros) {
1584 // Handling this case would require extra logic that would make this
1585 // transformation non-profitable
1586 return false;
1587 }
1588
1589 Val = AndLHS;
1590 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
1591 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
1592
1593 // If pre-shift AND includes the sign bit in the bitfield, we must use
1594 // signed BFE to replicate that bit during bitfield extraction. If the
1595 // sign bit is not part of the mask, unsigned BFE will zero out upper bits
1596 // of the result
1597 if (N->getOpcode() == ISD::SRA)
1598 IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
1599 } else if (LHS->getOpcode() == ISD::SHL) {
1600 // Here, we have a pattern like:
1601 //
1602 // (sra (shl val, NN), MM)
1603 // or
1604 // (srl (shl val, NN), MM)
1605 //
1606 // If MM >= NN, we can efficiently optimize this with bfe
1607 Val = LHS->getOperand(0);
1608
1609 SDValue ShlRHS = LHS->getOperand(1);
1610 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
1611 if (!ShlCnst) {
1612 // Shift amount must be constant
1613 return false;
1614 }
1615 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
1616
1617 SDValue ShrRHS = RHS;
1618 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
1619 if (!ShrCnst) {
1620 // Shift amount must be constant
1621 return false;
1622 }
1623 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
1624
1625 // To avoid extra codegen and be profitable, we need Outer >= Inner
1626 if (OuterShiftAmt < InnerShiftAmt) {
1627 return false;
1628 }
1629
1630 // If the outer shift is more than the type size, we have no bitfield to
1631 // extract (since we also check that the inner shift is <= the outer shift
1632 // then this also implies that the inner shift is < the type size)
1633 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
1634 return false;
1635 }
1636
1637 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
1638 MVT::i32);
1639 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
1640 DL, MVT::i32);
1641
1642 if (N->getOpcode() == ISD::SRA) {
1643 // If we have an arithmetic right shift, we need to use the signed bfe
1644 // variant
1645 IsSigned = true;
1646 }
1647 } else {
1648 // No can do...
1649 return false;
1650 }
1651 } else {
1652 // No can do...
1653 return false;
1654 }
1655
1656
1657 unsigned Opc;
1658 // For the BFE operations we form here from "and" and "srl", always use the
1659 // unsigned variants.
1660 if (Val.getValueType() == MVT::i32) {
1661 if (IsSigned) {
1662 Opc = NVPTX::BFE_S32rii;
1663 } else {
1664 Opc = NVPTX::BFE_U32rii;
1665 }
1666 } else if (Val.getValueType() == MVT::i64) {
1667 if (IsSigned) {
1668 Opc = NVPTX::BFE_S64rii;
1669 } else {
1670 Opc = NVPTX::BFE_U64rii;
1671 }
1672 } else {
1673 // We cannot handle this type
1674 return false;
1675 }
1676
1677 SDValue Ops[] = {
1678 Val, Start, Len
1679 };
1680
1681 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
1682 return true;
1683}
1684
1685// Select bf16/bf16v2 FADD, FSUB, FMUL as fma on targets with only fma
1686bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
1687 EVT VT = SDValue(N, 0).getValueType();
1688 if (VT.getScalarType() != MVT::bf16)
1689 return false;
1690
1691 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1692 if (STI->hasNativeBF16Support(N->getOpcode()))
1693 return false;
1694
1695 const bool IsVec = VT.isVector();
1696 assert(!IsVec || VT.getVectorNumElements() == 2);
1697 SDLoc DL(N);
1698 SDValue N0 = N->getOperand(0);
1699 SDValue N1 = N->getOperand(1);
1700 SmallVector<SDValue, 3> Operands;
1701 auto GetConstant = [&](float Value) -> SDValue {
1702 // BF16 immediates must be legalized to integer register values
1703 APFloat APF(Value);
1704 bool LosesInfo;
1705 APF.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &LosesInfo);
1706 assert(!LosesInfo);
1707 if (IsVec) {
1708 auto API = APF.bitcastToAPInt();
1709 API = API.concat(API);
1710 auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
1711 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
1712 0);
1713 }
1714 auto Const = CurDAG->getTargetConstantFP(APF, DL, VT);
1715 return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);
1716 };
1717
1718 switch (N->getOpcode()) {
1719 case ISD::FADD:
1720 // add(a, b) -> fma(a, 1.0, b)
1721 Operands = {N0, GetConstant(1.0), N1};
1722 break;
1723 case ISD::FSUB:
1724 // sub(a, b) -> fma(b, -1.0, a)
1725 Operands = {N1, GetConstant(-1.0), N0};
1726 break;
1727 case ISD::FMUL:
1728 // mul(a, b) -> fma(a, b, -0.0)
1729 // NOTE: The identity is -0, not 0, because -0 + 0 == 0 for floats
1730 Operands = {N0, N1, GetConstant(-0.0)};
1731 break;
1732 default:
1733 llvm_unreachable("Unexpected opcode");
1734 };
1735
1736 int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
1737 MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);
1738 ReplaceNode(N, FMA);
1739 return true;
1740}
1741
1742SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
1743 if (V.getOpcode() == ISD::BITCAST)
1744 V = V.getOperand(0);
1745
1746 if (auto *CN = dyn_cast<ConstantSDNode>(V))
1747 return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
1748 V.getValueType());
1749 if (auto *CN = dyn_cast<ConstantFPSDNode>(V))
1750 return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
1751 V.getValueType());
1752 return V;
1753}
1754
1755/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
1756/// inline asm expressions.
1758 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
1759 std::vector<SDValue> &OutOps) {
1760 switch (ConstraintID) {
1761 default:
1762 return true;
1763 case InlineAsm::ConstraintCode::m: { // memory
1764 const auto [Base, Offset] = selectADDR(Op, CurDAG);
1765 OutOps.push_back(Base);
1766 OutOps.push_back(Offset);
1767 return false;
1768 }
1769 }
1770 return true;
1771}
1772
1773void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
1774 // Lower a CopyToReg with two 64-bit inputs
1775 // Dst:i128, lo:i64, hi:i64
1776 //
1777 // CopyToReg Dst, lo, hi;
1778 //
1779 // ==>
1780 //
1781 // tmp = V2I64toI128 {lo, hi};
1782 // CopyToReg Dst, tmp;
1783 SDValue Dst = N->getOperand(1);
1784 SDValue Lo = N->getOperand(2);
1785 SDValue Hi = N->getOperand(3);
1786
1787 SDLoc DL(N);
1788 SDNode *Mov =
1789 CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});
1790
1791 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
1792 NewOps[0] = N->getOperand(0);
1793 NewOps[1] = Dst;
1794 NewOps[2] = SDValue(Mov, 0);
1795 if (N->getNumOperands() == 5)
1796 NewOps[3] = N->getOperand(4);
1797 SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL, SmallVector<EVT>(N->values()), NewOps);
1798
1799 ReplaceNode(N, NewValue.getNode());
1800}
1801
1802void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
1803  // Lower a CopyFromReg from one 128-bit register to two 64-bit registers
1804 // Dst:i128, Src:i128
1805 //
1806 // {lo, hi} = CopyFromReg Src
1807 //
1808 // ==>
1809 //
1810 // {lo, hi} = I128toV2I64 Src
1811 //
1812 SDValue Ch = N->getOperand(0);
1813 SDValue Src = N->getOperand(1);
1814 SDValue Glue = N->getOperand(2);
1815 SDLoc DL(N);
1816
1817  // Add Glue and Ch to the operands and results to avoid breaking the
1818  // execution order
1819 SDNode *Mov = CurDAG->getMachineNode(
1820 NVPTX::I128toV2I64, DL,
1821 {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
1822 {Src, Ch, Glue});
1823
1824 ReplaceNode(N, Mov);
1825}
1826
1827bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
1828 SDLoc DL(N);
1829 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
1830 unsigned int FenceOp =
1831 getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
1832 Scopes[N->getConstantOperandVal(2)], Subtarget);
1833 SDValue Chain = N->getOperand(0);
1834 SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
1835 ReplaceNode(N, FenceNode);
1836 return true;
1837}
1838
1839NVPTXScopes::NVPTXScopes(LLVMContext &C) {
1840  Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
1841 Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
1842 Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
1843 Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
1844 Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
1845}
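// For illustration: with the table above, an IR fence such as
//   fence syncscope("cluster") acquire
// resolves through NVPTXScopes::operator[] to NVPTX::Scope::Cluster, while a
// fence whose syncscope string is "" maps to NVPTX::Scope::System.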
1846
1847NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
1848  if (Scopes.empty())
1849 llvm_unreachable("NVPTX Scopes must be initialized before calling "
1850 "NVPTXScopes::operator[]");
1851
1852 auto S = Scopes.find(ID);
1853 if (S == Scopes.end()) {
1854 // TODO:
1855 // - Add API to LLVMContext to get the name of a single scope.
1856 // - Use that API here to print an error containing the name
1857 // of this Unknown ID.
1858 report_fatal_error(formatv("Could not find scope ID={}.", int(ID)));
1859 }
1860 return S->second;
1861}
1862
1863bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
1864
1865#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
1866 (is_s32 \
1867 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
1868 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
1869
1870#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
1871 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
1872 : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
1873
1874#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
1875 [&]() -> auto { \
1876 if (is_mc && is_ch) \
1877 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
1878 if (is_ch) \
1879 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
1880 if (is_mc) \
1881 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
1882 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
1883 }()
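// Example expansion (illustrative): GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D,
// TILE, /*is_mc=*/false, /*is_ch=*/true, /*is_s32=*/false) evaluates to
// NVPTX::CP_ASYNC_BULK_TENSOR_G2S_3D_TILE_CH; when is_s32 is true the
// SHARED32 variant, NVPTX::CP_ASYNC_BULK_TENSOR_G2S_3D_SHARED32_TILE_CH, is
// chosen instead.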
1884
1885static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
1886                                                       bool IsShared32,
1887 bool IsCacheHint,
1888 bool IsIm2Col) {
1889 if (IsIm2Col) {
1890 switch (Dim) {
1891 case 3:
1892 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
1893 IsShared32);
1894 case 4:
1895 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
1896 IsShared32);
1897 case 5:
1898 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
1899 IsShared32);
1900 default:
1901 llvm_unreachable("Invalid Dimension in im2col mode for "
1902 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1903 }
1904 } else {
1905 switch (Dim) {
1906 case 1:
1907 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
1908 IsShared32);
1909 case 2:
1910 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
1911 IsShared32);
1912 case 3:
1913 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
1914 IsShared32);
1915 case 4:
1916 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
1917 IsShared32);
1918 case 5:
1919 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
1920 IsShared32);
1921 default:
1922 llvm_unreachable("Invalid Dimension in tile mode for "
1923 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1924 }
1925 }
1926}
1927
1928static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
1929 bool IsMultiCast,
1930 bool IsCacheHint, bool IsIm2Col) {
1931 if (IsIm2Col) {
1932 switch (Dim) {
1933 case 3:
1934 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
1935 IsCacheHint, IsShared32);
1936 case 4:
1937 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
1938 IsCacheHint, IsShared32);
1939 case 5:
1940 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
1941 IsCacheHint, IsShared32);
1942 default:
1943 llvm_unreachable("Invalid Dimension in im2col mode for "
1944 "GetCpAsyncBulkTensorG2SOpcode.");
1945 }
1946 } else {
1947 switch (Dim) {
1948 case 1:
1949 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
1950 IsCacheHint, IsShared32);
1951 case 2:
1952 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
1953 IsCacheHint, IsShared32);
1954 case 3:
1955 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
1956 IsCacheHint, IsShared32);
1957 case 4:
1958 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
1959 IsCacheHint, IsShared32);
1960 case 5:
1961 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
1962 IsCacheHint, IsShared32);
1963 default:
1964      llvm_unreachable(
1965          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
1966 }
1967 }
1968}
1969
1970static size_t GetDimsFromIntrinsic(unsigned IID) {
1971 switch (IID) {
1972 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
1973 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
1974 return 3;
1975 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
1976 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
1977 return 4;
1978 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
1979 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
1980 return 5;
1981 default:
1982 llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
1983 }
1984}
1985
1986void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
1987 bool IsIm2Col) {
1988  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
1989 // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
1990 // multicast, cache_hint,
1991 // multicast_flag, cache_hint_flag, cta_group_flag}
1992 // NumOperands = {Chain, IID} + {Actual intrinsic args}
1993 // = {2} + {8 + dims + im2col_offsets}
1994 size_t NumOps = N->getNumOperands();
1995 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
1996 : (NumOps - 10);
1997 // Offsets is always 'NumDims - 2' and only for im2col mode
1998 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
1999 bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
2000 bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
2001 size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
2002 size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
2003
2004 unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
2005 if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
2006    report_fatal_error(
2007        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
2008 Subtarget->getSmVersion()));
2009
2010 SDLoc DL(N);
2011 SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
2012
2013 // Push MultiCast operand, if available
2014 if (IsMultiCast)
2015 Ops.push_back(N->getOperand(MultiCastIdx));
2016
2017 // Push CacheHint operand, if available
2018 if (IsCacheHint)
2019 Ops.push_back(N->getOperand(MultiCastIdx + 1));
2020
2021 // Flag for CTA Group
2022 Ops.push_back(getI32Imm(CTAGroupVal, DL));
2023
2024 // Finally, the chain operand
2025 Ops.push_back(N->getOperand(0));
2026
2027 bool IsShared32 =
2028 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2029 unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
2030 NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
2031 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2032}
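// Worked operand count (illustrative): a tile-mode 3D G2S intrinsic carries
// {Chain, IID} + {dst, mbar, src, d0, d1, d2, multicast, cache_hint,
// multicast_flag, cache_hint_flag, cta_group_flag} = 13 operands, so
// NumDims = NumOps - 10 = 3 and NumBaseArgs = 3 + 0 + 3 = 6.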
2033
2034void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
2035 unsigned RedOp,
2036 bool IsIm2Col) {
2037  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2038 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2039 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2040 // = {2} + {4 + dims}
2041 size_t NumOps = N->getNumOperands();
2042 size_t NumDims = NumOps - 6;
2043 bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2044  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // {src, dst} [+ cache_hint]
2045
2046 SDLoc DL(N);
2047 SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
2048 Ops.push_back(getI32Imm(RedOp, DL)); // Reduction Op
2049 Ops.push_back(N->getOperand(0)); // Chain operand
2050
2051 bool IsShared32 =
2052 CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2053  unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
2054      NumDims, IsShared32, IsCacheHint, IsIm2Col);
2055 ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2056}
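// Worked operand count (illustrative): a 2D reduce intrinsic carries
// {Chain, IID} + {src, dst, d0, d1, cache_hint, cache_hint_flag} = 8 operands,
// giving NumDims = NumOps - 6 = 2; the cache_hint operand is forwarded only
// when cache_hint_flag is 1.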
2057
2058#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
2059 (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
2060 : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
2061
2062static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
2063 switch (IID) {
2064 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2065 return TCGEN05_ST_OPCODE(16x64b, x1);
2066 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2067 return TCGEN05_ST_OPCODE(16x64b, x2);
2068 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2069 return TCGEN05_ST_OPCODE(16x64b, x4);
2070 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2071 return TCGEN05_ST_OPCODE(16x64b, x8);
2072 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2073 return TCGEN05_ST_OPCODE(16x64b, x16);
2074 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2075 return TCGEN05_ST_OPCODE(16x64b, x32);
2076 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2077 return TCGEN05_ST_OPCODE(16x64b, x64);
2078 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2079 return TCGEN05_ST_OPCODE(16x64b, x128);
2080 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2081 return TCGEN05_ST_OPCODE(16x128b, x1);
2082 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2083 return TCGEN05_ST_OPCODE(16x128b, x2);
2084 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2085 return TCGEN05_ST_OPCODE(16x128b, x4);
2086 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2087 return TCGEN05_ST_OPCODE(16x128b, x8);
2088 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2089 return TCGEN05_ST_OPCODE(16x128b, x16);
2090 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2091 return TCGEN05_ST_OPCODE(16x128b, x32);
2092 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2093 return TCGEN05_ST_OPCODE(16x128b, x64);
2094 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2095 return TCGEN05_ST_OPCODE(16x256b, x1);
2096 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2097 return TCGEN05_ST_OPCODE(16x256b, x2);
2098 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2099 return TCGEN05_ST_OPCODE(16x256b, x4);
2100 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2101 return TCGEN05_ST_OPCODE(16x256b, x8);
2102 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2103 return TCGEN05_ST_OPCODE(16x256b, x16);
2104 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2105 return TCGEN05_ST_OPCODE(16x256b, x32);
2106 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2107 return TCGEN05_ST_OPCODE(16x32bx2, x1);
2108 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2109 return TCGEN05_ST_OPCODE(16x32bx2, x2);
2110 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2111 return TCGEN05_ST_OPCODE(16x32bx2, x4);
2112 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2113 return TCGEN05_ST_OPCODE(16x32bx2, x8);
2114 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2115 return TCGEN05_ST_OPCODE(16x32bx2, x16);
2116 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2117 return TCGEN05_ST_OPCODE(16x32bx2, x32);
2118 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2119 return TCGEN05_ST_OPCODE(16x32bx2, x64);
2120 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2121 return TCGEN05_ST_OPCODE(16x32bx2, x128);
2122 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2123 return TCGEN05_ST_OPCODE(32x32b, x1);
2124 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2125 return TCGEN05_ST_OPCODE(32x32b, x2);
2126 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2127 return TCGEN05_ST_OPCODE(32x32b, x4);
2128 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2129 return TCGEN05_ST_OPCODE(32x32b, x8);
2130 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2131 return TCGEN05_ST_OPCODE(32x32b, x16);
2132 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2133 return TCGEN05_ST_OPCODE(32x32b, x32);
2134 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2135 return TCGEN05_ST_OPCODE(32x32b, x64);
2136 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2137 return TCGEN05_ST_OPCODE(32x32b, x128);
2138 }
2139 llvm_unreachable("unhandled tcgen05.st lowering");
2140}
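// Example (illustrative): getTcgen05StOpcode(Intrinsic::nvvm_tcgen05_st_16x64b_x2,
// /*enableUnpack=*/true) yields NVPTX::TCGEN05_ST_16x64b_x2_UNPACK via
// TCGEN05_ST_OPCODE, and NVPTX::TCGEN05_ST_16x64b_x2 when unpacking is off.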
2141
2142void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2143 if (!Subtarget->hasTcgen05InstSupport())
2144    report_fatal_error(
2145        "tcgen05.st is not supported on this architecture variant");
2146
2147 SDLoc DL(N);
2148 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2149
2150 SmallVector<SDValue, 128> Operands = {
2151 N->getOperand(2) // taddr
2152 };
2153
2154 if (hasOffset)
2155 Operands.push_back(CurDAG->getTargetConstant(
2156 cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), DL,
2157 MVT::i32)); // Offset
2158
2159 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2160 Operands.push_back(N->getOperand(I));
2161
2162 bool enableUnpack =
2163 cast<ConstantSDNode>(N->getOperand(N->getNumOperands() - 1))
2164 ->getZExtValue();
2165
2166 Operands.push_back(N->getOperand(0)); // Chain
2167 ReplaceNode(N, CurDAG->getMachineNode(getTcgen05StOpcode(IID, enableUnpack),
2168 DL, N->getVTList(), Operands));
2169}
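// Note (illustrative): for the 16x32bx2 shapes SelectTcgen05St is called with
// hasOffset = true, so operand 3 of the intrinsic is folded into an i32
// immediate offset; the remaining operands up to the trailing unpack flag are
// the register values to store, and that flag selects the _UNPACK opcode
// variant.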
2170
2171bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2172 unsigned IID = N->getConstantOperandVal(1);
2173 using TMARedTy = llvm::nvvm::TMAReductionOp;
2174 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
2175 switch (IID) {
2176 default:
2177 return false;
2178 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
2179 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
2180 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
2181 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
2182 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
2183 SelectCpAsyncBulkTensorG2SCommon(N);
2184 return true;
2185 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2186 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2187 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2188 SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
2189 return true;
2190 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2191 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2192 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2193 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2194 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2195 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
2196 return true;
2197 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2198 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2199 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2200 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
2201 /*IsIm2Col=*/true);
2202 return true;
2203 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2204 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2205 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2206 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2207 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2208 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
2209 return true;
2210 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2211 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2212 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2213 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
2214 /*IsIm2Col=*/true);
2215 return true;
2216 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2217 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2218 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2219 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2220 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2221 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
2222 return true;
2223 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2224 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2225 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2226 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
2227 /*IsIm2Col=*/true);
2228 return true;
2229 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2230 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2231 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2232 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2233 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2234 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
2235 return true;
2236 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2237 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2238 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2239 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
2240 /*IsIm2Col=*/true);
2241 return true;
2242 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2243 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2244 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2245 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2246 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2247 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
2248 return true;
2249 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2250 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2251 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2252 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
2253 /*IsIm2Col=*/true);
2254 return true;
2255 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2256 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2257 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2258 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2259 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2260 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
2261 return true;
2262 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2263 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2264 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2265 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
2266 /*IsIm2Col=*/true);
2267 return true;
2268 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2269 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2270 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2271 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2272 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2273 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
2274 return true;
2275 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2276 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2277 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2278 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
2279 /*IsIm2Col=*/true);
2280 return true;
2281 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2282 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2283 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2284 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2285 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2286 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
2287 return true;
2288 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2289 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2290 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2291 SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
2292 /*IsIm2Col=*/true);
2293 return true;
2294
2295 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2296 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2297 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2298 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2299 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2300 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2301 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2302 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2303 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2304 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2305 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2306 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2307 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2308 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2309 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2310 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2311 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2312 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2313 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2314 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2315 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2316 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2317 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2318 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2319 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2320 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2321 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2322 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2323 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2324 SelectTcgen05St(N);
2325 return true;
2326 }
2327
2328 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2329 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2330 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2331 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2332 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2333 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2334 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2335 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2336 SelectTcgen05St(N, /* hasOffset */ true);
2337 return true;
2338 }
2339 }
2340}
2341
2342void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
2343 MemSDNode *AN = cast<MemSDNode>(N);
2344 SDLoc dl(N);
2345
2346 const SDValue Chain = N->getOperand(0);
2347 const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
2348  SmallVector<SDValue, 8> Ops({Base, Offset});
2349  Ops.append(N->op_begin() + 2, N->op_end());
2350 Ops.append({
2351 getI32Imm(getMemOrder(AN), dl),
2352 getI32Imm(getAtomicScope(AN), dl),
2353 getI32Imm(getAddrSpace(AN), dl),
2354 Chain,
2355 });
2356
2357 assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
2358 N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
2359 unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
2360 ? NVPTX::ATOM_EXCH_B128
2361 : NVPTX::ATOM_CAS_B128;
2362
2363 auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
2364 CurDAG->setNodeMemRefs(ATOM, AN->getMemOperand());
2365
2366 ReplaceNode(N, ATOM);
2367}
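// Note (illustrative): the 128-bit exchange/CAS selection above appends the
// memory-ordering, scope, and address-space immediates after the data
// operands, so ATOM_EXCH_B128 / ATOM_CAS_B128 receive
// {Base, Offset, data..., order, scope, addrspace, Chain}.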