#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

static cl::opt<bool>
    EnableMADWide("nvptx-mad-wide-opt", cl::init(false), cl::Hidden,
                  cl::desc("Enable MAD wide optimization"));
NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {

bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }

bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
void NVPTXDAGToDAGISel::Select(SDNode *N) {
  if (N->isMachineOpcode()) {

  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_FENCE:

    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    SelectSETP_BF16X2(N);
    if (tryLoadVector(N))
      return;
    if (tryStoreVector(N))
      return;
    if (tryIntrinsicChain(N))
      return;
    if (tryIntrinsicVoid(N))
      return;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
    selectAtomicSwap128(N);
    if (tryBF16ArithToFMA(N))
      return;
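// --- Illustrative sketch (not part of this file) ---------------------------
// Minimal, self-contained example of the dispatch pattern Select() uses
// above: hand-written "try*" helpers return true once they have selected the
// node themselves, and the caller falls back to the table-generated matcher
// otherwise. All names below are invented for illustration only.
namespace select_sketch {
struct FakeNode {
  int Opcode = 0;
  bool Selected = false;
};

inline bool tryCustomSelect(FakeNode &N) {
  if (N.Opcode != 42)
    return false;      // not a pattern handled by hand
  N.Selected = true;   // pretend a machine node was emitted
  return true;
}

inline void selectNode(FakeNode &N) {
  if (tryCustomSelect(N))
    return;            // the hand-written path replaced the node
  // ... otherwise fall back to the auto-generated matcher ...
}
} // namespace select_sketch
// ----------------------------------------------------------------------------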
#define TCGEN05_LD_OPCODE(SHAPE, NUM)                                          \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK                       \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
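// Illustration of the token pasting above: TCGEN05_LD_OPCODE(16x64b, x2)
// expands to (enablePack ? NVPTX::TCGEN05_LD_16x64b_x2_PACK
//                        : NVPTX::TCGEN05_LD_16x64b_x2); each intrinsic case
// below maps onto the matching SHAPE/NUM pair.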
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
  if (!Subtarget->hasTcgen05InstSupport())
    report_fatal_error(
        "tcgen05.ld is not supported on this architecture variant");

    auto OffsetNode = CurDAG->getTargetConstant(
        {N->getOperand(2), OffsetNode, N->getOperand(0)}));

        {N->getOperand(2), N->getOperand(0)}));
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  switch (IID) {
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:

  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
    SelectTcgen05Ld(N);
    return true;
  }
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    SelectTcgen05Ld(N, true);
    return true;
  }
    return CmpMode::NotANumber;

  return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});
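// Both helpers select a single packed comparison: the machine node produces
// two i1 predicates (one per f16/bf16 lane) and takes the PTX comparison mode
// and the flush-to-zero flag as immediate operands.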
bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
  MVT EltVT = N->getSimpleValueType(0);

      CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  MVT VT = Vector.getSimpleValueType();

    Opcode = NVPTX::I32toV2I16;
    Opcode = NVPTX::I64toV2I32;

  for (auto *U : Vector.getNode()->users()) {
    if (U->getOperand(0) != Vector)
    if (const ConstantSDNode *IdxConst =
      if (IdxConst->getZExtValue() == 0)
      else if (IdxConst->getZExtValue() == 1)

      CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
  for (auto *Node : E1)
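// Extract-element users of the vector with a constant index are partitioned
// by that index: lane-0 uses are rewritten to the first result of the
// unpacking machine node (collected in E0), lane-1 uses to the second (E1).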
static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {

  return convertAS(N->getMemOperand()->getAddrSpace())

  auto Ordering = N->getMergedOrdering();

  return Scopes[N->getSyncScopeID()];
struct OperationOrderings {
  NVPTX::Ordering InstructionOrdering, FenceOrdering;
  OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
                     NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
      : InstructionOrdering(IO), FenceOrdering(FO) {}
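// A single memory operation may need two orderings: one encoded on the
// load/store instruction itself (InstructionOrdering) and one for a separate
// fence emitted alongside it (FenceOrdering); the sequentially-consistent
// path below appears to be the case that returns a non-trivial fence ordering.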
static OperationOrderings
      !HasMemoryOrdering) {
        formatv("PTX does not support \"atomic\" for orderings different than"
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "

  bool AddrGenericOrGlobalOrShared =
  if (!AddrGenericOrGlobalOrShared)

  bool UseRelaxedMMIO =

        formatv("PTX only supports Acquire Ordering on reads: {}",
                N->getOperationName()));
        formatv("PTX only supports Release Ordering on writes: {}",
                N->getOperationName()));
        formatv("NVPTX does not support AcquireRelease Ordering on "
                "read-modify-writes "
                "yet and PTX does not support it on loads or stores: {}",
                N->getOperationName()));
    else if (N->writeMem())
        formatv("NVPTX does not support SequentiallyConsistent Ordering on "
                "read-modify-writes yet: {}",
                N->getOperationName()));
    return OperationOrderings(InstrOrder,
      formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",
  auto S = Scopes[N->getSyncScopeID()];

    Subtarget->failIfClustersUnsupported("cluster scope");

  T->failIfClustersUnsupported(".cluster scope fence");
  if (!T->hasSplitAcquireAndReleaseFences() &&

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acquire_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_release_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_seq_cst_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                    : NVPTX::INT_MEMBAR_GL;

      formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
              OrderingToString(O), ScopeToString(S)));
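// getFenceOp: each (ordering, scope) pair maps to a dedicated
// atomic_thread_fence_* opcode when the target reports hasMemoryOrdering();
// otherwise the legacy membar instructions (INT_MEMBAR_SYS/CTA/GL) are used.
// Cluster-scope fences have no membar fallback, hence the unconditional
// returns for the cluster cases.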
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
                                                MemSDNode *N) {
  auto [InstructionOrdering, FenceOrdering] =
  auto Scope = getOperationScope(N, InstructionOrdering);

        formatv("Unexpected fence ordering: \"{}\".",

  return {InstructionOrdering, Scope};
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,

  switch (SrcAddrSpace) {
    Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
    Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
           "Shared cluster address space is only supported in 64-bit mode");
    Opc = NVPTX::cvta_shared_cluster_64;
    Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
    Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
    Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;

  if (SrcAddrSpace != 0)

  switch (DstAddrSpace) {
    Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
    Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
           "Shared cluster address space is only supported in 64-bit mode");
    Opc = NVPTX::cvta_to_shared_cluster_64;
    Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
    Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
    Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;

  SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
  if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
    CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
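// PTX cvta.<space> converts a pointer in a specific address space to a
// generic pointer (the SrcAddrSpace switch above); cvta.to.<space> converts a
// generic pointer back into a specific space (the DstAddrSpace switch). When
// one side uses 32-bit pointers on a 64-bit target, the value is widened with
// CVT_u64_u32 beforehand or narrowed with CVT_u32_u64 afterwards.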
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
                std::optional<unsigned> Opcode_i32,
                std::optional<unsigned> Opcode_i64) {
    return std::nullopt;
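// --- Illustrative sketch (not part of this file) ---------------------------
// Self-contained example of the pickOpcodeForVT pattern: pick one of several
// per-width opcodes, where a missing entry (std::nullopt) means "this type
// has no matching instruction" so the caller gives up on custom selection.
// The enum and function below are invented for illustration only.
#include <optional>

namespace pick_sketch {
enum class Opc { LD_i16, LD_i32, LD_i64 };

inline std::optional<Opc> pickOpcodeForWidth(unsigned Bits,
                                             std::optional<Opc> Op16,
                                             std::optional<Opc> Op32,
                                             std::optional<Opc> Op64) {
  switch (Bits) {
  case 16:
    return Op16;
  case 32:
    return Op32;
  case 64:
    return Op64;
  default:
    return std::nullopt; // unsupported width
  }
}
} // namespace pick_sketch
// ----------------------------------------------------------------------------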
  return V.getOpcode() == ISD::ADD ||
         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
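// An OR whose operands have no set bits in common (the "disjoint" flag)
// computes the same value as an ADD, so address folding treats it like
// ISD::ADD.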
    N = N.getOperand(0);
                                     GA->getValueType(0), GA->getOffset(),
                                     GA->getTargetFlags());
                                       ES->getTargetFlags());

  APInt AccumulatedOffset(64u, 0);
    const APInt CI = CN->getAPIntValue().sext(64);
    if (!(CI + AccumulatedOffset).isSignedIntN(32))
    AccumulatedOffset += CI;
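// accumulateOffset walks add-like address chains and folds their constant
// operands into one running offset, giving up (per the isSignedIntN(32) check
// above) as soon as the total would no longer fit a signed 32-bit immediate.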
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  assert(LD->readMem() && "Expected load");
  if (PlainLoad && PlainLoad->isIndexed())

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
         FromTypeWidth <= 128 && "Invalid width for load");

                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),

  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);

  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  MachineMemOperand *MemRef = LD->getMemOperand();
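// As with the store path below, the load's memory semantics travel as
// immediate operands on the selected instruction (scope, address space,
// from-type and width here), followed by the address operands and the chain.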
  switch (N->getOpcode()) {
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
  const MVT EltVT = LD->getSimpleValueType(0);

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned ExtensionType =
      N->getConstantOperandVal(N->getNumOperands() - 1);
                             : NVPTX::PTXLdStInstCode::Untyped;

                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                             NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
                             NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
                             NVPTX::LDV_i32_v8, {});

  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  MachineMemOperand *MemRef = LD->getMemOperand();
bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
  unsigned ExtensionType;
    ExtensionType = Load->getExtensionType();
    ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
                             : NVPTX::PTXLdStInstCode::Untyped;

  assert(!(LD->getSimpleValueType(0).isVector() &&

  std::optional<unsigned> Opcode;
  switch (LD->getOpcode()) {
                             NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
                             NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
                             NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
                             NVPTX::LD_GLOBAL_NC_v8i32, {});

  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  auto ElementBitWidth = TotalWidth / NumElts;
         ElementBitWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
  return ElementBitWidth;
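// The LD_GLOBAL_NC_* opcodes correspond to PTX ld.global.nc ("LDG"), the
// non-coherent load path used for global memory that is known not to be
// written while the kernel runs (see canLowerToLDG).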
bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                             NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
                             NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
                             NVPTX::LDU_GLOBAL_v4i32, {});

  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  assert(ST->writeMem() && "Expected store");
  assert((PlainStore || AtomicStore) && "Expected store");
  if (PlainStore && PlainStore->isIndexed())

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
         "Invalid width for store");

                   getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(ToTypeWidth, DL),

  const std::optional<unsigned> Opcode =
                      NVPTX::ST_i32, NVPTX::ST_i64);

  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);

  MachineMemOperand *MemRef = ST->getMemOperand();
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  for (auto &V : ST->ops().slice(1, NumElts))
    Ops.push_back(selectPossiblyImm(V));

  const unsigned ToTypeWidth = TotalWidth / NumElts;
         TotalWidth <= 256 && "Invalid width for store");

  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
              getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,

      ST->getOperand(1).getSimpleValueType().SimpleTy;
  std::optional<unsigned> Opcode;
  switch (ST->getOpcode()) {

  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);

  MachineMemOperand *MemRef = ST->getMemOperand();
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  bool IsSigned = false;

  uint64_t MaskVal = Mask->getZExtValue();

    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    Val = LHS.getNode()->getOperand(0);
    Start = LHS.getNode()->getOperand(1);

    int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
    if (NumBits > GoodBits) {

    NumBits = NumZeros + NumOnes - ShiftAmt;
    if (ShiftAmt < NumZeros) {
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    Val = LHS->getOperand(0);
    if (OuterShiftAmt < InnerShiftAmt) {
      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,

      Opc = NVPTX::BFE_S32rii;
      Opc = NVPTX::BFE_U32rii;
      Opc = NVPTX::BFE_S64rii;
      Opc = NVPTX::BFE_U64rii;
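// BFE_[SU]{32,64}rii is PTX's bfe (bit-field extract): it takes a value, a
// start bit and a length, so the shift-and-mask patterns matched above
// collapse into a single instruction once Start and Len have been computed
// from the constant mask and shift amounts.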
bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
  EVT VT = SDValue(N, 0).getValueType();
  const NVPTXSubtarget *STI = TM.getSubtargetImpl();

    auto API = APF.bitcastToAPInt();
    API = API.concat(API);
    return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const), 0);
    return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);

  switch (N->getOpcode()) {
    Operands = {N0, GetConstant(1.0), N1};
    Operands = {N1, GetConstant(-1.0), N0};
    Operands = {N0, N1, GetConstant(-0.0)};

  int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
  MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);
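// The rewrites above use exact identities so that bf16 add/sub/mul can be
// expressed with the FMA unit: a + b == fma(a, 1.0, b), a - b == fma(b, -1.0, a),
// and a * b == fma(a, b, -0.0); adding -0.0 is the identity under
// round-to-nearest and preserves the sign of a zero product. For the packed
// bf16x2 form, GetConstant splats the constant's bits into both lanes via
// API.concat(API).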
  if (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

    return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
    return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {
    OutOps.push_back(Base);
    OutOps.push_back(Offset);
void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
  NewOps[0] = N->getOperand(0);
  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4);

void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
  SDNode *Mov = CurDAG->getMachineNode(
      NVPTX::I128toV2I64, DL,
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
  assert(N->getOpcode() == ISD::ATOMIC_FENCE);
  unsigned int FenceOp =
      Scopes[N->getConstantOperandVal(2)], Subtarget);
  SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);

         "NVPTXScopes::operator[]");
  auto S = Scopes.find(ID);
  if (S == Scopes.end()) {
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)   \
  [&]() -> auto {                                                              \
    if (is_mc && is_ch)                                                        \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH);      \
    if (is_ch)                                                                 \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH);         \
    if (is_mc)                                                                 \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC);         \
    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, );              \
  }()
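// Example expansion (argument values are illustrative): with dir=G2S, dim=3D,
// mode=TILE, is_s32=false and suffix=_MC, CP_ASYNC_BULK_TENSOR_OPCODE pastes
// the tokens into NVPTX::CP_ASYNC_BULK_TENSOR_G2S_3D_TILE_MC; setting is_s32
// inserts the _SHARED32 infix instead.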
1902 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1923 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1930 bool IsCacheHint,
bool IsIm2Col) {
1935 IsCacheHint, IsShared32);
1938 IsCacheHint, IsShared32);
1941 IsCacheHint, IsShared32);
1944 "GetCpAsyncBulkTensorG2SOpcode.");
1950 IsCacheHint, IsShared32);
1953 IsCacheHint, IsShared32);
1956 IsCacheHint, IsShared32);
1959 IsCacheHint, IsShared32);
1962 IsCacheHint, IsShared32);
1965 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
    return 3;
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
    return 4;
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
    return 5;
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
                                                         bool IsIm2Col) {
  size_t NumOps = N->getNumOperands();
  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
  bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
  size_t NumBaseArgs = NumDims + NumOffsets + 3;
  size_t MultiCastIdx = NumBaseArgs + 2;

  unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
  if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
    report_fatal_error(
        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",

    Ops.push_back(N->getOperand(MultiCastIdx));
    Ops.push_back(N->getOperand(MultiCastIdx + 1));
  Ops.push_back(getI32Imm(CTAGroupVal, DL));

  Ops.push_back(N->getOperand(0));

      NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
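// The trailing constant operands of the g2s intrinsic act as selection flags:
// operand NumOps-3 requests multicast and NumOps-2 a cache hint (they choose
// the opcode variant and whether the corresponding multicast-mask /
// cache-hint operands are appended), while NumOps-1 carries the cta_group
// value, appended as an immediate and gated on subtarget support above.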
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                            unsigned RedOp,
                                                            bool IsIm2Col) {
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);

  Ops.push_back(getI32Imm(RedOp, DL));
  Ops.push_back(N->getOperand(0));

      NumDims, IsShared32, IsCacheHint, IsIm2Col);
#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
  if (!Subtarget->hasTcgen05InstSupport())
    report_fatal_error(
        "tcgen05.st is not supported on this architecture variant");

  for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)

                                       DL, N->getVTList(), Operands));
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  switch (IID) {
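// TMARedTy names the TMA reduction kind; CastTy merely converts that enum to
// the unsigned immediate forwarded as RedOp to
// SelectCpAsyncBulkTensorReduceCommon in the cases below.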
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
    SelectCpAsyncBulkTensorG2SCommon(N);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
    SelectCpAsyncBulkTensorG2SCommon(N, true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
                                        true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
                                        true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
                                        true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
                                        true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
                                        true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
                                        true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
                                        true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
                                        true);
    return true;

  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
    SelectTcgen05St(N);
    return true;
  }
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
    SelectTcgen05St(N, true);
    return true;
  }
void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
  Ops.append(N->op_begin() + 2, N->op_end());
              getI32Imm(getMemOrder(AN), dl),
              getI32Imm(getAtomicScope(AN), dl),

                        ? NVPTX::ATOM_EXCH_B128
                        : NVPTX::ATOM_CAS_B128;

  auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);
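// 128-bit atomic exchange and compare-and-swap are selected to
// ATOM_EXCH_B128 / ATOM_CAS_B128; the memory ordering and synchronization
// scope are passed as immediate operands (getMemOrder/getAtomicScope above),
// and the remaining operands of the node are forwarded through Ops.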