24#include "llvm/IR/IntrinsicsNVPTX.h"
35#define DEBUG_TYPE "nvptx-isel"
36#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
40 cl::desc(
"Enable reciprocal sqrt optimization"));
47 cl::desc(
"Enable MAD wide optimization"));
76NVPTXDAGToDAGISel::getDivF32Level(
const SDNode *
N)
const {
80bool NVPTXDAGToDAGISel::usePrecSqrtF32(
const SDNode *
N)
const {
84bool NVPTXDAGToDAGISel::useF32FTZ()
const {
85 return Subtarget->getTargetLowering()->useF32FTZ(*
MF);
88bool NVPTXDAGToDAGISel::allowFMA()
const {
89 const NVPTXTargetLowering *TL =
Subtarget->getTargetLowering();
93bool NVPTXDAGToDAGISel::doRsqrtOpt()
const {
return EnableRsqrtOpt; }
95bool NVPTXDAGToDAGISel::doMADWideOpt()
const {
return EnableMADWide; }
99void NVPTXDAGToDAGISel::Select(
SDNode *
N) {
101 if (
N->isMachineOpcode()) {
106 switch (
N->getOpcode()) {
126 if (tryEXTRACT_VECTOR_ELEMENT(
N))
133 SelectSETP_BF16X2(
N);
138 if (tryLoadVector(
N))
149 if (tryStoreVector(
N))
153 if (tryIntrinsicChain(
N))
157 if (tryIntrinsicVoid(
N))
168 SelectAddrSpaceCast(
N);
171 if (
N->getOperand(1).getValueType() == MVT::i128) {
172 SelectV2I64toI128(
N);
178 if (
N->getOperand(1).getValueType() == MVT::i128) {
179 SelectI128toV2I64(
N);
186 selectAtomicSwap128(
N);
191 if (tryBF16ArithToFMA(
N))
195 return selectBR_JT(
N);
202#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
203 (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
204 : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
208 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
210 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
212 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
214 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
216 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
218 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
220 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
222 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
224 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
226 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
228 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
230 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
232 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
234 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
236 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
238 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
240 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
242 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
244 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
246 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
248 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
250 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
252 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
254 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
256 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
258 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
260 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
262 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
264 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
266 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
268 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
270 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
276 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
278 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
280 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
286void NVPTXDAGToDAGISel::SelectTcgen05Ld(
SDNode *
N,
bool hasOffset) {
289 "tcgen05.ld is not supported on this architecture variant");
296 auto OffsetNode =
CurDAG->getTargetConstant(
300 {N->getOperand(2), OffsetNode, N->getOperand(0)}));
305 {N->getOperand(2), N->getOperand(0)}));
309bool NVPTXDAGToDAGISel::tryIntrinsicChain(
SDNode *
N) {
310 unsigned IID =
N->getConstantOperandVal(1);
314 case Intrinsic::nvvm_ldu_global_f:
315 case Intrinsic::nvvm_ldu_global_i:
316 case Intrinsic::nvvm_ldu_global_p:
319 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
320 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
321 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
322 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
323 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
324 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
325 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
326 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
327 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
328 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
329 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
330 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
331 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
332 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
333 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
334 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
335 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
336 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
337 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
338 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
339 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
340 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
341 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
342 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
343 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
344 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
345 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
346 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
347 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
352 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
353 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
354 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
355 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
356 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
357 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
358 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
359 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
360 SelectTcgen05Ld(
N,
true);
395 return CmpMode::NotANumber;
410 return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
413bool NVPTXDAGToDAGISel::SelectSETP_F16X2(
SDNode *
N) {
416 SDNode *SetP =
CurDAG->getMachineNode(
417 NVPTX::SETP_f16x2rr,
DL, MVT::i1, MVT::i1,
418 {
N->getOperand(0),
N->getOperand(1), PTXCmpMode,
419 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0,
DL, MVT::i1)});
424bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(
SDNode *
N) {
427 SDNode *SetP =
CurDAG->getMachineNode(
428 NVPTX::SETP_bf16x2rr,
DL, MVT::i1, MVT::i1,
429 {
N->getOperand(0),
N->getOperand(1), PTXCmpMode,
430 CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0,
DL, MVT::i1)});
435bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(
SDNode *
N) {
437 MVT EltVT =
N->getSimpleValueType(0);
440 CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(
N), EltVT, EltVT,
Vector);
448bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(
SDNode *
N) {
451 MVT VT =
Vector.getSimpleValueType();
457 Opcode = NVPTX::I32toV2I16;
459 Opcode = NVPTX::I64toV2I32;
465 for (
auto *U :
Vector.getNode()->users()) {
468 if (
U->getOperand(0) !=
Vector)
470 if (
const ConstantSDNode *IdxConst =
472 if (IdxConst->getZExtValue() == 0)
474 else if (IdxConst->getZExtValue() == 1)
490 CurDAG->getMachineNode(Opcode, SDLoc(
N), EltVT, EltVT,
Vector);
491 for (
auto *Node : E0)
493 for (
auto *Node : E1)
520 auto Ordering =
N->getMergedOrdering();
554struct OperationOrderings {
555 NVPTX::Ordering InstructionOrdering, FenceOrdering;
556 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
557 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
558 : InstructionOrdering(IO), FenceOrdering(FO) {}
561static OperationOrderings
664 !HasMemoryOrdering) {
666 formatv(
"PTX does not support \"atomic\" for orderings different than"
667 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
679 bool AddrGenericOrGlobalOrShared =
684 if (!AddrGenericOrGlobalOrShared)
687 bool UseRelaxedMMIO =
709 formatv(
"PTX only supports Acquire Ordering on reads: {}",
710 N->getOperationName()));
715 formatv(
"PTX only supports Release Ordering on writes: {}",
716 N->getOperationName()));
720 formatv(
"NVPTX does not support AcquireRelease Ordering on "
722 "yet and PTX does not support it on loads or stores: {}",
723 N->getOperationName()));
736 else if (
N->writeMem())
740 formatv(
"NVPTX does not support SequentiallyConsistent Ordering on "
741 "read-modify-writes yet: {}",
742 N->getOperationName()));
743 return OperationOrderings(InstrOrder,
748 formatv(
"NVPTX backend does not support AtomicOrdering \"{}\" yet.",
771 auto S = Scopes[
N->getSyncScopeID()];
794 if (!
T->hasSplitAcquireAndReleaseFences() &&
802 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
803 : NVPTX::INT_MEMBAR_SYS;
805 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
806 : NVPTX::INT_MEMBAR_CTA;
808 return NVPTX::atomic_thread_fence_acquire_cluster;
810 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
811 : NVPTX::INT_MEMBAR_GL;
815 formatv(
"Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
822 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
823 : NVPTX::INT_MEMBAR_SYS;
825 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
826 : NVPTX::INT_MEMBAR_CTA;
828 return NVPTX::atomic_thread_fence_release_cluster;
830 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
831 : NVPTX::INT_MEMBAR_GL;
835 formatv(
"Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
842 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
843 : NVPTX::INT_MEMBAR_SYS;
845 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
846 : NVPTX::INT_MEMBAR_CTA;
848 return NVPTX::atomic_thread_fence_acq_rel_cluster;
850 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
851 : NVPTX::INT_MEMBAR_GL;
855 formatv(
"Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
863 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
864 : NVPTX::INT_MEMBAR_SYS;
866 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
867 : NVPTX::INT_MEMBAR_CTA;
869 return NVPTX::atomic_thread_fence_seq_cst_cluster;
871 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
872 : NVPTX::INT_MEMBAR_GL;
885 formatv(
"Unsupported \"{}\" ordering and \"{}\" scope for fence.",
886 OrderingToString(O), ScopeToString(S)));
894std::pair<NVPTX::Ordering, NVPTX::Scope>
895NVPTXDAGToDAGISel::insertMemoryInstructionFence(
SDLoc DL,
SDValue &Chain,
897 auto [InstructionOrdering, FenceOrdering] =
899 auto Scope = getOperationScope(
N, InstructionOrdering);
921 formatv(
"Unexpected fence ordering: \"{}\".",
924 return {InstructionOrdering,
Scope};
927void NVPTXDAGToDAGISel::SelectAddrSpaceCast(
SDNode *
N) {
933 assert(SrcAddrSpace != DstAddrSpace &&
934 "addrspacecast must be between different address spaces");
939 if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
942 SDNode *Cvt =
CurDAG->getMachineNode(NVPTX::CVT_u64_u32,
DL, MVT::i64,
948 switch (SrcAddrSpace) {
951 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
954 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
959 "Shared cluster address space is only supported in 64-bit mode");
960 Opc = NVPTX::cvta_shared_cluster_64;
963 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
966 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
969 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
976 if (SrcAddrSpace != 0)
979 switch (DstAddrSpace) {
982 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
985 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
990 "Shared cluster address space is only supported in 64-bit mode");
991 Opc = NVPTX::cvta_to_shared_cluster_64;
994 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
997 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
1000 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
1004 SDNode *CVTA =
CurDAG->getMachineNode(
Opc,
DL,
N->getValueType(0), Src);
1005 if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
1008 CVTA =
CurDAG->getMachineNode(NVPTX::CVT_u32_u64,
DL, MVT::i32,
1019static std::optional<unsigned>
1021 std::optional<unsigned> Opcode_i32,
1022 std::optional<unsigned> Opcode_i64) {
1041 return std::nullopt;
1046 return V.getOpcode() ==
ISD::ADD ||
1047 (V->getOpcode() ==
ISD::OR && V->getFlags().hasDisjoint());
1052 N =
N.getOperand(0);
1062 GA->getValueType(0), GA->getOffset(),
1063 GA->getTargetFlags());
1066 ES->getTargetFlags());
1075 APInt AccumulatedOffset(64u, 0);
1081 const APInt CI = CN->getAPIntValue().
sext(64);
1082 if (!(CI + AccumulatedOffset).isSignedIntN(32))
1085 AccumulatedOffset += CI;
1111bool NVPTXDAGToDAGISel::tryLoad(
SDNode *
N) {
1113 assert(
LD->readMem() &&
"Expected load");
1117 if (PlainLoad && PlainLoad->
isIndexed())
1127 const auto [
Ordering,
Scope] = insertMemoryInstructionFence(
DL, Chain, LD);
1129 const unsigned FromTypeWidth =
LD->getMemoryVT().getSizeInBits();
1137 uint32_t UsedBytesMask;
1138 switch (
N->getOpcode()) {
1141 UsedBytesMask = UINT32_MAX;
1144 UsedBytesMask =
N->getConstantOperandVal(3);
1151 FromTypeWidth <= 128 &&
"Invalid width for load");
1156 getI32Imm(Scope,
DL),
1157 getI32Imm(CodeAddrSpace,
DL),
1158 getI32Imm(FromType,
DL),
1159 getI32Imm(FromTypeWidth,
DL),
1160 getI32Imm(UsedBytesMask,
DL),
1166 const std::optional<unsigned> Opcode =
1167 pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
1171 SDNode *NVPTXLD =
CurDAG->getMachineNode(*Opcode,
DL,
LD->getVTList(),
Ops);
1175 MachineMemOperand *MemRef =
LD->getMemOperand();
1183 switch (
N->getOpcode()) {
1195bool NVPTXDAGToDAGISel::tryLoadVector(
SDNode *
N) {
1203 const MVT EltVT =
LD->getSimpleValueType(0);
1206 const auto [
Ordering,
Scope] = insertMemoryInstructionFence(
DL, Chain, LD);
1216 const unsigned ExtensionType =
N->getConstantOperandVal(4);
1219 : NVPTX::PTXLdStInstCode::
Untyped;
1222 const uint32_t UsedBytesMask =
N->getConstantOperandVal(3);
1228 getI32Imm(Scope,
DL),
1229 getI32Imm(CodeAddrSpace,
DL),
1230 getI32Imm(FromType,
DL),
1231 getI32Imm(FromTypeWidth,
DL),
1232 getI32Imm(UsedBytesMask,
DL),
1237 std::optional<unsigned> Opcode;
1238 switch (
N->getOpcode()) {
1243 NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
1247 NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
1251 NVPTX::LDV_i32_v8, {});
1257 SDNode *NVPTXLD =
CurDAG->getMachineNode(*Opcode,
DL,
LD->getVTList(),
Ops);
1259 MachineMemOperand *MemRef =
LD->getMemOperand();
1266bool NVPTXDAGToDAGISel::tryLDG(
MemSDNode *LD) {
1269 unsigned ExtensionType;
1270 uint32_t UsedBytesMask;
1272 ExtensionType =
Load->getExtensionType();
1273 UsedBytesMask = UINT32_MAX;
1275 ExtensionType =
LD->getConstantOperandVal(4);
1276 UsedBytesMask =
LD->getConstantOperandVal(3);
1280 : NVPTX::PTXLdStInstCode::
Untyped;
1284 assert(!(
LD->getSimpleValueType(0).isVector() &&
1289 getI32Imm(FromTypeWidth,
DL),
1290 getI32Imm(UsedBytesMask,
DL),
1296 std::optional<unsigned> Opcode;
1297 switch (
LD->getOpcode()) {
1302 NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
1305 Opcode =
pickOpcodeForVT(TargetVT, std::nullopt, NVPTX::LD_GLOBAL_NC_i32,
1306 NVPTX::LD_GLOBAL_NC_i64);
1311 NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
1316 NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
1320 NVPTX::LD_GLOBAL_NC_v8i32, {});
1326 SDNode *NVPTXLDG =
CurDAG->getMachineNode(*Opcode,
DL,
LD->getVTList(),
Ops);
1335 auto ElementBitWidth = TotalWidth / NumElts;
1337 ElementBitWidth <= 128 && TotalWidth <= 256 &&
1338 "Invalid width for load");
1339 return ElementBitWidth;
1342bool NVPTXDAGToDAGISel::tryLDU(
SDNode *
N) {
1357 std::optional<unsigned> Opcode;
1358 switch (
N->getOpcode()) {
1363 NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
1367 NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
1371 NVPTX::LDU_GLOBAL_v4i32, {});
1377 SDNode *NVPTXLDU =
CurDAG->getMachineNode(*Opcode,
DL,
LD->getVTList(),
Ops);
1383bool NVPTXDAGToDAGISel::tryStore(
SDNode *
N) {
1385 assert(
ST->writeMem() &&
"Expected store");
1388 assert((PlainStore || AtomicStore) &&
"Expected store");
1391 if (PlainStore && PlainStore->
isIndexed())
1399 const auto [
Ordering,
Scope] = insertMemoryInstructionFence(
DL, Chain, ST);
1402 const unsigned ToTypeWidth =
ST->getMemoryVT().getSizeInBits();
1408 "Invalid width for store");
1412 getI32Imm(Ordering,
DL),
1413 getI32Imm(Scope,
DL),
1414 getI32Imm(CodeAddrSpace,
DL),
1415 getI32Imm(ToTypeWidth,
DL),
1420 const std::optional<unsigned> Opcode =
1422 NVPTX::ST_i32, NVPTX::ST_i64);
1426 SDNode *NVPTXST =
CurDAG->getMachineNode(*Opcode,
DL, MVT::Other,
Ops);
1431 MachineMemOperand *MemRef =
ST->getMemOperand();
1437bool NVPTXDAGToDAGISel::tryStoreVector(
SDNode *
N) {
1439 const unsigned TotalWidth =
ST->getMemoryVT().getSizeInBits();
1450 const auto [
Ordering,
Scope] = insertMemoryInstructionFence(
DL, Chain, ST);
1455 for (
auto &V :
ST->ops().slice(1, NumElts))
1456 Ops.push_back(selectPossiblyImm(V));
1458 const unsigned ToTypeWidth = TotalWidth / NumElts;
1461 TotalWidth <= 256 &&
"Invalid width for store");
1464 Ops.append({getI32Imm(Ordering,
DL), getI32Imm(Scope,
DL),
1465 getI32Imm(CodeAddrSpace,
DL), getI32Imm(ToTypeWidth,
DL),
Base,
1469 ST->getOperand(1).getSimpleValueType().SimpleTy;
1470 std::optional<unsigned> Opcode;
1471 switch (
ST->getOpcode()) {
1491 SDNode *NVPTXST =
CurDAG->getMachineNode(*Opcode,
DL, MVT::Other,
Ops);
1493 MachineMemOperand *MemRef =
ST->getMemOperand();
1502bool NVPTXDAGToDAGISel::tryBFE(
SDNode *
N) {
1509 bool IsSigned =
false;
1525 uint64_t MaskVal =
Mask->getZExtValue();
1535 Len =
CurDAG->getTargetConstant(NumBits,
DL, MVT::i32);
1539 Val =
LHS.getNode()->getOperand(0);
1540 Start =
LHS.getNode()->getOperand(1);
1546 int64_t GoodBits =
Start.getValueSizeInBits() - StartVal;
1547 if (NumBits > GoodBits) {
1605 NumBits = NumZeros + NumOnes - ShiftAmt;
1611 if (ShiftAmt < NumZeros) {
1619 Len =
CurDAG->getTargetConstant(NumBits,
DL, MVT::i32);
1635 Val =
LHS->getOperand(0);
1654 if (OuterShiftAmt < InnerShiftAmt) {
1665 Start =
CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt,
DL,
1690 Opc = NVPTX::BFE_S32rii;
1692 Opc = NVPTX::BFE_U32rii;
1696 Opc = NVPTX::BFE_S64rii;
1698 Opc = NVPTX::BFE_U64rii;
1714bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(
SDNode *
N) {
1715 EVT VT =
SDValue(
N, 0).getValueType();
1719 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1736 auto API = APF.bitcastToAPInt();
1737 API = API.concat(API);
1739 return SDValue(
CurDAG->getMachineNode(NVPTX::MOV_B32_i,
DL, VT, Const),
1743 return SDValue(
CurDAG->getMachineNode(NVPTX::MOV_BF16_i,
DL, VT, Const), 0);
1746 switch (
N->getOpcode()) {
1749 Operands = {N0, GetConstant(1.0), N1};
1753 Operands = {N1, GetConstant(-1.0), N0};
1758 Operands = {N0, N1, GetConstant(-0.0)};
1764 int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
1765 MachineSDNode *
FMA =
CurDAG->getMachineNode(Opcode,
DL, VT, Operands);
1772 V =
V.getOperand(0);
1775 return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
1778 return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),
1787 std::vector<SDValue> &OutOps) {
1788 switch (ConstraintID) {
1793 OutOps.push_back(
Base);
1794 OutOps.push_back(
Offset);
1801void NVPTXDAGToDAGISel::SelectV2I64toI128(
SDNode *
N) {
1820 NewOps[0] =
N->getOperand(0);
1823 if (
N->getNumOperands() == 5)
1824 NewOps[3] =
N->getOperand(4);
1830void NVPTXDAGToDAGISel::SelectI128toV2I64(
SDNode *
N) {
1847 SDNode *Mov =
CurDAG->getMachineNode(
1848 NVPTX::I128toV2I64,
DL,
1855bool NVPTXDAGToDAGISel::tryFence(
SDNode *
N) {
1858 auto Scope = Scopes[
N->getConstantOperandVal(2)];
1872 SDNode *FenceNode =
CurDAG->getMachineNode(FenceOp,
DL, MVT::Other, Chain);
1888 "NVPTXScopes::operator[]");
1890 auto S = Scopes.find(
ID);
1891 if (S == Scopes.end()) {
1892 auto scopeName = Context->getSyncScopeName(
ID);
1893 assert(scopeName.has_value() &&
"Scope name must exist.");
1897 for (
const auto &Entry : Scopes) {
1898 if (
auto name = Context->getSyncScopeName(Entry.first))
1903 formatv(
"NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n"
1904 "Supported syncscopes are: {2}.",
1905 scopeName.value(),
int(
ID),
1913#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
1915 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
1916 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
1918#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
1919 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
1920 : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
1939 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1960 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1965void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(
SDNode *
N,
1972 size_t NumOps =
N->getNumOperands();
1973 size_t NumDims =
NumOps - 6;
1974 bool IsCacheHint =
N->getConstantOperandVal(
NumOps - 1) == 1;
1975 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);
1979 Ops.push_back(getI32Imm(RedOp,
DL));
1980 Ops.push_back(
N->getOperand(0));
1985 NumDims, IsShared32, IsCacheHint, IsIm2Col);
1989#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
1990 (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
1991 : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
1995 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
1997 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
1999 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2001 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2003 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2005 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2007 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2009 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2011 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2013 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2015 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2017 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2019 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2021 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2023 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2025 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2027 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2029 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2031 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2033 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2035 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2037 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2039 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2041 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2043 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2045 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2047 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2049 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2051 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2053 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2055 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2057 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2059 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2061 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2063 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2065 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2067 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2073void NVPTXDAGToDAGISel::SelectTcgen05St(
SDNode *
N,
bool hasOffset) {
2074 if (!
Subtarget->hasTcgen05InstSupport())
2076 "tcgen05.st is not supported on this architecture variant");
2090 for (
unsigned I = hasOffset ? 4 : 3;
I < (
N->getNumOperands() - 1);
I++)
2099 DL,
N->getVTList(), Operands));
2102bool NVPTXDAGToDAGISel::tryIntrinsicVoid(
SDNode *
N) {
2103 unsigned IID =
N->getConstantOperandVal(1);
2105 auto CastTy = [](TMARedTy
Op) {
return static_cast<unsigned>(
Op); };
2109 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2110 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2111 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2112 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2113 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2114 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::ADD));
2116 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2117 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2118 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2119 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::ADD),
2122 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2123 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2124 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2125 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2126 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2127 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::MIN));
2129 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2130 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2131 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2132 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::MIN),
2135 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2136 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2137 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2138 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2139 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2140 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::MAX));
2142 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2143 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2144 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2145 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::MAX),
2148 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2149 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2150 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2151 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2152 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2153 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::INC));
2155 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2156 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2157 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2158 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::INC),
2161 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2162 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2163 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2164 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2165 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2166 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::DEC));
2168 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2169 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2170 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2171 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::DEC),
2174 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2175 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2176 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2177 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2178 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2179 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::AND));
2181 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2182 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2183 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2184 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::AND),
2187 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2188 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2189 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2190 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2191 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2192 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::OR));
2194 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2195 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2196 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2197 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::OR),
2200 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2201 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2202 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2203 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2204 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2205 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::XOR));
2207 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2208 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2209 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2210 SelectCpAsyncBulkTensorReduceCommon(
N, CastTy(TMARedTy::XOR),
2214 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2215 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2216 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2217 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2218 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2219 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2220 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2221 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2222 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2223 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2224 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2225 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2226 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2227 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2228 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2229 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2230 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2231 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2232 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2233 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2234 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2235 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2236 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2237 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2238 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2239 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2240 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2241 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2242 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2247 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2248 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2249 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2250 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2251 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2252 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2253 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2254 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2255 SelectTcgen05St(
N,
true);
2261void NVPTXDAGToDAGISel::selectAtomicSwap128(
SDNode *
N) {
2268 Ops.append(
N->op_begin() + 2,
N->op_end());
2270 getI32Imm(getMemOrder(AN), dl),
2271 getI32Imm(getAtomicScope(AN), dl),
2279 ? NVPTX::ATOM_EXCH_B128
2280 : NVPTX::ATOM_CAS_B128;
2282 auto *ATOM =
CurDAG->getMachineNode(Opcode, dl,
N->getVTList(),
Ops);
2288void NVPTXDAGToDAGISel::selectBR_JT(
SDNode *
N) {
2290 "BR_JT should be expanded during legalization on unsupported targets");
2293 const SDValue InChain =
N->getOperand(0);
2297 unsigned JId = JT->getIndex();
2298 MachineJumpTableInfo *MJTI =
CurDAG->getMachineFunction().getJumpTableInfo();
2304 MachineSDNode *Chain =
CurDAG->getMachineNode(
2305 NVPTX::BRX_START,
DL, {MVT::Other, MVT::Glue}, {IdV, InChain});
2310 Chain =
CurDAG->getMachineNode(
2311 NVPTX::BRX_ITEM,
DL, {MVT::Other, MVT::Glue},
2315 MachineSDNode *BrxEnd =
2316 CurDAG->getMachineNode(NVPTX::BRX_END,
DL, MVT::Other,
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static NVPTX::Scope resolveScope(NVPTX::Scope S, const NVPTXSubtarget *T)
static unsigned getStoreVectorNumElts(SDNode *N)
static bool isAddLike(const SDValue V)
static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG)
static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG)
static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack)
static std::optional< unsigned > pickOpcodeForVT(MVT::SimpleValueType VT, std::optional< unsigned > Opcode_i16, std::optional< unsigned > Opcode_i32, std::optional< unsigned > Opcode_i64)
static cl::opt< bool > EnableMADWide("nvptx-mad-wide-opt", cl::init(false), cl::Hidden, cl::desc("Enable MAD wide optimization"))
static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, bool IsIm2Col)
#define TCGEN05_LD_OPCODE(SHAPE, NUM)
static SDValue stripAssertAlign(SDValue N)
static cl::opt< bool > EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden, cl::desc("Enable reciprocal sqrt optimization"))
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, NVPTXSubtarget const *T)
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)
#define TCGEN05_ST_OPCODE(SHAPE, NUM)
static std::pair< SDValue, SDValue > selectADDR(SDValue Addr, SelectionDAG *DAG)
static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack)
static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget, NVPTX::AddressSpace CodeAddrSpace)
This file contains the definitions of the enumerations and flags associated with NVVM Intrinsics,...
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
static const fltSemantics & BFloat()
static constexpr roundingMode rmNearestTiesToEven
Class for arbitrary precision integers.
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
int64_t getSExtValue() const
Get sign extended value.
unsigned getSrcAddressSpace() const
unsigned getDestAddressSpace() const
const T & back() const
back - Get the last element.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
const SDValue & getVal() const
uint64_t getZExtValue() const
FunctionPass class - This class is used to implement most global optimizations.
This is an important class for using LLVM in a threaded context.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT getVectorElementType() const
bool is64BitVector() const
Return true if this is a 64-bit vector type.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
NVPTXDAGToDAGISel()=delete
static NVPTX::AddressSpace getAddrSpace(const MemSDNode *N)
bool SelectInlineAsmMemoryOperand(const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, std::vector< SDValue > &OutOps) override
SelectInlineAsmMemoryOperand - Implement addressing mode selection for inline asm expressions.
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
const NVPTXSubtarget * Subtarget
const NVPTXTargetLowering * getTargetLowering() const override
bool hasNativeBF16Support(int Opcode) const
bool hasRelaxedMMIO() const
bool hasMemoryOrdering() const
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getTargetFrameIndex(int FI, EVT VT)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
const SDValue & getValue() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ FADD
Simple binary floating point operators.
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BR_JT
BR_JT - Jumptable branch.
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ AssertAlign
AssertAlign - These nodes record if a register contains a value that has a known alignment and the tr...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SHL
Shift and rotation operations.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ ADDRESS_SPACE_ENTRY_PARAM
@ ADDRESS_SPACE_SHARED_CLUSTER
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
std::string OrderingToString(Ordering Order)
bool isPackedVectorTy(EVT VT)
initializer< Ty > init(const Ty &Val)
Scope
Defines the scope in which this symbol should be visible: Default – Visible in the public interface o...
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOptLevel OptLevel)
createNVPTXISelDag - This pass converts a legalized DAG into a NVPTX-specific DAG,...
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
auto formatv(bool Validate, const char *Fmt, Ts &&...Vals)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
AtomicOrdering
Atomic ordering for LLVM's memory model.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Implement std::hash so that hash_code can be used in STL containers.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
NVPTX::Scope operator[](SyncScope::ID ID) const