#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

static cl::opt<bool>
    EnableMADWide("nvptx-mad-wide-opt", cl::init(false), cl::Hidden,
                  cl::desc("Enable MAD wide optimization"));
NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {

bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }

bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
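
// Select() is the main entry point for instruction selection: it dispatches on
// the node's opcode to the try*/Select* helpers below. Nodes that are not
// handled here are presumably left to the auto-generated matcher.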
void NVPTXDAGToDAGISel::Select(SDNode *N) {
  if (N->isMachineOpcode()) {

  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_FENCE:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
    SelectSETP_BF16X2(N);
    if (tryLoadVector(N))
    if (tryStoreVector(N))
    if (tryIntrinsicChain(N))
    if (tryIntrinsicVoid(N))
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
    selectAtomicSwap128(N);
    if (tryBF16ArithToFMA(N))
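
// TCGEN05_LD_OPCODE expands to the packed or unpacked variant of a tcgen05.ld
// machine opcode for a given SHAPE/NUM pair, keyed off the enablePack flag of
// the opcode-lookup helper whose cases follow.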
#define TCGEN05_LD_OPCODE(SHAPE, NUM)                                          \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK                       \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)

  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
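
// SelectTcgen05Ld materializes the tcgen05.ld machine node. For the 16x32bx2
// shapes (hasOffset == true) the immediate offset operand is folded in as a
// target constant; the other shapes only carry the address and the chain.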
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
        "tcgen05.ld is not supported on this architecture variant");

    auto OffsetNode = CurDAG->getTargetConstant(
                       {N->getOperand(2), OffsetNode, N->getOperand(0)}));

                       {N->getOperand(2), N->getOperand(0)}));
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    SelectTcgen05Ld(N, true);

    return CmpMode::NotANumber;
  return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
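
// SelectSETP_F16X2 / SelectSETP_BF16X2 lower a packed two-lane compare into a
// single SETP_f16x2rr / SETP_bf16x2rr machine node producing two i1 results,
// with the compare mode and the flush-to-zero flag passed as immediates.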
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});

bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
  MVT EltVT = N->getSimpleValueType(0);
      CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);

bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  MVT VT = Vector.getSimpleValueType();
    Opcode = NVPTX::I32toV2I16;
    Opcode = NVPTX::I64toV2I32;
  for (auto *U : Vector.getNode()->users()) {
    if (U->getOperand(0) != Vector)
    if (const ConstantSDNode *IdxConst =
      if (IdxConst->getZExtValue() == 0)
      else if (IdxConst->getZExtValue() == 1)
      CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
  for (auto *Node : E1)

static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
  return convertAS(N->getMemOperand()->getAddrSpace())
  auto Ordering = N->getMergedOrdering();
  return Scopes[N->getSyncScopeID()];
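
// OperationOrderings pairs the ordering encoded on the memory instruction
// itself with the ordering of any separate fence that has to be emitted around
// it (e.g. a seq_cst access may be split into a fence plus a weaker access on
// targets without native seq_cst support).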
struct OperationOrderings {
  NVPTX::Ordering InstructionOrdering, FenceOrdering;
  OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
                     NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
      : InstructionOrdering(IO), FenceOrdering(FO) {}
static OperationOrderings
      !HasMemoryOrdering) {
        formatv("PTX does not support \"atomic\" for orderings different than "
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
  bool AddrGenericOrGlobalOrShared =
  if (!AddrGenericOrGlobalOrShared)
  bool UseRelaxedMMIO =
          formatv("PTX only supports Acquire Ordering on reads: {}",
                  N->getOperationName()));
          formatv("PTX only supports Release Ordering on writes: {}",
                  N->getOperationName()));
        formatv("NVPTX does not support AcquireRelease Ordering on "
                "yet and PTX does not support it on loads or stores: {}",
                N->getOperationName()));
    else if (N->writeMem())
          formatv("NVPTX does not support SequentiallyConsistent Ordering on "
                  "read-modify-writes yet: {}",
                  N->getOperationName()));
    return OperationOrderings(InstrOrder,
      formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",

  auto S = Scopes[N->getSyncScopeID()];
    Subtarget->failIfClustersUnsupported("cluster scope");

    T->failIfClustersUnsupported(".cluster scope fence");
  if (!T->hasSplitAcquireAndReleaseFences() &&
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acquire_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_release_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_seq_cst_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                    : NVPTX::INT_MEMBAR_GL;
      formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
              OrderingToString(O), ScopeToString(S)));
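
// insertMemoryInstructionFence computes the (ordering, scope) pair for a
// memory access and, when a fence ordering is required, threads an explicit
// fence node onto the chain before the access; the returned pair is what the
// load/store selection below encodes as immediate operands.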
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
  auto [InstructionOrdering, FenceOrdering] =
  auto Scope = getOperationScope(N, InstructionOrdering);
        formatv("Unexpected fence ordering: \"{}\".",
  return {InstructionOrdering, Scope};
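
// SelectAddrSpaceCast lowers addrspacecast to the cvta / cvta.to family:
// specific-to-generic casts use cvta.<space>, generic-to-specific casts use
// cvta.to.<space>, and 32-bit pointers are widened/narrowed around the
// conversion with CVT_u64_u32 / CVT_u32_u64 when compiling for 64-bit.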
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");
    if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {
      SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,
    switch (SrcAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_shared_cluster_64;
      Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
      Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
    if (SrcAddrSpace != 0)
    switch (DstAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_to_shared_cluster_64;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
    SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);
    if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {
      CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
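
// pickOpcodeForVT maps a scalar value type to the matching i16/i32/i64 flavour
// of a load/store (or LDG/LDU) opcode, returning std::nullopt for value types
// the caller does not handle.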
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
                std::optional<unsigned> Opcode_i32,
                std::optional<unsigned> Opcode_i64) {
    return std::nullopt;

  return V.getOpcode() == ISD::ADD ||
         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());

    N = N.getOperand(0);
                                          GA->getValueType(0), GA->getOffset(),
                                          GA->getTargetFlags());
                                          ES->getTargetFlags());

  APInt AccumulatedOffset(64u, 0);
      const APInt CI = CN->getAPIntValue().sext(64);
      if (!(CI + AccumulatedOffset).isSignedIntN(32))
      AccumulatedOffset += CI;
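
// tryLoad handles scalar LoadSDNode / ATOMIC_LOAD nodes: it inserts any
// required fence, derives the address space, width and used-bytes mask, and
// builds an LD_* machine node whose immediate operands encode ordering, scope,
// address space, type and width.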
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  assert(LD->readMem() && "Expected load");
  if (PlainLoad && PlainLoad->isIndexed())
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
  uint32_t UsedBytesMask;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD:
    UsedBytesMask = UINT32_MAX;
    UsedBytesMask = N->getConstantOperandVal(3);
         FromTypeWidth <= 128 && "Invalid width for load");
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),
                   getI32Imm(UsedBytesMask, DL),
  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  MachineMemOperand *MemRef = LD->getMemOperand();
  switch (N->getOpcode()) {
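
// tryLoadVector is the vector counterpart of tryLoad, selecting the
// LDV_*_v2/v4/v8 opcodes based on the element type and the number of lanes.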
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
  const MVT EltVT = LD->getSimpleValueType(0);
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
  const unsigned ExtensionType = N->getConstantOperandVal(4);
                              : NVPTX::PTXLdStInstCode::Untyped;
  const uint32_t UsedBytesMask = N->getConstantOperandVal(3);
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),
                   getI32Imm(UsedBytesMask, DL),
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                             NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
                             NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
                             NVPTX::LDV_i32_v8, {});
  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
  MachineMemOperand *MemRef = LD->getMemOperand();
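
// tryLDG selects non-coherent global loads (LD_GLOBAL_NC, i.e. ld.global.nc),
// which are presumably only legal for memory known not to be written while the
// kernel runs; both scalar and vector forms are handled here.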
bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
  unsigned ExtensionType;
  uint32_t UsedBytesMask;
    ExtensionType = Load->getExtensionType();
    UsedBytesMask = UINT32_MAX;
    ExtensionType = LD->getConstantOperandVal(4);
    UsedBytesMask = LD->getConstantOperandVal(3);
                              : NVPTX::PTXLdStInstCode::Untyped;
  assert(!(LD->getSimpleValueType(0).isVector() &&
                   getI32Imm(FromTypeWidth, DL),
                   getI32Imm(UsedBytesMask, DL),
  std::optional<unsigned> Opcode;
  switch (LD->getOpcode()) {
                             NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
    Opcode = pickOpcodeForVT(TargetVT, std::nullopt, NVPTX::LD_GLOBAL_NC_i32,
                             NVPTX::LD_GLOBAL_NC_i64);
                             NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
                             NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
                             NVPTX::LD_GLOBAL_NC_v8i32, {});
  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  auto ElementBitWidth = TotalWidth / NumElts;
         ElementBitWidth <= 128 && TotalWidth <= 256 &&
         "Invalid width for load");
  return ElementBitWidth;

bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                             NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
                             NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
                             NVPTX::LDU_GLOBAL_v4i32, {});
  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
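
// tryStore mirrors tryLoad for StoreSDNode / ATOMIC_STORE nodes, encoding
// ordering, scope, address space and width as immediates on the ST_* node.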
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  assert(ST->writeMem() && "Expected store");
  assert((PlainStore || AtomicStore) && "Expected store");
  if (PlainStore && PlainStore->isIndexed())
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
         "Invalid width for store");
                   getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(ToTypeWidth, DL),
  const std::optional<unsigned> Opcode =
                      NVPTX::ST_i32, NVPTX::ST_i64);
  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = ST->getMemOperand();
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
  for (auto &V : ST->ops().slice(1, NumElts))
    Ops.push_back(selectPossiblyImm(V));
  const unsigned ToTypeWidth = TotalWidth / NumElts;
         TotalWidth <= 256 && "Invalid width for store");
  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
              getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
      ST->getOperand(1).getSimpleValueType().SimpleTy;
  std::optional<unsigned> Opcode;
  switch (ST->getOpcode()) {
  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = ST->getMemOperand();
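
// tryBFE tries to fold shift-and-mask idioms (e.g. (x >> start) & mask, or a
// shl/shr pair) into a single BFE bit-field extract, picking the signed or
// unsigned 32/64-bit variant as appropriate.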
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  bool IsSigned = false;
    uint64_t MaskVal = Mask->getZExtValue();
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
      if (NumBits > GoodBits) {
      NumBits = NumZeros + NumOnes - ShiftAmt;
      if (ShiftAmt < NumZeros) {
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
      Val = LHS->getOperand(0);
      if (OuterShiftAmt < InnerShiftAmt) {
      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
      Opc = NVPTX::BFE_S32rii;
      Opc = NVPTX::BFE_U32rii;
      Opc = NVPTX::BFE_S64rii;
      Opc = NVPTX::BFE_U64rii;
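
// tryBF16ArithToFMA rewrites bf16 fadd/fsub/fmul into an FMA with a constant
// 1.0 / -1.0 / -0.0 operand (a + b => fma(a, 1.0, b), a - b => fma(b, -1.0, a),
// a * b => fma(a, b, -0.0)) for subtargets that lack native bf16 add/mul but
// do have bf16 FMA.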
bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
  EVT VT = SDValue(N, 0).getValueType();
  const NVPTXSubtarget *STI = TM.getSubtargetImpl();
    auto API = APF.bitcastToAPInt();
    API = API.concat(API);
      return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),
    return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);
  switch (N->getOpcode()) {
    Operands = {N0, GetConstant(1.0), N1};
    Operands = {N1, GetConstant(-1.0), N0};
    Operands = {N0, N1, GetConstant(-0.0)};
  int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
  MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);

  if (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
    return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),
    return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),

                                                  std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {
    OutOps.push_back(Base);
    OutOps.push_back(Offset);
void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
  NewOps[0] = N->getOperand(0);
  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4);

void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
  SDNode *Mov = CurDAG->getMachineNode(
      NVPTX::I128toV2I64, DL,
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
  assert(N->getOpcode() == ISD::ATOMIC_FENCE);
  unsigned int FenceOp =
                 Scopes[N->getConstantOperandVal(2)], Subtarget);
  SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);

         "NVPTXScopes::operator[]");
  auto S = Scopes.find(ID);
  if (S == Scopes.end()) {
    auto scopeName = Context->getSyncScopeName(ID);
    assert(scopeName.has_value() && "Scope name must exist.");
    for (const auto &Entry : Scopes) {
      if (auto name = Context->getSyncScopeName(Entry.first))
        formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n"
                "Supported syncscopes are: {2}.",
                scopeName.value(), int(ID),

#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
      ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix    \
      : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))

                     "GetCpAsyncBulkTensorS2GReductionOpcode.");
                     "GetCpAsyncBulkTensorS2GReductionOpcode.");
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);
  Ops.push_back(getI32Imm(RedOp, DL));
  Ops.push_back(N->getOperand(0));
      NumDims, IsShared32, IsCacheHint, IsIm2Col);

#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)

  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:

void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
  if (!Subtarget->hasTcgen05InstSupport())
        "tcgen05.st is not supported on this architecture variant");
  for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
                                        DL, N->getVTList(), Operands));
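
// tryIntrinsicVoid dispatches INTRINSIC_VOID nodes: the TMA tensor-reduce
// intrinsics are routed to SelectCpAsyncBulkTensorReduceCommon with the
// matching TMARedTy, and the tcgen05.st intrinsics to SelectTcgen05St.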
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
    SelectTcgen05St(N, true);
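
// selectAtomicSwap128 lowers 128-bit atomic exchange and cmpxchg to the
// ATOM_EXCH_B128 / ATOM_CAS_B128 pseudo-instructions, forwarding the memory
// ordering and scope of the original atomic node as immediate operands.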
void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
  Ops.append(N->op_begin() + 2, N->op_end());
              getI32Imm(getMemOrder(AN), dl),
              getI32Imm(getAtomicScope(AN), dl),
                          ? NVPTX::ATOM_EXCH_B128
                          : NVPTX::ATOM_CAS_B128;
  auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);