#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

static cl::opt<bool> EnableMADWide("nvptx-mad-wide-opt", cl::init(false),
                                   cl::Hidden,
                                   cl::desc("Enable MAD wide optimization"));
NVPTX::DivPrecisionLevel
NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
  return Subtarget->getTargetLowering()->getDivF32Level(*MF, *N);
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32(N);
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }

bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
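// Select is the per-node entry point for NVPTX instruction selection. It
// custom-selects the node kinds handled in the switch below (loads, stores,
// atomics, fences, address-space casts, packed-vector operations, and NVVM
// intrinsics); nodes not handled here are expected to fall through to the
// auto-generated pattern matcher.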
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {

  switch (N->getOpcode()) {

  case ISD::ATOMIC_LOAD:

  case ISD::ATOMIC_STORE:

  case ISD::ATOMIC_FENCE:

    if (tryEXTRACT_VECTOR_ELEMENT(N))

    SelectSETP_BF16X2(N);

    if (tryLoadVector(N))

    if (tryStoreVector(N))

    if (tryIntrinsicChain(N))

    if (tryIntrinsicVoid(N))

  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);

    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);

    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);

    selectAtomicSwap128(N);

    if (tryBF16ArithToFMA(N))
#define TCGEN05_LD_OPCODE(SHAPE, NUM)                                          \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK                       \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
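// The cases below are from getTcgen05LdOpcode(unsigned IID, bool enablePack),
// which maps each nvvm_tcgen05_ld_* intrinsic ID to its TCGEN05_LD machine
// opcode via the macro above, selecting the _PACK variant when packing is
// enabled.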
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
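// SelectTcgen05Ld lowers an nvvm_tcgen05_ld_* intrinsic node into the machine
// opcode chosen above. For the 16x32bx2 shapes (hasOffset == true) the
// immediate offset operand is materialized as a target constant and appended
// to the machine-node operands.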
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
  if (!Subtarget->hasTcgen05InstSupport())
    report_fatal_error(
        "tcgen05.ld is not supported on this architecture variant");

    auto OffsetNode = CurDAG->getTargetConstant(

            {N->getOperand(2), OffsetNode, N->getOperand(0)}));

            {N->getOperand(2), N->getOperand(0)}));
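// tryIntrinsicChain selects ISD::INTRINSIC_W_CHAIN nodes: the ldu.global.*
// intrinsics and the tcgen05.ld family. It returns true if it emitted a
// machine node for N.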
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);

  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:

  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {

  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    SelectTcgen05Ld(N, true);

    return CmpMode::NotANumber;

  return CurDAG->getTargetConstant(PTXCmpMode, SDLoc(), MVT::i32);
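// SelectSETP_F16X2 / SelectSETP_BF16X2 lower a packed f16x2 or bf16x2
// comparison into a single SETP instruction producing two i1 predicate
// results; the FTZ modifier is taken from useF32FTZ().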
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {

  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {

  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1,
      {N->getOperand(0), N->getOperand(1), PTXCmpMode,
       CurDAG->getTargetConstant(useF32FTZ() ? 1 : 0, DL, MVT::i1)});

bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {

  MVT EltVT = N->getSimpleValueType(0);

      CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
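// tryEXTRACT_VECTOR_ELEMENT unpacks a packed two-element vector held in a
// 32-bit or 64-bit register with a single unpack instruction and rewires every
// extract_vector_elt user with constant index 0 or 1 to the corresponding
// scalar result.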
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {

  MVT VT = Vector.getSimpleValueType();

    Opcode = NVPTX::I32toV2I16;
    Opcode = NVPTX::I64toV2I32;

  for (auto *U : Vector.getNode()->users()) {

    if (U->getOperand(0) != Vector)

    if (const ConstantSDNode *IdxConst =

      if (IdxConst->getZExtValue() == 0)

      else if (IdxConst->getZExtValue() == 1)

      CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector);

  for (auto *Node : E0)

  for (auto *Node : E1)
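// convertAS translates an LLVM address-space number into the NVPTX
// AddressSpace enum used in the load/store instruction encodings; getAddrSpace
// applies it to a memory node's MachineMemOperand.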
static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {

  return convertAS(N->getMemOperand()->getAddrSpace())

  auto Ordering = N->getMergedOrdering();

  return Scopes[N->getSyncScopeID()];

struct OperationOrderings {
  NVPTX::Ordering InstructionOrdering, FenceOrdering;
  OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
                     NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
      : InstructionOrdering(IO), FenceOrdering(FO) {}
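// getOperationOrderings decides, for a memory operation, which PTX ordering to
// encode on the instruction itself and which ordering (if any) must be
// enforced by an extra fence. It also diagnoses ordering/address-space
// combinations PTX cannot express, e.g. acquire on a write, release on a read,
// or seq_cst read-modify-writes on targets without memory ordering.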
static OperationOrderings

      !HasMemoryOrdering) {
        formatv("PTX does not support \"atomic\" for orderings different than"
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "

  bool AddrGenericOrGlobalOrShared =

  if (!AddrGenericOrGlobalOrShared)

  bool UseRelaxedMMIO =

          formatv("PTX only supports Acquire Ordering on reads: {}",
                  N->getOperationName()));

          formatv("PTX only supports Release Ordering on writes: {}",
                  N->getOperationName()));

        formatv("NVPTX does not support AcquireRelease Ordering on "
                "yet and PTX does not support it on loads or stores: {}",
                N->getOperationName()));

    else if (N->writeMem())
          formatv("NVPTX does not support SequentiallyConsistent Ordering on "
                  "read-modify-writes yet: {}",
                  N->getOperationName()));
    return OperationOrderings(InstrOrder,

      formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",

  auto S = Scopes[N->getSyncScopeID()];

  Subtarget->failIfClustersUnsupported("cluster scope");
  T->failIfClustersUnsupported(".cluster scope fence");

  if (!T->hasSplitAcquireAndReleaseFences() &&

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acquire_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_release_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
          formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",

    return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                  : NVPTX::INT_MEMBAR_SYS;
    return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                  : NVPTX::INT_MEMBAR_CTA;
    return NVPTX::atomic_thread_fence_seq_cst_cluster;
    return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                  : NVPTX::INT_MEMBAR_GL;

      formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
              OrderingToString(O), ScopeToString(S)));
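// insertMemoryInstructionFence emits any fence required by the operation's
// FenceOrdering in front of the memory instruction and returns the
// ordering/scope pair to encode on the instruction itself.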
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,

  auto [InstructionOrdering, FenceOrdering] =

  auto Scope = getOperationScope(N, InstructionOrdering);

        formatv("Unexpected fence ordering: \"{}\".",

  return {InstructionOrdering, Scope};
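// SelectAddrSpaceCast lowers an addrspacecast either to a cvta instruction
// (specific space -> generic) or a cvta.to instruction (generic -> specific),
// inserting 32<->64-bit pointer conversions when the source or destination
// space uses 32-bit pointers under the 64-bit ABI.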
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {

  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

    if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) {

      SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64,

    switch (SrcAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_shared_cluster_64;
      Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
      Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;

    if (SrcAddrSpace != 0)

    switch (DstAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
             "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_to_shared_cluster_64;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;

    SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src);

    if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) {

      CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32,
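// pickOpcodeForVT chooses between the i16/i32/i64 flavors of an instruction
// based on the in-register value type, returning std::nullopt when the type
// has no matching variant. For example, a call such as
// pickOpcodeForVT(MVT::i32, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64)
// would yield NVPTX::LD_i32.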
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
                std::optional<unsigned> Opcode_i32,
                std::optional<unsigned> Opcode_i64) {

    return std::nullopt;

  return V.getOpcode() == ISD::ADD ||
         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());

    N = N.getOperand(0);

                                          GA->getValueType(0), GA->getOffset(),
                                          GA->getTargetFlags());

                                           ES->getTargetFlags());

  APInt AccumulatedOffset(64u, 0);

      const APInt CI = CN->getAPIntValue().sext(64);
      if (!(CI + AccumulatedOffset).isSignedIntN(32))

      AccumulatedOffset += CI;
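// tryLoad selects plain and atomic scalar loads: it emits any fence required
// by the atomic ordering, encodes ordering/scope/address-space/width as
// immediate operands, and picks the LD_i16/i32/i64 opcode for the value type.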
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {

  assert(LD->readMem() && "Expected load");

  if (PlainLoad && PlainLoad->isIndexed())

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();

         FromTypeWidth <= 128 && "Invalid width for load");

                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),

  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);

  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  MachineMemOperand *MemRef = LD->getMemOperand();

  switch (N->getOpcode()) {

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  const MVT EltVT = LD->getSimpleValueType(0);

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);

  const unsigned ExtensionType =
      N->getConstantOperandVal(N->getNumOperands() - 1);
                         : NVPTX::PTXLdStInstCode::Untyped;

                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(FromType, DL),
                   getI32Imm(FromTypeWidth, DL),

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                             NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
                             NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
                             NVPTX::LDV_i32_v8, {});

  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  MachineMemOperand *MemRef = LD->getMemOperand();
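// tryLDG selects ld.global.nc ("LDG") loads for global loads that
// canLowerToLDG accepts; the scalar and v2/v4/v8 vector forms map onto the
// LD_GLOBAL_NC_* opcodes.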
bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {

  unsigned ExtensionType;

    ExtensionType = Load->getExtensionType();

    ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
                         : NVPTX::PTXLdStInstCode::Untyped;

  assert(!(LD->getSimpleValueType(0).isVector() &&

  std::optional<unsigned> Opcode;
  switch (LD->getOpcode()) {
                             NVPTX::LD_GLOBAL_NC_i32, NVPTX::LD_GLOBAL_NC_i64);
                             NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
                             NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
                             NVPTX::LD_GLOBAL_NC_v8i32, {});

  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);

  auto ElementBitWidth = TotalWidth / NumElts;
         ElementBitWidth <= 128 && TotalWidth <= 256 &&
         "Invalid width for load");
  return ElementBitWidth;

bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
                             NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
                             NVPTX::LDU_GLOBAL_v2i32, NVPTX::LDU_GLOBAL_v2i64);
                             NVPTX::LDU_GLOBAL_v4i32, {});

  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {

  assert(ST->writeMem() && "Expected store");

  assert((PlainStore || AtomicStore) && "Expected store");

  if (PlainStore && PlainStore->isIndexed())

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();

         "Invalid width for store");

                   getI32Imm(Ordering, DL),
                   getI32Imm(Scope, DL),
                   getI32Imm(CodeAddrSpace, DL),
                   getI32Imm(ToTypeWidth, DL),

  const std::optional<unsigned> Opcode =
                      NVPTX::ST_i32, NVPTX::ST_i64);

  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);

  MachineMemOperand *MemRef = ST->getMemOperand();

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {

  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();

  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);

  for (auto &V : ST->ops().slice(1, NumElts))
    Ops.push_back(selectPossiblyImm(V));

  const unsigned ToTypeWidth = TotalWidth / NumElts;

         TotalWidth <= 256 && "Invalid width for store");

  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
              getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,

      ST->getOperand(1).getSimpleValueType().SimpleTy;
  std::optional<unsigned> Opcode;
  switch (ST->getOpcode()) {

  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);

  MachineMemOperand *MemRef = ST->getMemOperand();
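// tryBFE matches shift/and patterns that extract a contiguous bitfield and
// selects the PTX bfe (bit field extract) instruction, in signed or unsigned
// and 32- or 64-bit form (BFE_S32rii / BFE_U32rii / BFE_S64rii / BFE_U64rii).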
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {

  bool IsSigned = false;

    uint64_t MaskVal = Mask->getZExtValue();

    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);

      int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
      if (NumBits > GoodBits) {

      NumBits = NumZeros + NumOnes - ShiftAmt;

      if (ShiftAmt < NumZeros) {

    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

      Val = LHS->getOperand(0);

      if (OuterShiftAmt < InnerShiftAmt) {

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,

      Opc = NVPTX::BFE_S32rii;
      Opc = NVPTX::BFE_U32rii;
      Opc = NVPTX::BFE_S64rii;
      Opc = NVPTX::BFE_U64rii;
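// tryBF16ArithToFMA rewrites bf16 and bf16x2 FADD/FSUB/FMUL nodes as an FMA
// (a*1+b, b*(-1)+a, a*b+(-0.0)) when native bf16 arithmetic is unavailable,
// materializing the required constants with mov immediates.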
bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
  EVT VT = SDValue(N, 0).getValueType();

  const NVPTXSubtarget *STI = TM.getSubtargetImpl();

      auto API = APF.bitcastToAPInt();
      API = API.concat(API);

      return SDValue(CurDAG->getMachineNode(NVPTX::MOV_B32_i, DL, VT, Const),

    return SDValue(CurDAG->getMachineNode(NVPTX::MOV_BF16_i, DL, VT, Const), 0);

  switch (N->getOpcode()) {
    Operands = {N0, GetConstant(1.0), N1};
    Operands = {N1, GetConstant(-1.0), N0};
    Operands = {N0, N1, GetConstant(-0.0)};

  int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
  MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, DL, VT, Operands);

  if (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

    return CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(V),

    return CurDAG->getTargetConstantFP(CN->getValueAPF(), SDLoc(V),

bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {

    OutOps.push_back(Base);
    OutOps.push_back(Offset);

void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {

  NewOps[0] = N->getOperand(0);

  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4);

void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {

  SDNode *Mov = CurDAG->getMachineNode(
      NVPTX::I128toV2I64, DL,

bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
  assert(N->getOpcode() == ISD::ATOMIC_FENCE);
  unsigned int FenceOp =
                 Scopes[N->getConstantOperandVal(2)], Subtarget);
  SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);

         "NVPTXScopes::operator[]");
  auto S = Scopes.find(ID);
  if (S == Scopes.end()) {

#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))

         "GetCpAsyncBulkTensorS2GReductionOpcode.");

         "GetCpAsyncBulkTensorS2GReductionOpcode.");
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,

  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);

  Ops.push_back(getI32Imm(RedOp, DL));
  Ops.push_back(N->getOperand(0));

      NumDims, IsShared32, IsCacheHint, IsIm2Col);

#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
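// As on the load side, the cases below are from getTcgen05StOpcode(unsigned
// IID, bool enableUnpack), mapping each nvvm_tcgen05_st_* intrinsic to its
// TCGEN05_ST machine opcode and choosing the _UNPACK variant when requested.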
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
  if (!Subtarget->hasTcgen05InstSupport())
    report_fatal_error(
        "tcgen05.st is not supported on this architecture variant");

  for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)

                                         DL, N->getVTList(), Operands));
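// tryIntrinsicVoid selects ISD::INTRINSIC_VOID nodes. The large switch below
// dispatches the cp.async.bulk.tensor.reduce.* intrinsics (per reduction kind
// and tile/im2col layout) and the tcgen05.st family; it returns true when a
// machine node was emitted.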
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);

  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));

  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),

  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {

  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
    SelectTcgen05St(N, true);
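// selectAtomicSwap128 lowers 128-bit atomic exchange and compare-and-swap
// nodes to ATOM_EXCH_B128 / ATOM_CAS_B128, forwarding the memory ordering and
// scope as immediate operands.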
void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {

  Ops.append(N->op_begin() + 2, N->op_end());

              getI32Imm(getMemOrder(AN), dl),
              getI32Imm(getAtomicScope(AN), dl),

                          ? NVPTX::ATOM_EXCH_B128
                          : NVPTX::ATOM_CAS_B128;

  auto *ATOM = CurDAG->getMachineNode(Opcode, dl, N->getVTList(), Ops);