#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));
int NVPTXDAGToDAGISel::getDivF32Level() const {

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {

bool NVPTXDAGToDAGISel::useF32FTZ() const {

bool NVPTXDAGToDAGISel::allowFMA() const {

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
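// Editorial note (not in the original file): doRsqrtOpt() simply forwards the
// "nvptx-rsqrt-approx-opt" cl::opt defined above, so the rsqrt.approx rewrite
// can be switched off from the command line. A hedged usage sketch, assuming
// the flag is reachable through llc's usual cl::opt parsing:
//   llc -march=nvptx64 -nvptx-rsqrt-approx-opt=false kernel.ll -o kernel.ptx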
void NVPTXDAGToDAGISel::Select(SDNode *N) {
  if (N->isMachineOpcode()) {

  switch (N->getOpcode()) {
    if (tryEXTRACT_VECTOR_ELEMENT(N))
    SelectSETP_BF16X2(N);
    if (tryLoadVector(N))
    if (tryStoreVector(N))
    if (tryStoreRetval(N))
    if (tryStoreParam(N))
    if (tryIntrinsicNoChain(N))
    if (tryIntrinsicChain(N))
    if (tryIntrinsicVoid(N))
    SelectAddrSpaceCast(N);
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
    if (tryBF16ArithToFMA(N))
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:

    return CmpMode::NotANumber;
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  for (auto *U : Vector.getNode()->users()) {
    if (U->getOperand(0) != Vector)
        dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
      else if (IdxConst->getZExtValue() == 1)
  for (auto *Node : E0)
  for (auto *Node : E1)

  const Value *Src = N->getMemOperand()->getValue();
  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
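// Editorial sketch (not part of the original listing): getCodeAddrSpace()
// picks the PTX state space for the load/store from the pointer's LLVM
// address space. A minimal illustration, with hypothetical AS_* constants
// standing in for the backend's own enumerators and the numeric address
// spaces 1/3/4/5 (global/shared/const/local) being an assumption taken from
// the NVPTX conventions:
//   switch (PT->getAddressSpace()) {
//   case 1:  return AS_GLOBAL;
//   case 3:  return AS_SHARED;
//   case 4:  return AS_CONST;
//   case 5:  return AS_LOCAL;
//   default: return AS_GENERIC;
//   }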
struct OperationOrderings {

static OperationOrderings
      !HasMemoryOrdering) {
        formatv("PTX does not support \"atomic\" for orderings different than "
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "

    bool AddrGenericOrGlobalOrShared =
    if (!AddrGenericOrGlobalOrShared)
    bool UseRelaxedMMIO =

          formatv("PTX only supports Acquire Ordering on reads: {}",
                  N->getOperationName()));
          formatv("PTX only supports Release Ordering on writes: {}",
                  N->getOperationName()));
          formatv("NVPTX does not support AcquireRelease Ordering on "
                  "read-modify-writes "
                  "yet and PTX does not support it on loads or stores: {}",
                  N->getOperationName()));
    else if (N->writeMem())
          formatv("NVPTX does not support SequentiallyConsistent Ordering on "
                  "read-modify-writes yet: {}",
                  N->getOperationName()));
    return OperationOrderings(InstrOrder,
        formatv("NVPTX backend does not support AtomicOrdering \"{}\" yet.",

  auto S = Scopes[N->getSyncScopeID()];
  if (N->isInvariant())
  if (auto *A = dyn_cast<const Argument>(V))
    return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
  if (auto *GV = dyn_cast<const GlobalVariable>(V))
    return GV->isConstant();

    T->failIfClustersUnsupported(".cluster scope fence");
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
        formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                    : NVPTX::INT_MEMBAR_SYS;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                    : NVPTX::INT_MEMBAR_CTA;
      return NVPTX::atomic_thread_fence_seq_cst_cluster;
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                    : NVPTX::INT_MEMBAR_GL;
      formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
              OrderingToString(O), ScopeToString(S)));
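// Editorial note (not in the original file): getFenceOp() falls back to the
// legacy membar instructions (INT_MEMBAR_SYS/CTA/GL) whenever the subtarget
// reports no PTX memory-ordering support, and otherwise emits the matching
// atomic_thread_fence_* instruction; cluster-scope fences have no membar
// fallback and first go through failIfClustersUnsupported().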
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
        formatv("Unexpected fence ordering: \"{}\".",

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(0);
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
                                  MVT::i64, GlobalVal));
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

    switch (SrcAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
      Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;

    if (SrcAddrSpace != 0)
    switch (DstAddrSpace) {
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
      Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
      Opc = TM.is64Bit() ? NVPTX::IMOV64rr : NVPTX::IMOV32rr;
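// Editorial note (not in the original file): the cvta_* opcodes convert an
// address in a specific space (global/shared/const/local) to a generic
// pointer, the cvta_to_* opcodes convert a generic pointer back into a
// specific space, and the remaining case falls back to a plain register move
// (IMOV32rr/IMOV64rr).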
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
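// Editorial sketch (not part of the original listing): pickOpcodeForVT()
// selects one of the per-type opcodes supplied by the caller, with the i64
// and f64 variants optional because some vector forms have no 64-bit flavour.
// A minimal sketch of the dispatch, assuming i1/i8 share the i8 opcode and
// the 16-bit float types reuse the i16 opcode (an assumption, not a quote of
// the actual body):
//   switch (VT) {
//   case MVT::i1:
//   case MVT::i8:   return Opcode_i8;
//   case MVT::i16:
//   case MVT::f16:
//   case MVT::bf16: return Opcode_i16;
//   case MVT::i32:  return Opcode_i32;
//   case MVT::i64:  return Opcode_i64;
//   case MVT::f32:  return Opcode_f32;
//   case MVT::f64:  return Opcode_f64;
//   default:        return std::nullopt;
//   }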
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  assert(LD->readMem() && "Expected load");

  EVT LoadedVT = LD->getMemoryVT();
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
         "Unexpected vector type");

  std::optional<unsigned> Opcode;
       getI32Imm(CodeAddrSpace, DL),
       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
       getI32Imm(FromTypeWidth, DL)});

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    Ops.append({Addr, Chain});
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (PointerSize == 64)
                               NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                               NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (PointerSize == 64)
                               NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                               NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    Ops.append({N1, Chain});
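// Editorial note (not in the original file): the _avar/_asi/_ari/_areg
// opcode suffixes chosen above correspond to the four NVPTX load/store
// addressing forms: a direct symbol address, a symbol plus immediate offset,
// a register plus immediate offset, and a plain register. This reading of the
// suffixes is an inference from the opcode names and the surrounding
// SelectDirectAddr / SelectADDR* checks, not a quote from the backend docs.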
  return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
    return tryLDGLDU(N);

  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);

  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();

  switch (N->getOpcode()) {

  EVT EltVT = N->getValueType(0);

  std::optional<unsigned> Opcode;
       getI32Imm(CodeAddrSpace, DL),
       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
       getI32Imm(FromTypeWidth, DL)});

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
          NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
          NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
          NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
    Ops.append({Addr, Chain});
  } else if (PointerSize == 64
    switch (N->getOpcode()) {
          NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
          NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
          NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
  } else if (PointerSize == 64
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
      switch (N->getOpcode()) {
            NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
            NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
            NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
      switch (N->getOpcode()) {
            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
            NVPTX::LDV_f64_v2_areg);
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);

  Ops.append({Op1, Chain});
bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
  auto *Mem = cast<MemSDNode>(N);

  EVT OrigType = N->getValueType(0);
  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;

  if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
      (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
      (EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
      (EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
           "NumElts must be divisible by the number of elts in subvectors");

  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  for (unsigned i = 0; i != NumElts; ++i) {

  std::optional<unsigned> Opcode;

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
          NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
          NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
          NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
          NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
          NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
          NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
          NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar, NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar, NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar, NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
          NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar, NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar, NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar, NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
    switch (N->getOpcode()) {
          NVPTX::INT_PTX_LDG_GLOBAL_i8ari64, NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
          NVPTX::INT_PTX_LDG_GLOBAL_i32ari64, NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
          NVPTX::INT_PTX_LDG_GLOBAL_f32ari64, NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
          NVPTX::INT_PTX_LDU_GLOBAL_i8ari64, NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
          NVPTX::INT_PTX_LDU_GLOBAL_i32ari64, NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
          NVPTX::INT_PTX_LDU_GLOBAL_f32ari64, NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
          NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
          NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
          NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64, NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
          NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
          NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
          NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64, NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);

    switch (N->getOpcode()) {
          NVPTX::INT_PTX_LDG_GLOBAL_i16ari, NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
          NVPTX::INT_PTX_LDG_GLOBAL_i64ari, NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
          NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
          NVPTX::INT_PTX_LDU_GLOBAL_i16ari, NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
          NVPTX::INT_PTX_LDU_GLOBAL_i64ari, NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
          NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
          NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
          NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
          NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32, NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
          NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
          NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
          NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32, NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
    switch (N->getOpcode()) {
          NVPTX::INT_PTX_LDG_GLOBAL_i8areg64, NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
          NVPTX::INT_PTX_LDG_GLOBAL_i32areg64, NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
          NVPTX::INT_PTX_LDG_GLOBAL_f32areg64, NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
          NVPTX::INT_PTX_LDU_GLOBAL_i8areg64, NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
          NVPTX::INT_PTX_LDU_GLOBAL_i32areg64, NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
          NVPTX::INT_PTX_LDU_GLOBAL_f32areg64, NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
          NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
          NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
          NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64, NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
          NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
          NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
          NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64, NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);

    switch (N->getOpcode()) {
          NVPTX::INT_PTX_LDG_GLOBAL_i8areg, NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
          NVPTX::INT_PTX_LDG_GLOBAL_i32areg, NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
          NVPTX::INT_PTX_LDG_GLOBAL_f32areg, NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
          NVPTX::INT_PTX_LDU_GLOBAL_i8areg, NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
          NVPTX::INT_PTX_LDU_GLOBAL_i32areg, NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
          NVPTX::INT_PTX_LDU_GLOBAL_f32areg, NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
          NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
          NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
          NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32, NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
          NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
          NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
          NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32, NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);

  SDValue Ops[] = {Op1, Chain};

  if (OrigType != EltVT &&
  for (unsigned i = 0; i != NumElts; ++i) {
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  assert(ST->writeMem() && "Expected store");
  assert((PlainStore || AtomicStore) && "Expected store");
  if (PlainStore && PlainStore->isIndexed())

  EVT StoreVT = ST->getMemoryVT();
         "Unexpected vector type");

  std::optional<unsigned> Opcode;
      Value.getNode()->getSimpleValueType(0).SimpleTy;
      {Value, getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
       getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
       getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    Ops.append({Addr, Chain});
  } else if (PointerSize == 64
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
  } else if (PointerSize == 64
    if (PointerSize == 64)
                               NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                               NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  std::optional<unsigned> Opcode;

  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);

  switch (N->getOpcode()) {
    Ops.append({N->getOperand(1), N->getOperand(2)});
    N2 = N->getOperand(3);
    Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3),
    N2 = N->getOperand(5);

  Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
              getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
              getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
          NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
          NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
          NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
          NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
          NVPTX::STV_i32_v4_avar, std::nullopt,
          NVPTX::STV_f32_v4_avar, std::nullopt);
    switch (N->getOpcode()) {
          NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
          NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
          NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
          std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
            NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
            NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
            NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
            NVPTX::STV_f32_v4_ari_64, std::nullopt);
      switch (N->getOpcode()) {
            NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
            NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
            NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
            NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
            NVPTX::STV_i32_v4_ari, std::nullopt,
            NVPTX::STV_f32_v4_ari, std::nullopt);
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
            NVPTX::STV_f32_v4_areg_64, std::nullopt);
      switch (N->getOpcode()) {
            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
            NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
            NVPTX::STV_f64_v2_areg);
            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
            std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  switch (Node->getOpcode()) {

  EVT EltVT = Node->getValueType(0);

  std::optional<unsigned> Opcode;
        NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
        NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
        NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
        NVPTX::LoadParamMemV2F64);
        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
        std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
  } else if (VecSize == 2) {
    EVT EVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue};

  unsigned OffsetVal = Offset->getAsZExtVal();

bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
  unsigned OffsetVal = Offset->getAsZExtVal();

  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  for (unsigned i = 0; i < NumElts; ++i)

  std::optional<unsigned> Opcode = 0;
        NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
        NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
        NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
    if (Opcode == NVPTX::StoreRetvalI8) {
      switch (Ops[0].getSimpleValueType().SimpleTy) {
        Opcode = NVPTX::StoreRetvalI8TruncI32;
        Opcode = NVPTX::StoreRetvalI8TruncI64;
        NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
        NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
        NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
        NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
        NVPTX::StoreRetvalV4I32, std::nullopt,
        NVPTX::StoreRetvalV4F32, std::nullopt);
#define getOpcV2H(ty, opKind0, opKind1)                                        \
  NVPTX::StoreParamV2##ty##_##opKind0##opKind1

#define getOpcV2H1(ty, opKind0, isImm1)                                        \
  (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)

#define getOpcodeForVectorStParamV2(ty, isimm)                                 \
  (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])

#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3)                      \
  NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3

#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3)                      \
  (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i)                       \
           : getOpcV4H(ty, opKind0, opKind1, opKind2, r)

#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3)                       \
  (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3)                       \
           : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)

#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3)                        \
  (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3)                        \
           : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)

#define getOpcodeForVectorStParamV4(ty, isimm)                                 \
  (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3])                 \
             : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])

#define getOpcodeForVectorStParam(n, ty, isimm)                                \
  (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm)                            \
           : getOpcodeForVectorStParamV4(ty, isimm)
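// Editorial worked example (not in the original file): the macro cascade
// above pastes together a StoreParam opcode name from the element type and a
// per-operand immediate/register mask ("i" vs. "r"). For instance, with
// ty = I32 and isimm = {true, false}:
//   getOpcodeForVectorStParamV2(I32, isimm)
//     -> getOpcV2H1(I32, i, isimm[1])
//     -> getOpcV2H(I32, i, r)
//     -> NVPTX::StoreParamV2I32_ir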
  for (unsigned i = 0; i < NumElts; i++) {
    IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));

  if (MemTy == MVT::f32 || MemTy == MVT::f64) {
    assert(NumElts == 2 && "MVT too large for NumElts > 2");
    assert(NumElts == 2 && "MVT too large for NumElts > 2");
    return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
                          : NVPTX::StoreParamV4I8_rrrr;
    return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
                          : NVPTX::StoreParamV4I16_rrrr;
    return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
                          : NVPTX::StoreParamV4I32_rrrr;
bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
  unsigned ParamVal = Param->getAsZExtVal();
  unsigned OffsetVal = Offset->getAsZExtVal();
  SDValue Glue = N->getOperand(N->getNumOperands() - 1);

  switch (N->getOpcode()) {
  for (unsigned i = 0; i < NumElts; ++i)

  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
    if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
        (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
      if (MemTy == MVT::f32 || MemTy == MVT::f64) {
            NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
            NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
            NVPTX::StoreParamF64_i);
          NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
          NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
          NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
      if (Opcode == NVPTX::StoreParamI8_r) {
        switch (Ops[0].getSimpleValueType().SimpleTy) {
          Opcode = NVPTX::StoreParamI8TruncI32_r;
          Opcode = NVPTX::StoreParamI8TruncI64_r;
      Opcode = NVPTX::StoreParamI32_r;
                                 MVT::i32, Ops[0], CvtNone);
      Opcode = NVPTX::StoreParamI32_r;
                                 MVT::i32, Ops[0], CvtNone);
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  bool IsSigned = false;

  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
    Val = LHS.getNode()->getOperand(0);
    Start = LHS.getNode()->getOperand(1);
    int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
    if (NumBits > GoodBits) {
    if (isa<ConstantSDNode>(AndLHS)) {
    NumBits = NumZeros + NumOnes - ShiftAmt;
    if (ShiftAmt < NumZeros) {
    Val = LHS->getOperand(0);
    if (OuterShiftAmt < InnerShiftAmt) {
      Opc = NVPTX::BFE_S32rii;
      Opc = NVPTX::BFE_U32rii;
      Opc = NVPTX::BFE_S64rii;
      Opc = NVPTX::BFE_U64rii;

bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
    auto API = APF.bitcastToAPInt();
    API = API.concat(API);

  switch (N->getOpcode()) {
    Operands = {N0, GetConstant(1.0), N1};
    Operands = {N1, GetConstant(-1.0), N0};
    Operands = {N0, N1, GetConstant(-0.0)};

  int Opcode = IsVec ? NVPTX::BFMA16x2rrr : NVPTX::BFMA16rrr;
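// Editorial note (not in the original file): the three operand patterns above
// rewrite bf16 arithmetic that lacks a native instruction into an FMA:
// an add of x and y becomes fma(x, 1.0, y), a subtract becomes
// fma(y, -1.0, x), and a multiply becomes fma(x, y, -0.0), with the constant
// splatted via API.concat(API) for the two-element (BFMA16x2rrr) form.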
  return V.getOpcode() == ISD::ADD ||
         (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());

  FindRootAddressAndTotalOffset =
          uint64_t AccumulatedOffset) -> std::optional<uint64_t> {
        AccumulatedOffset += CN->getZExtValue();
        if (SelectDirectAddr(PossibleBaseAddr, Base))
          return AccumulatedOffset;
        return FindRootAddressAndTotalOffset(PossibleBaseAddr,
    return std::nullopt;
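// Editorial worked example (not in the original file): this recursive helper
// walks nested add-like nodes, accumulating constant offsets until it reaches
// a directly addressable base. For a DAG shaped like
//   (add (add @sym, 4), 8)
// it would report a total offset of 12 with @sym as the base, and it yields
// std::nullopt when no direct address is found on the way down.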
  if (auto AccumulatedOffset = FindRootAddressAndTotalOffset(Addr, 0)) {

  if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
          dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
    if (!CN->getAPIntValue().isSignedIntN(32))
                                         SDLoc(OpNode), MVT::i32);
bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = nullptr;
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
    Src = mN->getMemOperand()->getValue();
  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);

    std::vector<SDValue> &OutOps) {
  switch (ConstraintID) {
    if (SelectDirectAddr(Op, Op0)) {
      OutOps.push_back(Op0);
    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
      OutOps.push_back(Op0);
      OutOps.push_back(Op1);
void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
  NewOps[0] = N->getOperand(0);
  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4);

void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
      NVPTX::I128toV2I64, DL,

unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
      return NVPTX::CVT_f32_f16;
      return NVPTX::CVT_f64_f16;
bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
  unsigned int FenceOp =
      Scopes[N->getConstantOperandVal(2)], Subtarget);

         "NVPTXScopes::operator[]");
  auto S = Scopes.find(ID);
  if (S == Scopes.end()) {
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32)     \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH))           \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch,       \
                                            is_s32)                            \
  (is_reduce                                                                   \
       ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
       : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch,          \
                                               is_s32)))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)   \
    if (is_mc && is_ch)                                                        \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH);      \
    if (is_ch)                                                                 \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH);         \
    if (is_mc)                                                                 \
      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC);         \
    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, );              \

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch)             \
  (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH            \
         : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
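// Editorial worked example (not in the original file): these token-pasting
// macros assemble a TMA opcode name from direction, dimension, mode and the
// optional _MC/_CH suffixes. Assuming the dimension and mode tokens are
// spelled 2D and TILE at the call sites,
//   GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, /*is_ch=*/true)
// evaluates to NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_2D_TILE_CH.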
                                              bool IsCacheHint, bool IsIm2Col,
                                              bool IsReduce = false) {
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                       "GetCpAsyncBulkTensorS2GOpcode.");
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");

                                              bool IsCacheHint, bool IsIm2Col) {
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                       "GetCpAsyncBulkTensorG2SOpcode.");
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
                                                  IsCacheHint, IsShared32);
          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");

                       "GetCpAsyncBulkTensorPrefetchOpcode.");
                       "GetCpAsyncBulkTensorPrefetchOpcode.");

  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
  size_t NumOps = N->getNumOperands();
  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
  size_t NumBaseArgs = NumDims + NumOffsets + 3;
  size_t MultiCastIdx = NumBaseArgs + 2;
    Ops.push_back(N->getOperand(MultiCastIdx + 1));
      NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);

void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);

void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
  size_t NumOps = N->getNumOperands();
  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);

void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2);
      NumDims, IsShared32, IsCacheHint, IsIm2Col, true);

void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) {
  size_t NumOps = N->getNumOperands();
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = IsCacheHint ? 4 : 3;
    Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
                        : NVPTX::CP_ASYNC_BULK_S2G_CH;
    Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
                        : NVPTX::CP_ASYNC_BULK_S2G;
void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
  size_t NumOps = N->getNumOperands();
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
  size_t NumBaseArgs = 4;
  size_t MultiCastIdx = NumBaseArgs + 2;
    Ops.push_back(N->getOperand(MultiCastIdx + 1));

  unsigned Opcode = [&]() {
    if (IsMultiCast && IsCacheHint)
      return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC_CH
                        : NVPTX::CP_ASYNC_BULK_G2S_MC_CH;
      return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC
                        : NVPTX::CP_ASYNC_BULK_G2S_MC;
      return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_CH
                        : NVPTX::CP_ASYNC_BULK_G2S_CH;
    return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32
                      : NVPTX::CP_ASYNC_BULK_G2S;
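// Editorial note (not in the original file): the opcode chosen above encodes
// the optional features in its suffix: _MC when a multicast cluster mask
// operand is present, _CH when an L2 cache-hint operand is present, _MC_CH
// when both are, with the SHARED32 forms used when shared-memory pointers are
// 32 bits wide.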
void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) {
  size_t NumOps = N->getNumOperands();
  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
  size_t NumArgs = IsCacheHint ? 3 : 2;
  unsigned Opcode = IsCacheHint ? NVPTX::CP_ASYNC_BULK_PREFETCH_CH
                                : NVPTX::CP_ASYNC_BULK_PREFETCH;
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
    SelectCpAsyncBulkG2S(N);
  case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
    SelectCpAsyncBulkS2G(N);
  case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
    SelectCpAsyncBulkPrefetchL2(N);
  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
    SelectCpAsyncBulkTensorS2GCommon(N);
  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
    SelectCpAsyncBulkTensorS2GCommon(N, true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
    SelectCpAsyncBulkTensorG2SCommon(N);
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
    SelectCpAsyncBulkTensorG2SCommon(N, true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
    SelectCpAsyncBulkTensorPrefetchCommon(N);
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
    SelectCpAsyncBulkTensorPrefetchCommon(N, true);
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::ADD),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MIN),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::MAX),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::INC),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::DEC),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::AND),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::OR),
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR));
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, CastTy(TMARedTy::XOR),