42#include "llvm/IR/IntrinsicsAMDGPU.h"
43#include "llvm/IR/IntrinsicsR600.h"
54#define DEBUG_TYPE "si-lower"
60 cl::desc(
"Do not align and prefetch loops"),
64 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
65 cl::desc(
"Use indirect register addressing for divergent indexes"),
79 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
80 for (
unsigned Reg = 0;
Reg < NumSGPRs; ++
Reg) {
82 return AMDGPU::SGPR0 +
Reg;
98 TRI->getDefaultVectorSuperClassForBitWidth(32);
104 TRI->getDefaultVectorSuperClassForBitWidth(64);
142 TRI->getDefaultVectorSuperClassForBitWidth(320));
146 TRI->getDefaultVectorSuperClassForBitWidth(352));
150 TRI->getDefaultVectorSuperClassForBitWidth(384));
154 TRI->getDefaultVectorSuperClassForBitWidth(512));
161 TRI->getDefaultVectorSuperClassForBitWidth(1024));
163 if (Subtarget->has16BitInsts()) {
164 if (Subtarget->useRealTrue16Insts()) {
194 TRI->getDefaultVectorSuperClassForBitWidth(1024));
207 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
208 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
209 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
210 MVT::i1, MVT::v32i32},
214 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
215 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
216 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
217 MVT::i1, MVT::v32i32},
224 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
225 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
226 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
227 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
228 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
286 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
293 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
294 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
295 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
298 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
299 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
300 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
305 MVT::v3i16, MVT::v4i16, MVT::Other},
310 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
326 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
327 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
328 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
329 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
330 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
331 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
332 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
333 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
365 for (
MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
379 for (
MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
393 for (
MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
407 for (
MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
421 for (
MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
436 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
437 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
440 if (Subtarget->hasPkMovB32()) {
461 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
462 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
467 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
471 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
472 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
473 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
474 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
498 if (Subtarget->hasSMemRealTime() ||
503 if (Subtarget->has16BitInsts()) {
510 if (Subtarget->hasMadMacF32Insts())
513 if (!Subtarget->hasBFI())
517 if (!Subtarget->hasBCNT(32))
520 if (!Subtarget->hasBCNT(64))
523 if (Subtarget->hasFFBH())
526 if (Subtarget->hasFFBL())
537 if (Subtarget->hasBFE())
541 if (Subtarget->hasIntClamp())
544 if (Subtarget->hasAddNoCarry())
549 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
550 {MVT::f32, MVT::f64},
Custom);
556 {MVT::f32, MVT::f64},
Legal);
558 if (Subtarget->haveRoundOpsF64())
581 if (Subtarget->has16BitInsts()) {
630 ISD::FSIN, ISD::FROUND},
634 if (Subtarget->hasBF16TransInsts())
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
788 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
789 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
790 MVT::v32f16, MVT::v32bf16},
794 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
800 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
804 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
808 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
809 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
817 if (Subtarget->hasVOP3PInsts()) {
828 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
831 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
832 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
833 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
836 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
844 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
850 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
851 {MVT::v2f16, MVT::v4f16},
Custom);
857 if (Subtarget->hasBF16PackedInsts()) {
858 for (
MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
864 if (Subtarget->hasPackedFP32Ops()) {
868 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
875 if (Subtarget->has16BitInsts()) {
888 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
889 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
890 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
891 MVT::v32f16, MVT::v32bf16},
896 if (Subtarget->hasVectorMulU64())
898 else if (Subtarget->hasScalarSMulU64())
901 if (Subtarget->hasMad64_32())
904 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
907 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
909 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
912 if (Subtarget->hasMinimum3Maximum3F32())
915 if (Subtarget->hasMinimum3Maximum3PKF16()) {
919 if (!Subtarget->hasMinimum3Maximum3F16())
924 if (Subtarget->hasVOP3PInsts()) {
927 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
931 if (Subtarget->hasIntMinMax64())
936 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
937 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
942 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
943 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
944 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
945 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
949 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
950 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
951 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
952 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
967 if (Subtarget->hasBF16ConversionInsts()) {
972 if (Subtarget->hasBF16PackedInsts()) {
978 if (Subtarget->hasBF16TransInsts()) {
982 if (Subtarget->hasCvtPkF16F32Inst()) {
984 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1034 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1043 ISD::ATOMIC_CMP_SWAP,
1044 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1046 ISD::ATOMIC_LOAD_ADD,
1047 ISD::ATOMIC_LOAD_SUB,
1048 ISD::ATOMIC_LOAD_AND,
1049 ISD::ATOMIC_LOAD_OR,
1050 ISD::ATOMIC_LOAD_XOR,
1051 ISD::ATOMIC_LOAD_NAND,
1052 ISD::ATOMIC_LOAD_MIN,
1053 ISD::ATOMIC_LOAD_MAX,
1054 ISD::ATOMIC_LOAD_UMIN,
1055 ISD::ATOMIC_LOAD_UMAX,
1056 ISD::ATOMIC_LOAD_FADD,
1057 ISD::ATOMIC_LOAD_FMIN,
1058 ISD::ATOMIC_LOAD_FMAX,
1059 ISD::ATOMIC_LOAD_UINC_WRAP,
1060 ISD::ATOMIC_LOAD_UDEC_WRAP,
1073 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1086 EVT DestVT,
EVT SrcVT)
const {
1088 ((((Opcode ==
ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1091 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1098 LLT DestTy,
LLT SrcTy)
const {
1099 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1100 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1102 SrcTy.getScalarSizeInBits() == 16 &&
1123 if (Subtarget->has16BitInsts()) {
1126 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1128 return VT.
isInteger() ? MVT::i32 : MVT::f32;
1132 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1154 if (
Size == 16 && Subtarget->has16BitInsts())
1155 return (NumElts + 1) / 2;
1161 return NumElts * ((
Size + 31) / 32);
1170 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1178 if (
Size == 16 && Subtarget->has16BitInsts()) {
1179 if (ScalarVT == MVT::bf16) {
1180 RegisterVT = MVT::i32;
1181 IntermediateVT = MVT::v2bf16;
1183 RegisterVT = VT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
1184 IntermediateVT = RegisterVT;
1186 NumIntermediates = (NumElts + 1) / 2;
1187 return NumIntermediates;
1192 IntermediateVT = RegisterVT;
1193 NumIntermediates = NumElts;
1194 return NumIntermediates;
1199 RegisterVT = MVT::i16;
1200 IntermediateVT = ScalarVT;
1201 NumIntermediates = NumElts;
1202 return NumIntermediates;
1206 RegisterVT = MVT::i32;
1207 IntermediateVT = ScalarVT;
1208 NumIntermediates = NumElts;
1209 return NumIntermediates;
1213 RegisterVT = MVT::i32;
1214 IntermediateVT = RegisterVT;
1215 NumIntermediates = NumElts * ((
Size + 31) / 32);
1216 return NumIntermediates;
1221 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1226 unsigned MaxNumLanes) {
1227 assert(MaxNumLanes != 0);
1231 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1242 unsigned MaxNumLanes) {
1248 assert(ST->getNumContainedTypes() == 2 &&
1249 ST->getContainedType(1)->isIntegerTy(32));
1263 return MVT::amdgpuBufferFatPointer;
1265 DL.getPointerSizeInBits(AS) == 192)
1266 return MVT::amdgpuBufferStridedPointer;
1275 DL.getPointerSizeInBits(AS) == 160) ||
1277 DL.getPointerSizeInBits(AS) == 192))
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1288 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1289 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1290 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1291 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1292 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1294 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1295 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1296 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1297 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1298 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1300 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1301 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1302 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1303 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1304 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1343 unsigned IntrID)
const {
1345 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1363 if (RsrcIntr->IsImage) {
1378 Info.ptrVal = RsrcArg;
1381 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1390 if (RsrcIntr->IsImage) {
1391 unsigned MaxNumLanes = 4;
1406 std::numeric_limits<unsigned>::max());
1416 if (RsrcIntr->IsImage) {
1437 if ((RsrcIntr->IsImage && BaseOpcode->
NoReturn) || IsSPrefetch) {
1439 Info.memVT = MVT::i32;
1446 case Intrinsic::amdgcn_raw_buffer_load_lds:
1447 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1448 case Intrinsic::amdgcn_struct_buffer_load_lds:
1449 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1455 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1456 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1457 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1458 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1461 std::numeric_limits<unsigned>::max());
1471 case Intrinsic::amdgcn_ds_ordered_add:
1472 case Intrinsic::amdgcn_ds_ordered_swap: {
1485 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1486 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1489 Info.ptrVal =
nullptr;
1494 case Intrinsic::amdgcn_ds_append:
1495 case Intrinsic::amdgcn_ds_consume: {
1508 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1509 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1510 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1515 Info.memVT = MVT::i64;
1521 case Intrinsic::amdgcn_global_atomic_csub: {
1530 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1531 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1532 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1535 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1538 ->getElementType(0));
1546 case Intrinsic::amdgcn_global_atomic_fmin_num:
1547 case Intrinsic::amdgcn_global_atomic_fmax_num:
1548 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1549 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1550 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1551 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1561 case Intrinsic::amdgcn_flat_load_monitor_b32:
1562 case Intrinsic::amdgcn_flat_load_monitor_b64:
1563 case Intrinsic::amdgcn_flat_load_monitor_b128:
1564 case Intrinsic::amdgcn_global_load_monitor_b32:
1565 case Intrinsic::amdgcn_global_load_monitor_b64:
1566 case Intrinsic::amdgcn_global_load_monitor_b128:
1567 case Intrinsic::amdgcn_cluster_load_b32:
1568 case Intrinsic::amdgcn_cluster_load_b64:
1569 case Intrinsic::amdgcn_cluster_load_b128:
1570 case Intrinsic::amdgcn_ds_load_tr6_b96:
1571 case Intrinsic::amdgcn_ds_load_tr4_b64:
1572 case Intrinsic::amdgcn_ds_load_tr8_b64:
1573 case Intrinsic::amdgcn_ds_load_tr16_b128:
1574 case Intrinsic::amdgcn_global_load_tr6_b96:
1575 case Intrinsic::amdgcn_global_load_tr4_b64:
1576 case Intrinsic::amdgcn_global_load_tr_b64:
1577 case Intrinsic::amdgcn_global_load_tr_b128:
1578 case Intrinsic::amdgcn_ds_read_tr4_b64:
1579 case Intrinsic::amdgcn_ds_read_tr6_b96:
1580 case Intrinsic::amdgcn_ds_read_tr8_b64:
1581 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1589 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1590 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1591 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1599 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1600 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1601 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1609 case Intrinsic::amdgcn_ds_gws_init:
1610 case Intrinsic::amdgcn_ds_gws_barrier:
1611 case Intrinsic::amdgcn_ds_gws_sema_v:
1612 case Intrinsic::amdgcn_ds_gws_sema_br:
1613 case Intrinsic::amdgcn_ds_gws_sema_p:
1614 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1624 Info.memVT = MVT::i32;
1626 Info.align =
Align(4);
1628 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1634 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1635 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1636 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1637 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1638 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1639 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1640 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1641 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1648 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1649 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1650 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1651 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1658 case Intrinsic::amdgcn_load_to_lds:
1659 case Intrinsic::amdgcn_global_load_lds: {
1670 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1671 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1672 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1673 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1683 Info.memVT = MVT::i32;
1685 Info.align =
Align(4);
1690 case Intrinsic::amdgcn_s_prefetch_data:
1691 case Intrinsic::amdgcn_flat_prefetch:
1692 case Intrinsic::amdgcn_global_prefetch: {
1707 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1710 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1711 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1723 Type *&AccessTy)
const {
1724 Value *Ptr =
nullptr;
1725 switch (
II->getIntrinsicID()) {
1726 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1727 case Intrinsic::amdgcn_cluster_load_b128:
1728 case Intrinsic::amdgcn_cluster_load_b64:
1729 case Intrinsic::amdgcn_cluster_load_b32:
1730 case Intrinsic::amdgcn_ds_append:
1731 case Intrinsic::amdgcn_ds_consume:
1732 case Intrinsic::amdgcn_ds_load_tr8_b64:
1733 case Intrinsic::amdgcn_ds_load_tr16_b128:
1734 case Intrinsic::amdgcn_ds_load_tr4_b64:
1735 case Intrinsic::amdgcn_ds_load_tr6_b96:
1736 case Intrinsic::amdgcn_ds_read_tr4_b64:
1737 case Intrinsic::amdgcn_ds_read_tr6_b96:
1738 case Intrinsic::amdgcn_ds_read_tr8_b64:
1739 case Intrinsic::amdgcn_ds_read_tr16_b64:
1740 case Intrinsic::amdgcn_ds_ordered_add:
1741 case Intrinsic::amdgcn_ds_ordered_swap:
1742 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1743 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1744 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1745 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1746 case Intrinsic::amdgcn_flat_load_monitor_b128:
1747 case Intrinsic::amdgcn_flat_load_monitor_b32:
1748 case Intrinsic::amdgcn_flat_load_monitor_b64:
1749 case Intrinsic::amdgcn_global_atomic_csub:
1750 case Intrinsic::amdgcn_global_atomic_fmax_num:
1751 case Intrinsic::amdgcn_global_atomic_fmin_num:
1752 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1753 case Intrinsic::amdgcn_global_load_monitor_b128:
1754 case Intrinsic::amdgcn_global_load_monitor_b32:
1755 case Intrinsic::amdgcn_global_load_monitor_b64:
1756 case Intrinsic::amdgcn_global_load_tr_b64:
1757 case Intrinsic::amdgcn_global_load_tr_b128:
1758 case Intrinsic::amdgcn_global_load_tr4_b64:
1759 case Intrinsic::amdgcn_global_load_tr6_b96:
1760 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1761 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1762 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1763 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1764 Ptr =
II->getArgOperand(0);
1766 case Intrinsic::amdgcn_load_to_lds:
1767 case Intrinsic::amdgcn_global_load_lds:
1768 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1769 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1770 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1771 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1772 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1773 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1774 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1775 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1776 Ptr =
II->getArgOperand(1);
1781 AccessTy =
II->getType();
1787 unsigned AddrSpace)
const {
1788 if (!Subtarget->hasFlatInstOffsets()) {
1799 return AM.
Scale == 0 &&
1800 (AM.
BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1801 AM.
BaseOffs, AddrSpace, FlatVariant));
1805 if (Subtarget->hasFlatGlobalInsts())
1808 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1821 return isLegalMUBUFAddressingMode(AM);
1824bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1835 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1847 if (AM.HasBaseReg) {
1879 return isLegalMUBUFAddressingMode(AM);
1881 if (!Subtarget->hasScalarSubwordLoads()) {
1886 if (Ty->isSized() &&
DL.getTypeStoreSize(Ty) < 4)
1934 return Subtarget->enableFlatScratch()
1936 : isLegalMUBUFAddressingMode(AM);
1983 unsigned Size,
unsigned AddrSpace,
Align Alignment,
1992 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment <
Align(4))
1995 Align RequiredAlignment(
1997 if (Subtarget->hasLDSMisalignedBug() &&
Size > 32 &&
1998 Alignment < RequiredAlignment)
2013 if (!Subtarget->hasUsableDSOffset() && Alignment <
Align(8))
2019 RequiredAlignment =
Align(4);
2021 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2037 *IsFast = (Alignment >= RequiredAlignment) ? 64
2038 : (Alignment <
Align(4)) ? 32
2045 if (!Subtarget->hasDS96AndDS128())
2051 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2060 *IsFast = (Alignment >= RequiredAlignment) ? 96
2061 : (Alignment <
Align(4)) ? 32
2068 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2074 RequiredAlignment =
Align(8);
2076 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2085 *IsFast = (Alignment >= RequiredAlignment) ? 128
2086 : (Alignment <
Align(4)) ? 32
2103 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
2105 return Alignment >= RequiredAlignment ||
2106 Subtarget->hasUnalignedDSAccessEnabled();
2114 bool AlignedBy4 = Alignment >=
Align(4);
2115 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2117 *IsFast = AlignedBy4 ?
Size : 1;
2122 *IsFast = AlignedBy4;
2133 return Alignment >=
Align(4) ||
2134 Subtarget->hasUnalignedBufferAccessEnabled();
2146 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2161 return Size >= 32 && Alignment >=
Align(4);
2166 unsigned *IsFast)
const {
2168 Alignment, Flags, IsFast);
2173 const AttributeList &FuncAttributes)
const {
2179 if (
Op.size() >= 16 &&
2183 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
2201 unsigned DestAS)
const {
2204 Subtarget->hasGloballyAddressableScratch()) {
2234 unsigned Index)
const {
2250 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2277 auto [InputPtrReg, RC, ArgTy] =
2287 Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2293 const SDLoc &SL)
const {
2300 const SDLoc &SL)
const {
2303 std::optional<uint32_t> KnownSize =
2305 if (KnownSize.has_value())
2331 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2340SDValue SITargetLowering::lowerKernargMemParameter(
2352 int64_t OffsetDiff =
Offset - AlignDownOffset;
2358 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2367 ArgVal = DAG.
getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2368 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2373 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain,
Offset);
2378 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2387 const SDLoc &SL)
const {
2397 return DAG.
getNode(ISD::BITCAST, SL, ValVT, Val);
2456 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2459 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2460 if (ConvertedVal == ArgValue)
2461 return ConvertedVal;
2466SDValue SITargetLowering::lowerWorkGroupId(
2471 if (!Subtarget->hasClusters())
2472 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2480 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2481 SDLoc SL(ClusterIdXYZ);
2482 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2485 SDValue ClusterWorkGroupIdXYZ =
2486 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2496 return ClusterIdXYZ;
2498 using namespace AMDGPU::Hwreg;
2502 DAG.
getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2513SDValue SITargetLowering::getPreloadedValue(
2516 const ArgDescriptor *
Reg =
nullptr;
2517 const TargetRegisterClass *RC;
2521 const ArgDescriptor WorkGroupIDX =
2529 const ArgDescriptor WorkGroupIDZ =
2531 const ArgDescriptor ClusterWorkGroupIDX =
2533 const ArgDescriptor ClusterWorkGroupIDY =
2535 const ArgDescriptor ClusterWorkGroupIDZ =
2537 const ArgDescriptor ClusterWorkGroupMaxIDX =
2539 const ArgDescriptor ClusterWorkGroupMaxIDY =
2541 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2543 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2546 auto LoadConstant = [&](
unsigned N) {
2550 if (Subtarget->hasArchitectedSGPRs() &&
2557 Reg = &WorkGroupIDX;
2558 RC = &AMDGPU::SReg_32RegClass;
2562 Reg = &WorkGroupIDY;
2563 RC = &AMDGPU::SReg_32RegClass;
2567 Reg = &WorkGroupIDZ;
2568 RC = &AMDGPU::SReg_32RegClass;
2572 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
2573 return LoadConstant(0);
2574 Reg = &ClusterWorkGroupIDX;
2575 RC = &AMDGPU::SReg_32RegClass;
2579 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
2580 return LoadConstant(0);
2581 Reg = &ClusterWorkGroupIDY;
2582 RC = &AMDGPU::SReg_32RegClass;
2586 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
2587 return LoadConstant(0);
2588 Reg = &ClusterWorkGroupIDZ;
2589 RC = &AMDGPU::SReg_32RegClass;
2594 return LoadConstant(ClusterDims.
getDims()[0] - 1);
2595 Reg = &ClusterWorkGroupMaxIDX;
2596 RC = &AMDGPU::SReg_32RegClass;
2601 return LoadConstant(ClusterDims.
getDims()[1] - 1);
2602 Reg = &ClusterWorkGroupMaxIDY;
2603 RC = &AMDGPU::SReg_32RegClass;
2608 return LoadConstant(ClusterDims.
getDims()[2] - 1);
2609 Reg = &ClusterWorkGroupMaxIDZ;
2610 RC = &AMDGPU::SReg_32RegClass;
2614 Reg = &ClusterWorkGroupMaxFlatID;
2615 RC = &AMDGPU::SReg_32RegClass;
2646 for (
unsigned I = 0,
E = Ins.size(), PSInputNum = 0;
I !=
E; ++
I) {
2650 "vector type argument should have been split");
2655 bool SkipArg = !Arg->
Used && !
Info->isPSInputAllocated(PSInputNum);
2663 "unexpected vector split in ps argument type");
2677 Info->markPSInputAllocated(PSInputNum);
2679 Info->markPSInputEnabled(PSInputNum);
2695 if (Info.hasWorkItemIDX()) {
2701 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2705 if (Info.hasWorkItemIDY()) {
2706 assert(Info.hasWorkItemIDX());
2707 if (Subtarget->hasPackedTID()) {
2708 Info.setWorkItemIDY(
2711 unsigned Reg = AMDGPU::VGPR1;
2719 if (Info.hasWorkItemIDZ()) {
2720 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2721 if (Subtarget->hasPackedTID()) {
2722 Info.setWorkItemIDZ(
2725 unsigned Reg = AMDGPU::VGPR2;
2745 if (RegIdx == ArgVGPRs.
size()) {
2752 unsigned Reg = ArgVGPRs[RegIdx];
2764 unsigned NumArgRegs) {
2767 if (RegIdx == ArgSGPRs.
size())
2770 unsigned Reg = ArgSGPRs[RegIdx];
2812 const unsigned Mask = 0x3ff;
2815 if (Info.hasWorkItemIDX()) {
2817 Info.setWorkItemIDX(Arg);
2820 if (Info.hasWorkItemIDY()) {
2822 Info.setWorkItemIDY(Arg);
2825 if (Info.hasWorkItemIDZ())
2837 const unsigned Mask = 0x3ff;
2846 auto &
ArgInfo = Info.getArgInfo();
2858 if (Info.hasImplicitArgPtr())
2866 if (Info.hasWorkGroupIDX())
2869 if (Info.hasWorkGroupIDY())
2872 if (Info.hasWorkGroupIDZ())
2875 if (Info.hasLDSKernelId())
2886 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(
TRI);
2887 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2893 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(
TRI);
2894 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2899 Register DispatchPtrReg = Info.addDispatchPtr(
TRI);
2900 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2906 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2912 Register InputPtrReg = Info.addKernargSegmentPtr(
TRI);
2921 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2926 Register FlatScratchInitReg = Info.addFlatScratchInit(
TRI);
2927 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2932 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(
TRI);
2933 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2948 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2950 bool InPreloadSequence =
true;
2952 bool AlignedForImplictArgs =
false;
2953 unsigned ImplicitArgOffset = 0;
2954 for (
auto &Arg :
F.args()) {
2955 if (!InPreloadSequence || !Arg.hasInRegAttr())
2958 unsigned ArgIdx = Arg.getArgNo();
2961 if (InIdx < Ins.size() &&
2962 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2965 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2966 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2968 assert(ArgLocs[ArgIdx].isMemLoc());
2969 auto &ArgLoc = ArgLocs[InIdx];
2971 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2973 unsigned NumAllocSGPRs =
2974 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2977 if (Arg.hasAttribute(
"amdgpu-hidden-argument")) {
2978 if (!AlignedForImplictArgs) {
2980 alignTo(LastExplicitArgOffset,
2981 Subtarget->getAlignmentForImplicitArgPtr()) -
2982 LastExplicitArgOffset;
2983 AlignedForImplictArgs =
true;
2985 ArgOffset += ImplicitArgOffset;
2989 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2990 assert(InIdx >= 1 &&
"No previous SGPR");
2991 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2992 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2996 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2997 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
3000 InPreloadSequence =
false;
3006 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3008 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3010 if (PreloadRegs->
size() > 1)
3011 RC = &AMDGPU::SGPR_32RegClass;
3012 for (
auto &Reg : *PreloadRegs) {
3018 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3027 if (Info.hasLDSKernelId()) {
3028 Register Reg = Info.addLDSKernelId();
3029 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3038 bool IsShader)
const {
3039 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3040 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3046 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
3048 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3052 unsigned NumRequiredSystemSGPRs =
3053 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3054 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3055 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3056 Register Reg = Info.addReservedUserSGPR();
3057 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3062 if (!HasArchitectedSGPRs) {
3063 if (Info.hasWorkGroupIDX()) {
3064 Register Reg = Info.addWorkGroupIDX();
3065 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3069 if (Info.hasWorkGroupIDY()) {
3070 Register Reg = Info.addWorkGroupIDY();
3071 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3075 if (Info.hasWorkGroupIDZ()) {
3076 Register Reg = Info.addWorkGroupIDZ();
3077 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3082 if (Info.hasWorkGroupInfo()) {
3083 Register Reg = Info.addWorkGroupInfo();
3084 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3088 if (Info.hasPrivateSegmentWaveByteOffset()) {
3090 unsigned PrivateSegmentWaveByteOffsetReg;
3093 PrivateSegmentWaveByteOffsetReg =
3094 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3098 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3100 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3103 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3105 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3106 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
3109 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3110 Info.getNumPreloadedSGPRs() >= 16);
3125 if (HasStackObjects)
3126 Info.setHasNonSpillStackObjects(
true);
3131 HasStackObjects =
true;
3135 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
3137 if (!ST.enableFlatScratch()) {
3138 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
3145 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3147 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
3157 Info.setScratchRSrcReg(ReservedBufferReg);
3176 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
3177 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3184 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
3185 if (!
MRI.isLiveIn(
Reg)) {
3186 Info.setStackPtrOffsetReg(
Reg);
3191 if (
Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3198 if (ST.getFrameLowering()->hasFP(MF)) {
3199 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3215 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3224 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3225 RC = &AMDGPU::SGPR_64RegClass;
3226 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3227 RC = &AMDGPU::SGPR_32RegClass;
3233 Entry->addLiveIn(*
I);
3238 for (
auto *Exit : Exits)
3240 TII->get(TargetOpcode::COPY), *
I)
3255 bool IsError =
false;
3259 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc()));
3277 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3278 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3280 if (!Subtarget->enableFlatScratch())
3285 !Subtarget->hasArchitectedSGPRs())
3286 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3287 !Info->hasWorkGroupIDZ());
3290 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3308 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3309 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3312 Info->markPSInputAllocated(0);
3313 Info->markPSInputEnabled(0);
3315 if (Subtarget->isAmdPalOS()) {
3324 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3325 if ((PsInputBits & 0x7F) == 0 ||
3326 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3329 }
else if (IsKernel) {
3330 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3332 Splits.
append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3342 if (IsKernel && Subtarget->hasKernargPreload())
3346 }
else if (!IsGraphics) {
3351 if (!Subtarget->enableFlatScratch())
3363 Info->setNumWaveDispatchSGPRs(
3365 Info->setNumWaveDispatchVGPRs(
3367 }
else if (Info->getNumKernargPreloadedSGPRs()) {
3368 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3373 if (IsWholeWaveFunc) {
3375 {MVT::i1, MVT::Other}, Chain);
3387 for (
unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3398 if (IsEntryFunc && VA.
isMemLoc()) {
3421 if (Arg.
isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3425 int64_t OffsetDiff =
Offset - AlignDownOffset;
3432 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3442 ArgVal = DAG.
getNode(ISD::BITCAST,
DL, MemVT, ArgVal);
3443 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
3444 Ins[i].Flags.isSExt(), &Ins[i]);
3452 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3455 if (PreloadRegs.
size() == 1) {
3456 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
3461 TRI->getRegSizeInBits(*RC)));
3469 for (
auto Reg : PreloadRegs) {
3476 PreloadRegs.size()),
3493 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3494 Ins[i].Flags.isSExt(), &Ins[i]);
3506 "hidden argument in kernel signature was not preloaded",
3512 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3513 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3533 if (!IsEntryFunc && VA.
isMemLoc()) {
3534 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3545 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3546 RC = &AMDGPU::VGPR_32RegClass;
3547 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3548 RC = &AMDGPU::SGPR_32RegClass;
3568 Val = convertABITypeToValueType(DAG, Val, VA,
DL);
3584 Info->setBytesInStackArgArea(StackArgSize);
3586 return Chains.
empty() ? Chain
3595 const Type *RetTy)
const {
3603 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3608 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3609 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3610 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3611 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3634 Info->setIfReturnsVoid(Outs.
empty());
3635 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3654 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3655 ++
I, ++RealRVLocIdx) {
3659 SDValue Arg = OutVals[RealRVLocIdx];
3682 ReadFirstLane, Arg);
3689 if (!Info->isEntryFunction()) {
3695 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3697 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3710 unsigned Opc = AMDGPUISD::ENDPGM;
3712 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3713 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3714 : AMDGPUISD::RET_GLUE;
3798 auto &ArgUsageInfo =
3800 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3828 const auto [OutgoingArg, ArgRC, ArgTy] =
3833 const auto [IncomingArg, IncomingArgRC, Ty] =
3835 assert(IncomingArgRC == ArgRC);
3838 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3846 InputReg = getImplicitArgPtr(DAG,
DL);
3848 std::optional<uint32_t> Id =
3850 if (Id.has_value()) {
3861 if (OutgoingArg->isRegister()) {
3862 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3863 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
3866 unsigned SpecialArgOffset =
3877 auto [OutgoingArg, ArgRC, Ty] =
3880 std::tie(OutgoingArg, ArgRC, Ty) =
3883 std::tie(OutgoingArg, ArgRC, Ty) =
3898 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3899 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3900 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3905 if (Subtarget->getMaxWorkitemID(
F, 0) != 0) {
3913 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(
F, 1) != 0) {
3923 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(
F, 2) != 0) {
3932 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3933 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3944 : IncomingArgY ? *IncomingArgY
3951 if (OutgoingArg->isRegister()) {
3953 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3979 if (Callee->isDivergent())
3986 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
3990 if (!CallerPreserved)
3993 bool CCMatch = CallerCC == CalleeCC;
4006 if (Arg.hasByValAttr())
4020 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
4021 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4030 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4043 for (
const auto &[CCVA, ArgVal] :
zip_equal(ArgLocs, OutVals)) {
4045 if (!CCVA.isRegLoc())
4050 if (ArgVal->
isDivergent() &&
TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4052 dbgs() <<
"Cannot tail call due to divergent outgoing argument in "
4076enum ChainCallArgIdx {
4098 bool UsesDynamicVGPRs =
false;
4099 if (IsChainCallConv) {
4104 auto RequestedExecIt =
4106 return Arg.OrigArgIndex == 2;
4108 assert(RequestedExecIt != CLI.
Outs.end() &&
"No node for EXEC");
4110 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.
Outs.begin();
4113 CLI.
Outs.erase(RequestedExecIt, CLI.
Outs.end());
4116 "Haven't popped all the special args");
4119 CLI.
Args[ChainCallArgIdx::Exec];
4120 if (!RequestedExecArg.
Ty->
isIntegerTy(Subtarget->getWavefrontSize()))
4128 ArgNode->getAPIntValue(),
DL, ArgNode->getValueType(0)));
4130 ChainCallSpecialArgs.
push_back(Arg.Node);
4133 PushNodeOrTargetConstant(RequestedExecArg);
4139 if (FlagsValue.
isZero()) {
4140 if (CLI.
Args.size() > ChainCallArgIdx::Flags + 1)
4142 "no additional args allowed if flags == 0");
4144 if (CLI.
Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4148 if (!Subtarget->isWave32()) {
4150 CLI, InVals,
"dynamic VGPR mode is only supported for wave32");
4153 UsesDynamicVGPRs =
true;
4154 std::for_each(CLI.
Args.begin() + ChainCallArgIdx::NumVGPRs,
4155 CLI.
Args.end(), PushNodeOrTargetConstant);
4164 bool IsSibCall =
false;
4178 "unsupported call to variadic function ");
4186 "unsupported required tail call to function ");
4191 Outs, OutVals, Ins, DAG);
4195 "site marked musttail or on llvm.amdgcn.cs.chain");
4202 if (!TailCallOpt && IsTailCall)
4242 auto *
TRI = Subtarget->getRegisterInfo();
4249 if (!IsSibCall || IsChainCallConv) {
4250 if (!Subtarget->enableFlatScratch()) {
4256 RegsToPass.emplace_back(IsChainCallConv
4257 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4258 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4265 const unsigned NumSpecialInputs = RegsToPass.size();
4267 MVT PtrVT = MVT::i32;
4270 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
4298 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
4306 int32_t
Offset = LocMemOffset;
4313 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4319 ? Flags.getNonZeroByValAlign()
4346 if (Outs[i].Flags.isByVal()) {
4348 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
4351 Outs[i].Flags.getNonZeroByValAlign(),
4353 nullptr, std::nullopt, DstInfo,
4359 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
4365 if (!MemOpChains.
empty())
4373 TokenGlue = DAG.
getNode(ISD::CONVERGENCECTRL_GLUE,
DL, MVT::Glue,
4381 unsigned ArgIdx = 0;
4382 for (
auto [Reg, Val] : RegsToPass) {
4383 if (ArgIdx++ >= NumSpecialInputs &&
4384 (IsChainCallConv || !Val->
isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
4410 if (IsTailCall && !IsSibCall) {
4415 std::vector<SDValue>
Ops({Chain});
4421 Ops.push_back(Callee);
4438 Ops.push_back(Callee);
4449 if (IsChainCallConv)
4454 for (
auto &[Reg, Val] : RegsToPass)
4458 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
4459 assert(Mask &&
"Missing call preserved mask for calling convention");
4469 MVT::Glue, GlueOps),
4474 Ops.push_back(InGlue);
4480 unsigned OPC = AMDGPUISD::TC_RETURN;
4483 OPC = AMDGPUISD::TC_RETURN_GFX;
4487 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4488 : AMDGPUISD::TC_RETURN_CHAIN;
4494 if (Info->isWholeWaveFunction())
4495 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4502 Chain =
Call.getValue(0);
4503 InGlue =
Call.getValue(1);
4505 uint64_t CalleePopBytes = NumBytes;
4526 EVT VT =
Op.getValueType();
4540 "Stack grows upwards for AMDGPU");
4542 Chain = BaseAddr.getValue(1);
4544 if (Alignment > StackAlign) {
4546 << Subtarget->getWavefrontSizeLog2();
4547 uint64_t StackAlignMask = ScaledAlignment - 1;
4554 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4560 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4571 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4587 if (
Op.getValueType() != MVT::i32)
4606 assert(
Op.getValueType() == MVT::i32);
4615 Op.getOperand(0), IntrinID, GetRoundBothImm);
4649 SDValue RoundModeTimesNumBits =
4669 TableEntry, EnumOffset);
4685 static_cast<uint32_t>(ConstMode->getZExtValue()),
4697 if (UseReducedTable) {
4703 SDValue RoundModeTimesNumBits =
4723 SDValue RoundModeTimesNumBits =
4732 NewMode = TruncTable;
4741 ReadFirstLaneID, NewMode);
4754 IntrinID, RoundBothImm, NewMode);
4760 if (
Op->isDivergent() &&
4761 (!Subtarget->hasVmemPrefInsts() || !
Op.getConstantOperandVal(4)))
4771 if (Subtarget->hasSafeSmemPrefetch())
4779 if (!Subtarget->hasSafeSmemPrefetch() && !
Op.getConstantOperandVal(4))
4788 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4789 EVT SrcVT = Src.getValueType();
4798 EVT DstVT =
Op.getValueType();
4802 return DAG.
getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4807 if (
Op.getValueType() != MVT::i64)
4821 Op.getOperand(0), IntrinID, ModeHwRegImm);
4823 Op.getOperand(0), IntrinID, TrapHwRegImm);
4830 SDValue Result = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4837 if (
Op.getOperand(1).getValueType() != MVT::i64)
4849 ReadFirstLaneID, NewModeReg);
4851 ReadFirstLaneID, NewTrapReg);
4853 unsigned ModeHwReg =
4856 unsigned TrapHwReg =
4864 IntrinID, ModeHwRegImm, NewModeReg);
4867 IntrinID, TrapHwRegImm, NewTrapReg);
4876 .
Case(
"m0", AMDGPU::M0)
4877 .
Case(
"exec", AMDGPU::EXEC)
4878 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4879 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4880 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4881 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4882 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4887 if (!Subtarget->hasFlatScrRegister() &&
4888 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4890 "\" for subtarget."));
4895 case AMDGPU::EXEC_LO:
4896 case AMDGPU::EXEC_HI:
4897 case AMDGPU::FLAT_SCR_LO:
4898 case AMDGPU::FLAT_SCR_HI:
4903 case AMDGPU::FLAT_SCR:
4922 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
4931static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4953 auto Next = std::next(
I);
4964 MBB.addSuccessor(LoopBB);
4966 return std::pair(LoopBB, RemainderBB);
4973 auto I =
MI.getIterator();
4974 auto E = std::next(
I);
4996 Src->setIsKill(
false);
5006 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
5012 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5015 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
5039 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
5040 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
5050 Register PhiExec =
MRI.createVirtualRegister(BoolRC);
5051 Register NewExec =
MRI.createVirtualRegister(BoolRC);
5053 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5054 Register CondReg =
MRI.createVirtualRegister(BoolRC);
5062 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
5069 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5073 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5081 MRI.setSimpleHint(NewExec, CondReg);
5083 if (UseGPRIdxMode) {
5085 SGPRIdxReg = CurrentIdxReg;
5087 SGPRIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5088 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5098 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5129 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
5130 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
5138 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
5140 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
5141 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
5157 InitResultReg, DstReg, PhiReg, TmpExec,
5158 Offset, UseGPRIdxMode, SGPRIdxReg);
5164 LoopBB->removeSuccessor(RemainderBB);
5166 LoopBB->addSuccessor(LandingPad);
5177static std::pair<unsigned, int>
5181 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
5186 return std::pair(AMDGPU::sub0,
Offset);
5226 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5243 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
5244 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5253 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5256 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5260 if (UseGPRIdxMode) {
5267 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5280 MI.eraseFromParent();
5289 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5290 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5296 UseGPRIdxMode, SGPRIdxReg);
5300 if (UseGPRIdxMode) {
5302 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5304 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5309 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5314 MI.eraseFromParent();
5331 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5341 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5343 if (Idx->
getReg() == AMDGPU::NoRegister) {
5354 MI.eraseFromParent();
5359 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5363 if (UseGPRIdxMode) {
5367 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5376 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5377 TRI.getRegSizeInBits(*VecRC), 32,
false);
5383 MI.eraseFromParent();
5393 Register PhiReg =
MRI.createVirtualRegister(VecRC);
5397 UseGPRIdxMode, SGPRIdxReg);
5400 if (UseGPRIdxMode) {
5402 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5404 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5410 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5411 TRI.getRegSizeInBits(*VecRC), 32,
false);
5412 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
5418 MI.eraseFromParent();
5434 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5435 if (ST.hasScalarAddSub64()) {
5436 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5446 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5447 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5450 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5452 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5455 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5457 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5459 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5460 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5469 MI.eraseFromParent();
5475 case AMDGPU::S_MIN_U32:
5476 return std::numeric_limits<uint32_t>::max();
5477 case AMDGPU::S_MIN_I32:
5478 return std::numeric_limits<int32_t>::max();
5479 case AMDGPU::S_MAX_U32:
5480 return std::numeric_limits<uint32_t>::min();
5481 case AMDGPU::S_MAX_I32:
5482 return std::numeric_limits<int32_t>::min();
5483 case AMDGPU::V_ADD_F32_e64:
5485 case AMDGPU::V_SUB_F32_e64:
5487 case AMDGPU::S_ADD_I32:
5488 case AMDGPU::S_SUB_I32:
5489 case AMDGPU::S_OR_B32:
5490 case AMDGPU::S_XOR_B32:
5491 return std::numeric_limits<uint32_t>::min();
5492 case AMDGPU::S_AND_B32:
5493 return std::numeric_limits<uint32_t>::max();
5494 case AMDGPU::V_MIN_F32_e64:
5495 case AMDGPU::V_MAX_F32_e64:
5499 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5505 case AMDGPU::V_CMP_LT_U64_e64:
5506 return std::numeric_limits<uint64_t>::max();
5507 case AMDGPU::V_CMP_LT_I64_e64:
5508 return std::numeric_limits<int64_t>::max();
5509 case AMDGPU::V_CMP_GT_U64_e64:
5510 return std::numeric_limits<uint64_t>::min();
5511 case AMDGPU::V_CMP_GT_I64_e64:
5512 return std::numeric_limits<int64_t>::min();
5513 case AMDGPU::S_ADD_U64_PSEUDO:
5514 case AMDGPU::S_SUB_U64_PSEUDO:
5515 case AMDGPU::S_OR_B64:
5516 case AMDGPU::S_XOR_B64:
5517 return std::numeric_limits<uint64_t>::min();
5518 case AMDGPU::S_AND_B64:
5519 return std::numeric_limits<uint64_t>::max();
5522 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5527 return Opc == AMDGPU::S_MIN_U32 ||
Opc == AMDGPU::S_MIN_I32 ||
5528 Opc == AMDGPU::S_MAX_U32 ||
Opc == AMDGPU::S_MAX_I32 ||
5529 Opc == AMDGPU::S_ADD_I32 ||
Opc == AMDGPU::S_SUB_I32 ||
5530 Opc == AMDGPU::S_AND_B32 ||
Opc == AMDGPU::S_OR_B32 ||
5531 Opc == AMDGPU::S_XOR_B32 ||
Opc == AMDGPU::V_MIN_F32_e64 ||
5532 Opc == AMDGPU::V_MAX_F32_e64 ||
Opc == AMDGPU::V_ADD_F32_e64 ||
5533 Opc == AMDGPU::V_SUB_F32_e64;
5537 return Opc == AMDGPU::V_MIN_F32_e64 ||
Opc == AMDGPU::V_MAX_F32_e64 ||
5538 Opc == AMDGPU::V_ADD_F32_e64 ||
Opc == AMDGPU::V_SUB_F32_e64;
5552 bool isSGPR =
TRI->isSGPRClass(
MRI.getRegClass(SrcReg));
5557 case AMDGPU::S_MIN_U32:
5558 case AMDGPU::S_MIN_I32:
5559 case AMDGPU::V_MIN_F32_e64:
5560 case AMDGPU::S_MAX_U32:
5561 case AMDGPU::S_MAX_I32:
5562 case AMDGPU::V_MAX_F32_e64:
5563 case AMDGPU::S_AND_B32:
5564 case AMDGPU::S_OR_B32: {
5570 case AMDGPU::V_CMP_LT_U64_e64:
5571 case AMDGPU::V_CMP_LT_I64_e64:
5572 case AMDGPU::V_CMP_GT_U64_e64:
5573 case AMDGPU::V_CMP_GT_I64_e64:
5574 case AMDGPU::S_AND_B64:
5575 case AMDGPU::S_OR_B64: {
5581 case AMDGPU::S_XOR_B32:
5582 case AMDGPU::S_XOR_B64:
5583 case AMDGPU::S_ADD_I32:
5584 case AMDGPU::S_ADD_U64_PSEUDO:
5585 case AMDGPU::V_ADD_F32_e64:
5586 case AMDGPU::S_SUB_I32:
5587 case AMDGPU::S_SUB_U64_PSEUDO:
5588 case AMDGPU::V_SUB_F32_e64: {
5591 Register ExecMask =
MRI.createVirtualRegister(WaveMaskRegClass);
5593 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5595 bool IsWave32 = ST.isWave32();
5596 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5597 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5598 unsigned BitCountOpc =
5599 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5603 auto NewAccumulator =
5608 case AMDGPU::S_XOR_B32:
5609 case AMDGPU::S_XOR_B64: {
5615 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5618 .
addReg(NewAccumulator->getOperand(0).getReg())
5621 if (
Opc == AMDGPU::S_XOR_B32) {
5627 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5629 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5633 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5636 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5638 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5648 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5656 case AMDGPU::S_SUB_I32: {
5657 Register NegatedVal =
MRI.createVirtualRegister(DstRegClass);
5665 .
addReg(NewAccumulator->getOperand(0).getReg());
5668 case AMDGPU::S_ADD_I32: {
5671 .
addReg(NewAccumulator->getOperand(0).getReg());
5674 case AMDGPU::S_ADD_U64_PSEUDO:
5675 case AMDGPU::S_SUB_U64_PSEUDO: {
5676 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5677 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5679 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5681 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5682 Register CarryReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5683 Register AddReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5685 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5687 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5691 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5694 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5696 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5698 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5701 .
addReg(NewAccumulator->getOperand(0).getReg())
5711 Register LowOpcode =
Opc == AMDGPU::S_SUB_U64_PSEUDO
5713 : NewAccumulator->getOperand(0).getReg();
5724 Register HiVal =
Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5730 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5736 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5743 case AMDGPU::V_ADD_F32_e64:
5744 case AMDGPU::V_SUB_F32_e64: {
5746 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5747 Register DstVreg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5751 .
addReg(NewAccumulator->getOperand(0).getReg())
5756 unsigned srcMod =
Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5764 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5793 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
5794 Register IdentityValReg =
MRI.createVirtualRegister(DstRegClass);
5795 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
5796 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5797 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5798 Register FF1Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5799 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
5801 bool IsWave32 = ST.isWave32();
5802 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5803 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5810 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5814 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5823 I = ComputeLoop->begin();
5825 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
5829 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
5833 I = ComputeLoop->end();
5836 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5840 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5846 MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5847 Register DstVreg =
MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5849 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_MOV_B32_e32),
5859 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
5860 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5869 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5871 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5872 Register LaneValReg =
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5875 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5877 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5879 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5881 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5885 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5889 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
5890 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5896 case AMDGPU::S_OR_B64:
5897 case AMDGPU::S_AND_B64:
5898 case AMDGPU::S_XOR_B64: {
5901 .
addReg(LaneValue->getOperand(0).getReg())
5905 case AMDGPU::V_CMP_GT_I64_e64:
5906 case AMDGPU::V_CMP_GT_U64_e64:
5907 case AMDGPU::V_CMP_LT_I64_e64:
5908 case AMDGPU::V_CMP_LT_U64_e64: {
5909 Register LaneMaskReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5911 MRI.createVirtualRegister(WaveMaskRegClass);
5914 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5915 Register AccumulatorVReg =
MRI.createVirtualRegister(VregClass);
5918 VregClass, AMDGPU::sub0, VSubRegClass);
5921 VregClass, AMDGPU::sub1, VSubRegClass);
5922 BuildMI(*ComputeLoop,
I,
DL,
TII->get(TargetOpcode::REG_SEQUENCE),
5929 .
addReg(LaneValue->getOperand(0).getReg())
5930 .
addReg(AccumulatorVReg);
5932 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5933 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
5937 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
5938 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5939 .
addReg(LaneValue->getOperand(0).getReg())
5943 case AMDGPU::S_ADD_U64_PSEUDO:
5944 case AMDGPU::S_SUB_U64_PSEUDO: {
5947 .
addReg(LaneValue->getOperand(0).getReg());
5954 unsigned BITSETOpc =
5955 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5956 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
5962 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5965 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5967 .
addReg(NewActiveBitsReg)
5969 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
5974 MI.eraseFromParent();
5989 switch (
MI.getOpcode()) {
5990 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5992 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5994 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5996 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5998 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6000 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6002 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6004 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6006 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6008 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6010 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6012 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6014 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6016 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6018 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6020 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6022 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6024 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6026 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6028 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6030 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6032 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6034 case AMDGPU::S_UADDO_PSEUDO:
6035 case AMDGPU::S_USUBO_PSEUDO: {
6041 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6043 : AMDGPU::S_SUB_U32;
6051 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6054 MI.eraseFromParent();
6057 case AMDGPU::S_ADD_U64_PSEUDO:
6058 case AMDGPU::S_SUB_U64_PSEUDO: {
6061 case AMDGPU::V_ADD_U64_PSEUDO:
6062 case AMDGPU::V_SUB_U64_PSEUDO: {
6063 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6069 if (ST.hasAddSubU64Insts()) {
6071 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6072 : AMDGPU::V_SUB_U64_e64),
6077 TII->legalizeOperands(*
I);
6078 MI.eraseFromParent();
6082 if (IsAdd && ST.hasLshlAddU64Inst()) {
6088 TII->legalizeOperands(*
Add);
6089 MI.eraseFromParent();
6093 const auto *CarryRC =
TRI->getWaveMaskRegClass();
6095 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6096 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6098 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
6099 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
6103 : &AMDGPU::VReg_64RegClass;
6106 : &AMDGPU::VReg_64RegClass;
6109 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6111 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6114 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6116 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6119 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6121 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6124 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6131 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6145 TII->legalizeOperands(*LoHalf);
6146 TII->legalizeOperands(*HiHalf);
6147 MI.eraseFromParent();
6150 case AMDGPU::S_ADD_CO_PSEUDO:
6151 case AMDGPU::S_SUB_CO_PSEUDO: {
6162 Register RegOp0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6163 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6168 Register RegOp1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6169 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6173 Register RegOp2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6175 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6180 if (ST.isWave64()) {
6181 if (ST.hasScalarCompareEq64()) {
6188 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6190 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6192 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6193 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6195 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
6209 unsigned Opc =
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6210 ? AMDGPU::S_ADDC_U32
6211 : AMDGPU::S_SUBB_U32;
6216 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6222 MI.eraseFromParent();
6225 case AMDGPU::SI_INIT_M0: {
6228 TII->get(M0Init.
isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6231 MI.eraseFromParent();
6234 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6237 TII->get(AMDGPU::S_CMP_EQ_U32))
6242 case AMDGPU::GET_GROUPSTATICSIZE: {
6246 .
add(
MI.getOperand(0))
6248 MI.eraseFromParent();
6251 case AMDGPU::GET_SHADERCYCLESHILO: {
6264 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6266 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6267 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6269 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6270 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6272 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6276 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6281 .
add(
MI.getOperand(0))
6286 MI.eraseFromParent();
6289 case AMDGPU::SI_INDIRECT_SRC_V1:
6290 case AMDGPU::SI_INDIRECT_SRC_V2:
6291 case AMDGPU::SI_INDIRECT_SRC_V4:
6292 case AMDGPU::SI_INDIRECT_SRC_V8:
6293 case AMDGPU::SI_INDIRECT_SRC_V9:
6294 case AMDGPU::SI_INDIRECT_SRC_V10:
6295 case AMDGPU::SI_INDIRECT_SRC_V11:
6296 case AMDGPU::SI_INDIRECT_SRC_V12:
6297 case AMDGPU::SI_INDIRECT_SRC_V16:
6298 case AMDGPU::SI_INDIRECT_SRC_V32:
6300 case AMDGPU::SI_INDIRECT_DST_V1:
6301 case AMDGPU::SI_INDIRECT_DST_V2:
6302 case AMDGPU::SI_INDIRECT_DST_V4:
6303 case AMDGPU::SI_INDIRECT_DST_V8:
6304 case AMDGPU::SI_INDIRECT_DST_V9:
6305 case AMDGPU::SI_INDIRECT_DST_V10:
6306 case AMDGPU::SI_INDIRECT_DST_V11:
6307 case AMDGPU::SI_INDIRECT_DST_V12:
6308 case AMDGPU::SI_INDIRECT_DST_V16:
6309 case AMDGPU::SI_INDIRECT_DST_V32:
6311 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6312 case AMDGPU::SI_KILL_I1_PSEUDO:
6314 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6318 Register SrcCond =
MI.getOperand(3).getReg();
6320 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6321 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6322 const auto *CondRC =
TRI->getWaveMaskRegClass();
6323 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
6327 : &AMDGPU::VReg_64RegClass;
6330 : &AMDGPU::VReg_64RegClass;
6333 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6335 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6338 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6340 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6343 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6345 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6366 MI.eraseFromParent();
6369 case AMDGPU::SI_BR_UNDEF: {
6371 .
add(
MI.getOperand(0));
6373 MI.eraseFromParent();
6376 case AMDGPU::ADJCALLSTACKUP:
6377 case AMDGPU::ADJCALLSTACKDOWN: {
6384 case AMDGPU::SI_CALL_ISEL: {
6385 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
6388 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6394 MI.eraseFromParent();
6397 case AMDGPU::V_ADD_CO_U32_e32:
6398 case AMDGPU::V_SUB_CO_U32_e32:
6399 case AMDGPU::V_SUBREV_CO_U32_e32: {
6401 unsigned Opc =
MI.getOpcode();
6403 bool NeedClampOperand =
false;
6404 if (
TII->pseudoToMCOpcode(
Opc) == -1) {
6406 NeedClampOperand =
true;
6410 if (
TII->isVOP3(*
I)) {
6413 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
6414 if (NeedClampOperand)
6417 TII->legalizeOperands(*
I);
6419 MI.eraseFromParent();
6422 case AMDGPU::V_ADDC_U32_e32:
6423 case AMDGPU::V_SUBB_U32_e32:
6424 case AMDGPU::V_SUBBREV_U32_e32:
6427 TII->legalizeOperands(
MI);
6429 case AMDGPU::DS_GWS_INIT:
6430 case AMDGPU::DS_GWS_SEMA_BR:
6431 case AMDGPU::DS_GWS_BARRIER:
6432 case AMDGPU::DS_GWS_SEMA_V:
6433 case AMDGPU::DS_GWS_SEMA_P:
6434 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6442 case AMDGPU::S_SETREG_B32: {
6458 const unsigned SetMask = WidthMask <<
Offset;
6461 unsigned SetDenormOp = 0;
6462 unsigned SetRoundOp = 0;
6470 SetRoundOp = AMDGPU::S_ROUND_MODE;
6471 SetDenormOp = AMDGPU::S_DENORM_MODE;
6473 SetRoundOp = AMDGPU::S_ROUND_MODE;
6475 SetDenormOp = AMDGPU::S_DENORM_MODE;
6478 if (SetRoundOp || SetDenormOp) {
6480 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6481 unsigned ImmVal = Def->getOperand(1).getImm();
6495 MI.eraseFromParent();
6504 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
6508 case AMDGPU::S_INVERSE_BALLOT_U32:
6509 case AMDGPU::S_INVERSE_BALLOT_U64:
6512 MI.setDesc(
TII->get(AMDGPU::COPY));
6514 case AMDGPU::ENDPGM_TRAP: {
6516 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
6536 MI.eraseFromParent();
6539 case AMDGPU::SIMULATED_TRAP: {
6540 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6542 TII->insertSimulatedTrap(
MRI, *BB,
MI,
MI.getDebugLoc());
6543 MI.eraseFromParent();
6546 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6547 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6553 assert(Setup &&
"Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6554 Register OriginalExec = Setup->getOperand(0).getReg();
6556 MI.getOperand(0).setReg(OriginalExec);
6593 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6597 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6624 if (!Subtarget->hasMadMacF32Insts())
6625 return Subtarget->hasFastFMAF32();
6631 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6634 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6650 switch (Ty.getScalarSizeInBits()) {
6668 if (Ty.getScalarSizeInBits() == 16)
6670 if (Ty.getScalarSizeInBits() == 32)
6671 return Subtarget->hasMadMacF32Insts() &&
6681 EVT VT =
N->getValueType(0);
6683 return Subtarget->hasMadMacF32Insts() &&
6685 if (VT == MVT::f16) {
6686 return Subtarget->hasMadF16() &&
6701 unsigned Opc =
Op.getOpcode();
6702 EVT VT =
Op.getValueType();
6703 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6704 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6705 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6706 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6707 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6708 VT == MVT::v32bf16);
6724 [[maybe_unused]]
EVT VT =
Op.getValueType();
6726 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6727 VT == MVT::v16i32) &&
6728 "Unexpected ValueType.");
6737 unsigned Opc =
Op.getOpcode();
6738 EVT VT =
Op.getValueType();
6739 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6740 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6741 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6742 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6743 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6744 VT == MVT::v32bf16);
6752 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
6754 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
6761 unsigned Opc =
Op.getOpcode();
6762 EVT VT =
Op.getValueType();
6763 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6764 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6765 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6766 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6767 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6768 VT == MVT::v32bf16);
6773 : std::pair(Op0, Op0);
6782 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
6784 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
6790 switch (
Op.getOpcode()) {
6794 return LowerBRCOND(
Op, DAG);
6796 return LowerRETURNADDR(
Op, DAG);
6799 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6800 "Load should return a value and a chain");
6804 EVT VT =
Op.getValueType();
6806 return lowerFSQRTF32(
Op, DAG);
6808 return lowerFSQRTF64(
Op, DAG);
6813 return LowerTrig(
Op, DAG);
6815 return LowerSELECT(
Op, DAG);
6817 return LowerFDIV(
Op, DAG);
6819 return LowerFFREXP(
Op, DAG);
6820 case ISD::ATOMIC_CMP_SWAP:
6821 return LowerATOMIC_CMP_SWAP(
Op, DAG);
6823 return LowerSTORE(
Op, DAG);
6827 return LowerGlobalAddress(MFI,
Op, DAG);
6830 return LowerINTRINSIC_WO_CHAIN(
Op, DAG);
6832 return LowerINTRINSIC_W_CHAIN(
Op, DAG);
6834 return LowerINTRINSIC_VOID(
Op, DAG);
6835 case ISD::ADDRSPACECAST:
6836 return lowerADDRSPACECAST(
Op, DAG);
6838 return lowerINSERT_SUBVECTOR(
Op, DAG);
6840 return lowerINSERT_VECTOR_ELT(
Op, DAG);
6842 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
6844 return lowerVECTOR_SHUFFLE(
Op, DAG);
6846 return lowerSCALAR_TO_VECTOR(
Op, DAG);
6848 return lowerBUILD_VECTOR(
Op, DAG);
6851 return lowerFP_ROUND(
Op, DAG);
6853 return lowerTRAP(
Op, DAG);
6854 case ISD::DEBUGTRAP:
6855 return lowerDEBUGTRAP(
Op, DAG);
6864 return lowerFMINNUM_FMAXNUM(
Op, DAG);
6865 case ISD::FMINIMUMNUM:
6866 case ISD::FMAXIMUMNUM:
6867 return lowerFMINIMUMNUM_FMAXIMUMNUM(
Op, DAG);
6870 return lowerFMINIMUM_FMAXIMUM(
Op, DAG);
6873 return lowerFLDEXP(
Op, DAG);
6890 case ISD::FMINNUM_IEEE:
6891 case ISD::FMAXNUM_IEEE:
6898 return lowerFCOPYSIGN(
Op, DAG);
6900 return lowerMUL(
Op, DAG);
6903 return lowerXMULO(
Op, DAG);
6906 return lowerXMUL_LOHI(
Op, DAG);
6907 case ISD::DYNAMIC_STACKALLOC:
6909 case ISD::STACKSAVE:
6913 case ISD::SET_ROUNDING:
6917 case ISD::FP_EXTEND:
6920 case ISD::GET_FPENV:
6922 case ISD::SET_FPENV:
6941 EVT FittingLoadVT = LoadVT;
6966 return DAG.
getNode(ISD::BITCAST,
DL, FittingLoadVT, Result);
6970 return DAG.
getNode(ISD::BITCAST,
DL, FittingLoadVT, Result);
6973SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
6976 bool IsIntrinsic)
const {
6979 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6980 EVT LoadVT =
M->getValueType(0);
6982 EVT EquivLoadVT = LoadVT;
6996 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
7000 M->getMemoryVT(),
M->getMemOperand());
7011 EVT LoadVT =
M->getValueType(0);
7017 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
7018 bool IsTFE =
M->getNumValues() == 3;
7020 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7021 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7022 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7023 : AMDGPUISD::BUFFER_LOAD;
7026 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7031 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7035 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7036 M->getMemOperand(), DAG);
7040 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7042 M->getMemOperand(), DAG);
7050 EVT VT =
N->getValueType(0);
7051 unsigned CondCode =
N->getConstantOperandVal(3);
7062 EVT CmpVT =
LHS.getValueType();
7063 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7064 unsigned PromoteOp =
7084 EVT VT =
N->getValueType(0);
7086 unsigned CondCode =
N->getConstantOperandVal(3);
7095 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7096 Src0 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7097 Src1 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7104 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7113 EVT VT =
N->getValueType(0);
7122 Op0 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7123 Op1 = DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7137 Exec = AMDGPU::EXEC_LO;
7139 Exec = AMDGPU::EXEC;
7156 EVT VT =
N->getValueType(0);
7158 unsigned IID =
N->getConstantOperandVal(0);
7159 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7160 IID == Intrinsic::amdgcn_permlanex16;
7161 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7162 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7166 unsigned SplitSize = 32;
7167 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7168 ST->hasDPALU_DPP() &&
7176 case Intrinsic::amdgcn_permlane16:
7177 case Intrinsic::amdgcn_permlanex16:
7178 case Intrinsic::amdgcn_update_dpp:
7183 case Intrinsic::amdgcn_writelane:
7186 case Intrinsic::amdgcn_readlane:
7187 case Intrinsic::amdgcn_set_inactive:
7188 case Intrinsic::amdgcn_set_inactive_chain_arg:
7189 case Intrinsic::amdgcn_mov_dpp8:
7192 case Intrinsic::amdgcn_readfirstlane:
7193 case Intrinsic::amdgcn_permlane64:
7201 std::reverse(Operands.
begin(), Operands.
end());
7203 if (
SDNode *GL =
N->getGluedNode()) {
7204 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7205 GL = GL->getOperand(0).getNode();
7215 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7216 IID == Intrinsic::amdgcn_mov_dpp8 ||
7217 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7218 Src1 =
N->getOperand(2);
7219 if (IID == Intrinsic::amdgcn_writelane ||
7220 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7221 Src2 =
N->getOperand(3);
7224 if (ValSize == SplitSize) {
7234 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7239 if (IID == Intrinsic::amdgcn_writelane) {
7244 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7246 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7249 if (ValSize % SplitSize != 0)
7253 EVT VT =
N->getValueType(0);
7257 unsigned NumOperands =
N->getNumOperands();
7259 SDNode *GL =
N->getGluedNode();
7264 for (
unsigned i = 0; i != NE; ++i) {
7265 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7267 SDValue Operand =
N->getOperand(j);
7276 Operands[j] = Operand;
7281 Operands[NumOperands - 1] =
7282 DAG.
getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7297 if (SplitSize == 32) {
7299 return unrollLaneOp(LaneOp.
getNode());
7305 unsigned SubVecNumElt =
7309 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7310 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7314 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7319 if (IID == Intrinsic::amdgcn_writelane)
7324 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7325 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7326 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7327 EltIdx += SubVecNumElt;
7341 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7344 if (IID == Intrinsic::amdgcn_writelane)
7347 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7355 switch (
N->getOpcode()) {
7367 unsigned IID =
N->getConstantOperandVal(0);
7369 case Intrinsic::amdgcn_make_buffer_rsrc:
7370 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7372 case Intrinsic::amdgcn_cvt_pkrtz: {
7377 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7378 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7381 case Intrinsic::amdgcn_cvt_pknorm_i16:
7382 case Intrinsic::amdgcn_cvt_pknorm_u16:
7383 case Intrinsic::amdgcn_cvt_pk_i16:
7384 case Intrinsic::amdgcn_cvt_pk_u16: {
7390 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7391 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7392 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7393 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7394 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7395 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7397 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7399 EVT VT =
N->getValueType(0);
7404 Results.push_back(DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7408 case Intrinsic::amdgcn_s_buffer_load: {
7414 if (!Subtarget->hasScalarSubwordLoads())
7420 EVT VT =
Op.getValueType();
7421 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7433 if (!
Offset->isDivergent()) {
7452 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7457 case Intrinsic::amdgcn_dead: {
7458 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7469 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7470 Results.push_back(Res.getOperand(
I));
7474 Results.push_back(Res.getValue(1));
7483 EVT VT =
N->getValueType(0);
7488 EVT SelectVT = NewVT;
7489 if (NewVT.
bitsLT(MVT::i32)) {
7492 SelectVT = MVT::i32;
7498 if (NewVT != SelectVT)
7504 if (
N->getValueType(0) != MVT::v2f16)
7508 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7516 if (
N->getValueType(0) != MVT::v2f16)
7520 SDValue BC = DAG.
getNode(ISD::BITCAST, SL, MVT::i32,
N->getOperand(0));
7528 if (
N->getValueType(0) != MVT::f16)
7543 if (U.get() !=
Value)
7546 if (U.getUser()->getOpcode() == Opcode)
7552unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7555 case Intrinsic::amdgcn_if:
7556 return AMDGPUISD::IF;
7557 case Intrinsic::amdgcn_else:
7558 return AMDGPUISD::ELSE;
7559 case Intrinsic::amdgcn_loop:
7560 return AMDGPUISD::LOOP;
7561 case Intrinsic::amdgcn_end_cf:
7581 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7608 SDNode *Intr = BRCOND.getOperand(1).getNode();
7625 Intr =
LHS.getNode();
7633 assert(BR &&
"brcond missing unconditional branch user");
7638 unsigned CFNode = isCFIntrinsic(Intr);
7658 Ops.push_back(Target);
7681 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7700 MVT VT =
Op.getSimpleValueType();
7703 if (
Op.getConstantOperandVal(0) != 0)
7707 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7709 if (
Info->isEntryFunction())
7726 return Op.getValueType().bitsLE(VT)
7734 EVT DstVT =
Op.getValueType();
7741 unsigned Opc =
Op.getOpcode();
7753 EVT SrcVT = Src.getValueType();
7754 EVT DstVT =
Op.getValueType();
7757 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
7760 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
7767 if (DstVT == MVT::f16) {
7772 if (!Subtarget->has16BitInsts()) {
7775 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7777 if (
Op->getFlags().hasApproximateFuncs()) {
7784 return DAG.
getNode(ISD::BITCAST,
DL, MVT::f16, Trunc);
7788 "custom lower FP_ROUND for f16 or bf16");
7789 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7802 EVT VT =
Op.getValueType();
7804 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7805 bool IsIEEEMode =
Info->getMode().IEEE;
7814 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7821SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
7823 EVT VT =
Op.getValueType();
7825 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7826 bool IsIEEEMode =
Info->getMode().IEEE;
7831 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7839 EVT VT =
Op.getValueType();
7843 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7844 !Subtarget->hasMinimum3Maximum3F16() &&
7845 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7846 "should not need to widen f16 minimum/maximum to v2f16");
7860 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7868 EVT VT =
Op.getValueType();
7872 EVT ExpVT =
Exp.getValueType();
7873 if (ExpVT == MVT::i16)
7894 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
7897 return DAG.
getNode(ISD::FLDEXP,
DL, VT,
Op.getOperand(0), TruncExp);
7901 switch (
Op->getOpcode()) {
7931 DAGCombinerInfo &DCI)
const {
7932 const unsigned Opc =
Op.getOpcode();
7940 :
Op->getOperand(0).getValueType();
7943 if (DCI.isBeforeLegalizeOps() ||
7947 auto &DAG = DCI.DAG;
7953 LHS =
Op->getOperand(1);
7954 RHS =
Op->getOperand(2);
7956 LHS =
Op->getOperand(0);
7957 RHS =
Op->getOperand(1);
7996 if (MagVT == SignVT)
8003 SDValue SignAsInt32 = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8006 SDValue SignAsHalf16 = DAG.
getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8013 EVT VT =
Op.getValueType();
8019 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
8046 if (
Op->isDivergent())
8059 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8061 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8064 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8066 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8072 EVT VT =
Op.getValueType();
8079 const APInt &
C = RHSC->getAPIntValue();
8081 if (
C.isPowerOf2()) {
8083 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
8110 if (
Op->isDivergent()) {
8114 if (Subtarget->hasSMulHi()) {
8125 if (!Subtarget->isTrapHandlerEnabled() ||
8127 return lowerTrapEndpgm(
Op, DAG);
8129 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8130 : lowerTrapHsaQueuePtr(
Op, DAG);
8136 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8140SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8142 ImplicitParameter Param)
const {
8162 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8165 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8168 if (UserSGPR == AMDGPU::NoRegister) {
8185 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8194 if (Subtarget->hasPrivEnabledTrap2NopBug())
8195 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8199 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8207 if (!Subtarget->isTrapHandlerEnabled() ||
8211 "debugtrap handler not supported",
8219 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8222SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8224 if (Subtarget->hasApertureRegs()) {
8226 ? AMDGPU::SRC_SHARED_BASE
8227 : AMDGPU::SRC_PRIVATE_BASE;
8228 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8229 !Subtarget->hasGloballyAddressableScratch()) &&
8230 "Cannot use src_private_base with globally addressable scratch!");
8251 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8255 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8257 if (UserSGPR == AMDGPU::NoRegister) {
8302 const AMDGPUTargetMachine &TM =
8305 unsigned DestAS, SrcAS;
8307 bool IsNonNull =
false;
8309 SrcAS = ASC->getSrcAddressSpace();
8310 Src = ASC->getOperand(0);
8311 DestAS = ASC->getDestAddressSpace();
8314 Op.getConstantOperandVal(0) ==
8315 Intrinsic::amdgcn_addrspacecast_nonnull);
8316 Src =
Op->getOperand(1);
8317 SrcAS =
Op->getConstantOperandVal(2);
8318 DestAS =
Op->getConstantOperandVal(3);
8331 Subtarget->hasGloballyAddressableScratch()) {
8336 AMDGPU::S_MOV_B32, SL, MVT::i32,
8337 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8345 unsigned NullVal = TM.getNullPointerValue(DestAS);
8360 Subtarget->hasGloballyAddressableScratch()) {
8369 if (Subtarget->isWave64())
8375 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8378 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8383 AMDGPU::S_MOV_B64, SL, MVT::i64,
8384 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8386 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8388 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8390 CvtPtr = DAG.
getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8396 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8408 Op.getValueType() == MVT::i64) {
8409 const SIMachineFunctionInfo *
Info =
8413 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8417 Src.getValueType() == MVT::i64)
8437 EVT InsVT =
Ins.getValueType();
8445 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8450 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8452 MVT::i32, InsNumElts / 2);
8454 Vec = DAG.
getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8455 Ins = DAG.
getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8457 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8459 if (InsNumElts == 2) {
8469 return DAG.
getNode(ISD::BITCAST, SL, VecVT, Vec);
8472 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8495 if (NumElts == 4 && EltSize == 16 && KIdx) {
8503 SDValue LoVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8504 SDValue HiVec = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8506 unsigned Idx = KIdx->getZExtValue();
8507 bool InsertLo = Idx < 2;
8510 DAG.
getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8511 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8513 InsHalf = DAG.
getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8517 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8530 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8558 return DAG.
getNode(ISD::BITCAST, SL, VecVT, BFI);
8565 EVT ResultVT =
Op.getValueType();
8578 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8581 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8585 if (VecSize == 128) {
8593 }
else if (VecSize == 256) {
8596 for (
unsigned P = 0;
P < 4; ++
P) {
8602 Parts[0], Parts[1]));
8604 Parts[2], Parts[3]));
8610 for (
unsigned P = 0;
P < 8; ++
P) {
8617 Parts[0], Parts[1], Parts[2], Parts[3]));
8620 Parts[4], Parts[5], Parts[6], Parts[7]));
8640 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8655 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8657 return DAG.
getNode(ISD::BITCAST, SL, ResultVT, Result);
8665 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8670 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8671 !(Mask[Elt + 1] & 1);
8677 EVT ResultVT =
Op.getValueType();
8680 const int NewSrcNumElts = 2;
8682 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8698 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8720 if (ShouldUseConsecutiveExtract &&
8723 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8724 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8736 if (Idx0 >= SrcNumElts) {
8741 if (Idx1 >= SrcNumElts) {
8746 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8747 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8755 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8756 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8761 if (SubVec0 != SubVec1) {
8762 NewMaskIdx1 += NewSrcNumElts;
8769 {NewMaskIdx0, NewMaskIdx1});
8774 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8775 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8776 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8777 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8796 EVT ResultVT =
Op.getValueType();
8812 EVT VT =
Op.getValueType();
8814 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8815 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
8824 return DAG.
getNode(ISD::BITCAST, SL, VT, ExtLo);
8833 return DAG.
getNode(ISD::BITCAST, SL, VT, ShlHi);
8840 return DAG.
getNode(ISD::BITCAST, SL, VT,
Or);
8849 for (
unsigned P = 0;
P < NumParts; ++
P) {
8851 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8857 return DAG.
getNode(ISD::BITCAST, SL, VT, Blend);
8870 if (!Subtarget->isAmdHsaOS())
8913 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
8922 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
8930 EVT PtrVT =
Op.getValueType();
8932 const GlobalValue *GV = GSD->
getGlobal();
8946 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8961 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
8964 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8965 if (Subtarget->has64BitLiterals()) {
8996 MachinePointerInfo PtrInfo =
9024 SDValue Param = lowerKernargMemParameter(
9035 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9043 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9051 unsigned NumElts = Elts.
size();
9053 if (NumElts <= 12) {
9062 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9068 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9078 EVT SrcVT = Src.getValueType();
9099 bool Unpacked,
bool IsD16,
int DMaskPop,
9100 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9104 EVT ReqRetVT = ResultTypes[0];
9106 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9107 ? (ReqRetNumElts + 1) / 2
9110 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9121 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9132 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9134 NumDataDwords - MaskPopDwords);
9139 EVT LegalReqRetVT = ReqRetVT;
9141 if (!
Data.getValueType().isInteger())
9143 Data.getValueType().changeTypeToInteger(),
Data);
9164 if (Result->getNumValues() == 1)
9171 SDValue *LWE,
bool &IsTexFail) {
9191 unsigned DimIdx,
unsigned EndIdx,
9192 unsigned NumGradients) {
9194 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9202 if (((
I + 1) >= EndIdx) ||
9203 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9204 I == DimIdx + NumGradients - 1))) {
9226 !
Op.getNode()->hasAnyUseOfValue(0))
9228 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9238 ResultTypes.erase(&ResultTypes[0]);
9244 int NumVDataDwords = 0;
9245 bool AdjustRetType =
false;
9246 bool IsAtomicPacked16Bit =
false;
9249 const unsigned ArgOffset = WithChain ? 2 : 1;
9252 unsigned DMaskLanes = 0;
9254 if (BaseOpcode->
Atomic) {
9255 VData =
Op.getOperand(2);
9257 IsAtomicPacked16Bit =
9258 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9259 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9260 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9261 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9272 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9274 DMask = Is64Bit ? 0xf : 0x3;
9275 NumVDataDwords = Is64Bit ? 4 : 2;
9277 DMask = Is64Bit ? 0x3 : 0x1;
9278 NumVDataDwords = Is64Bit ? 2 : 1;
9281 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9284 if (BaseOpcode->
Store) {
9285 VData =
Op.getOperand(2);
9289 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9293 VData = handleD16VData(VData, DAG,
true);
9296 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9297 }
else if (!BaseOpcode->
NoReturn) {
9302 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9310 (!LoadVT.
isVector() && DMaskLanes > 1))
9316 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9317 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9318 NumVDataDwords = (DMaskLanes + 1) / 2;
9320 NumVDataDwords = DMaskLanes;
9322 AdjustRetType =
true;
9326 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9333 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9334 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9336 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9338 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9339 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9343 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9349 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9353 "Bias needs to be converted to 16 bit in A16 mode");
9358 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9362 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9363 "require 16 bit args for both gradients and addresses");
9368 if (!
ST->hasA16()) {
9369 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9370 "support 16 bit addresses\n");
9380 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9382 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9384 IntrOpcode = G16MappingInfo->
G16;
9407 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9425 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9426 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9427 const bool UseNSA =
ST->hasNSAEncoding() &&
9428 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9429 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9430 const bool UsePartialNSA =
9431 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9434 if (UsePartialNSA) {
9436 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9437 }
else if (!UseNSA) {
9447 uint64_t UnormConst =
9448 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9450 Unorm = UnormConst ? True : False;
9456 bool IsTexFail =
false;
9457 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9468 NumVDataDwords += 1;
9469 AdjustRetType =
true;
9474 if (AdjustRetType) {
9477 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9486 MVT::i32, NumVDataDwords)
9489 ResultTypes[0] = NewVT;
9490 if (ResultTypes.size() == 3) {
9494 ResultTypes.erase(&ResultTypes[1]);
9508 Ops.push_back(VData);
9509 if (UsePartialNSA) {
9511 Ops.push_back(VAddr);
9515 Ops.push_back(VAddr);
9518 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9520 Ops.push_back(Rsrc);
9525 Ops.push_back(Samp);
9530 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9531 Ops.push_back(Unorm);
9533 Ops.push_back(IsA16 &&
9534 ST->hasFeature(AMDGPU::FeatureR128A16)
9538 Ops.push_back(IsA16 ? True : False);
9540 if (!Subtarget->hasGFX90AInsts())
9545 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9548 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9551 Ops.push_back(DimInfo->
DA ? True : False);
9553 Ops.push_back(IsD16 ? True : False);
9555 Ops.push_back(
Op.getOperand(0));
9557 int NumVAddrDwords =
9563 NumVDataDwords, NumVAddrDwords);
9564 }
else if (IsGFX11Plus) {
9566 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9567 : AMDGPU::MIMGEncGfx11Default,
9568 NumVDataDwords, NumVAddrDwords);
9569 }
else if (IsGFX10Plus) {
9571 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9572 : AMDGPU::MIMGEncGfx10Default,
9573 NumVDataDwords, NumVAddrDwords);
9575 if (Subtarget->hasGFX90AInsts()) {
9577 NumVDataDwords, NumVAddrDwords);
9581 "requested image instruction is not supported on this GPU",
9586 for (EVT VT : OrigResultTypes) {
9587 if (VT == MVT::Other)
9588 RetValues[Idx++] =
Op.getOperand(0);
9599 NumVDataDwords, NumVAddrDwords);
9602 NumVDataDwords, NumVAddrDwords);
9609 MachineMemOperand *MemRef = MemOp->getMemOperand();
9628 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9629 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9642 MachinePointerInfo(),
9647 if (!
Offset->isDivergent()) {
9654 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9663 !Subtarget->hasScalarDwordx3Loads()) {
9667 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
9690 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9692 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9696 unsigned NumLoads = 1;
9702 if (NumElts == 8 || NumElts == 16) {
9703 NumLoads = NumElts / 4;
9707 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9712 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9714 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9715 for (
unsigned i = 0; i < NumLoads; ++i) {
9717 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
9721 if (NumElts == 8 || NumElts == 16)
9729 if (!Subtarget->hasArchitectedSGPRs())
9734 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9741 unsigned Width)
const {
9743 using namespace AMDGPU::Hwreg;
9745 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9784 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9786 EVT VT =
Op.getValueType();
9788 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9792 switch (IntrinsicID) {
9793 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9796 return getPreloadedValue(DAG, *MFI, VT,
9799 case Intrinsic::amdgcn_dispatch_ptr:
9800 case Intrinsic::amdgcn_queue_ptr: {
9801 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9803 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9808 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9811 return getPreloadedValue(DAG, *MFI, VT, RegID);
9813 case Intrinsic::amdgcn_implicitarg_ptr: {
9815 return getImplicitArgPtr(DAG,
DL);
9816 return getPreloadedValue(DAG, *MFI, VT,
9819 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9825 return getPreloadedValue(DAG, *MFI, VT,
9828 case Intrinsic::amdgcn_dispatch_id: {
9831 case Intrinsic::amdgcn_rcp:
9832 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
9833 case Intrinsic::amdgcn_rsq:
9834 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
9835 case Intrinsic::amdgcn_rsq_legacy:
9839 case Intrinsic::amdgcn_rcp_legacy:
9842 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
9843 case Intrinsic::amdgcn_rsq_clamp: {
9845 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
9854 return DAG.
getNode(ISD::FMAXNUM,
DL, VT, Tmp,
9857 case Intrinsic::r600_read_ngroups_x:
9858 if (Subtarget->isAmdHsaOS())
9861 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9864 case Intrinsic::r600_read_ngroups_y:
9865 if (Subtarget->isAmdHsaOS())
9868 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9871 case Intrinsic::r600_read_ngroups_z:
9872 if (Subtarget->isAmdHsaOS())
9875 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9878 case Intrinsic::r600_read_local_size_x:
9879 if (Subtarget->isAmdHsaOS())
9882 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9884 case Intrinsic::r600_read_local_size_y:
9885 if (Subtarget->isAmdHsaOS())
9888 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9890 case Intrinsic::r600_read_local_size_z:
9891 if (Subtarget->isAmdHsaOS())
9894 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9896 case Intrinsic::amdgcn_workgroup_id_x:
9897 return lowerWorkGroupId(DAG, *MFI, VT,
9901 case Intrinsic::amdgcn_workgroup_id_y:
9902 return lowerWorkGroupId(DAG, *MFI, VT,
9906 case Intrinsic::amdgcn_workgroup_id_z:
9907 return lowerWorkGroupId(DAG, *MFI, VT,
9911 case Intrinsic::amdgcn_cluster_id_x:
9912 return Subtarget->hasClusters()
9913 ? getPreloadedValue(DAG, *MFI, VT,
9915 : DAG.getPOISON(VT);
9916 case Intrinsic::amdgcn_cluster_id_y:
9917 return Subtarget->hasClusters()
9918 ? getPreloadedValue(DAG, *MFI, VT,
9921 case Intrinsic::amdgcn_cluster_id_z:
9922 return Subtarget->hasClusters()
9923 ? getPreloadedValue(DAG, *MFI, VT,
9926 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9927 return Subtarget->hasClusters()
9928 ? getPreloadedValue(
9932 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9933 return Subtarget->hasClusters()
9934 ? getPreloadedValue(
9938 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9939 return Subtarget->hasClusters()
9940 ? getPreloadedValue(
9944 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9945 return Subtarget->hasClusters()
9948 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9949 return Subtarget->hasClusters()
9950 ? getPreloadedValue(
9954 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9955 return Subtarget->hasClusters()
9956 ? getPreloadedValue(
9960 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9961 return Subtarget->hasClusters()
9962 ? getPreloadedValue(
9966 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9967 return Subtarget->hasClusters()
9968 ? getPreloadedValue(
9972 case Intrinsic::amdgcn_wave_id:
9973 return lowerWaveID(DAG,
Op);
9974 case Intrinsic::amdgcn_lds_kernel_id: {
9976 return getLDSKernelId(DAG,
DL);
9977 return getPreloadedValue(DAG, *MFI, VT,
9980 case Intrinsic::amdgcn_workitem_id_x:
9981 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
9982 case Intrinsic::amdgcn_workitem_id_y:
9983 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
9984 case Intrinsic::amdgcn_workitem_id_z:
9985 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
9986 case Intrinsic::amdgcn_wavefrontsize:
9988 SDLoc(
Op), MVT::i32);
9989 case Intrinsic::amdgcn_s_buffer_load: {
9990 unsigned CPol =
Op.getConstantOperandVal(3);
9997 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
9998 Op.getOperand(3), DAG);
10000 case Intrinsic::amdgcn_fdiv_fast:
10001 return lowerFDIV_FAST(
Op, DAG);
10002 case Intrinsic::amdgcn_sin:
10003 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10005 case Intrinsic::amdgcn_cos:
10006 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10008 case Intrinsic::amdgcn_mul_u24:
10009 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10011 case Intrinsic::amdgcn_mul_i24:
10012 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10015 case Intrinsic::amdgcn_log_clamp: {
10021 case Intrinsic::amdgcn_fract:
10022 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10024 case Intrinsic::amdgcn_class:
10025 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10027 case Intrinsic::amdgcn_div_fmas:
10028 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10029 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10031 case Intrinsic::amdgcn_div_fixup:
10032 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10033 Op.getOperand(2),
Op.getOperand(3));
10035 case Intrinsic::amdgcn_div_scale: {
10041 SDValue Denominator =
Op.getOperand(2);
10048 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10050 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10051 Denominator, Numerator);
10053 case Intrinsic::amdgcn_icmp: {
10055 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10056 Op.getConstantOperandVal(2) == 0 &&
10061 case Intrinsic::amdgcn_fcmp: {
10064 case Intrinsic::amdgcn_ballot:
10066 case Intrinsic::amdgcn_fmed3:
10067 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10068 Op.getOperand(2),
Op.getOperand(3));
10069 case Intrinsic::amdgcn_fdot2:
10070 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10071 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10072 case Intrinsic::amdgcn_fmul_legacy:
10073 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10075 case Intrinsic::amdgcn_sffbh:
10076 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10077 case Intrinsic::amdgcn_sbfe:
10078 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10079 Op.getOperand(2),
Op.getOperand(3));
10080 case Intrinsic::amdgcn_ubfe:
10081 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10082 Op.getOperand(2),
Op.getOperand(3));
10083 case Intrinsic::amdgcn_cvt_pkrtz:
10084 case Intrinsic::amdgcn_cvt_pknorm_i16:
10085 case Intrinsic::amdgcn_cvt_pknorm_u16:
10086 case Intrinsic::amdgcn_cvt_pk_i16:
10087 case Intrinsic::amdgcn_cvt_pk_u16: {
10089 EVT VT =
Op.getValueType();
10092 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10093 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10094 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10095 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10096 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10097 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10098 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10099 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10101 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10104 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10107 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10108 return DAG.
getNode(ISD::BITCAST,
DL, VT, Node);
10110 case Intrinsic::amdgcn_fmad_ftz:
10111 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10112 Op.getOperand(2),
Op.getOperand(3));
10114 case Intrinsic::amdgcn_if_break:
10116 Op->getOperand(1),
Op->getOperand(2)),
10119 case Intrinsic::amdgcn_groupstaticsize: {
10125 const GlobalValue *GV =
10131 case Intrinsic::amdgcn_is_shared:
10132 case Intrinsic::amdgcn_is_private: {
10135 DAG.
getNode(ISD::BITCAST,
DL, MVT::v2i32,
Op.getOperand(1));
10139 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10143 Subtarget->hasGloballyAddressableScratch()) {
10146 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10147 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10156 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10159 case Intrinsic::amdgcn_perm:
10160 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10161 Op.getOperand(2),
Op.getOperand(3));
10162 case Intrinsic::amdgcn_reloc_constant: {
10172 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10173 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10174 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10175 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10176 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10177 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10178 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10179 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10180 if (
Op.getOperand(4).getValueType() == MVT::i32)
10186 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10187 Op.getOperand(3), IndexKeyi32);
10189 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10190 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10191 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10192 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10193 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10194 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10195 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10196 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10197 if (
Op.getOperand(4).getValueType() == MVT::i64)
10203 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10204 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10205 Op.getOperand(6)});
10207 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10208 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10209 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10210 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10211 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10212 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10213 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10216 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10222 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10223 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10224 IndexKey, Op.getOperand(7),
10225 Op.getOperand(8)});
10227 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10228 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10229 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10230 if (
Op.getOperand(6).getValueType() == MVT::i32)
10236 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10237 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10238 IndexKeyi32, Op.getOperand(7)});
10240 case Intrinsic::amdgcn_addrspacecast_nonnull:
10241 return lowerADDRSPACECAST(
Op, DAG);
10242 case Intrinsic::amdgcn_readlane:
10243 case Intrinsic::amdgcn_readfirstlane:
10244 case Intrinsic::amdgcn_writelane:
10245 case Intrinsic::amdgcn_permlane16:
10246 case Intrinsic::amdgcn_permlanex16:
10247 case Intrinsic::amdgcn_permlane64:
10248 case Intrinsic::amdgcn_set_inactive:
10249 case Intrinsic::amdgcn_set_inactive_chain_arg:
10250 case Intrinsic::amdgcn_mov_dpp8:
10251 case Intrinsic::amdgcn_update_dpp:
10253 case Intrinsic::amdgcn_dead: {
10255 for (
const EVT ValTy :
Op.getNode()->values())
10260 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10262 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10273 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10279 unsigned NewOpcode)
const {
10283 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10284 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10302 M->getMemOperand());
10307 unsigned NewOpcode)
const {
10311 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10312 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10330 M->getMemOperand());
10335 unsigned IntrID =
Op.getConstantOperandVal(1);
10339 case Intrinsic::amdgcn_ds_ordered_add:
10340 case Intrinsic::amdgcn_ds_ordered_swap: {
10345 unsigned IndexOperand =
M->getConstantOperandVal(7);
10346 unsigned WaveRelease =
M->getConstantOperandVal(8);
10347 unsigned WaveDone =
M->getConstantOperandVal(9);
10349 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10350 IndexOperand &= ~0x3f;
10351 unsigned CountDw = 0;
10354 CountDw = (IndexOperand >> 24) & 0xf;
10355 IndexOperand &= ~(0xf << 24);
10357 if (CountDw < 1 || CountDw > 4) {
10360 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10361 DL.getDebugLoc()));
10366 if (IndexOperand) {
10369 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10372 if (WaveDone && !WaveRelease) {
10376 Fn,
"ds_ordered_count: wave_done requires wave_release",
10377 DL.getDebugLoc()));
10380 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10381 unsigned ShaderType =
10383 unsigned Offset0 = OrderedCountIndex << 2;
10384 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10387 Offset1 |= (CountDw - 1) << 6;
10390 Offset1 |= ShaderType << 2;
10392 unsigned Offset = Offset0 | (Offset1 << 8);
10399 M->getVTList(),
Ops,
M->getMemoryVT(),
10400 M->getMemOperand());
10402 case Intrinsic::amdgcn_raw_buffer_load:
10403 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10404 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10405 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10406 case Intrinsic::amdgcn_raw_buffer_load_format:
10407 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10408 const bool IsFormat =
10409 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10410 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10412 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10413 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10427 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10429 case Intrinsic::amdgcn_struct_buffer_load:
10430 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10431 case Intrinsic::amdgcn_struct_buffer_load_format:
10432 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10433 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10434 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10435 const bool IsFormat =
10436 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10437 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10439 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10440 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10455 case Intrinsic::amdgcn_raw_tbuffer_load:
10456 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10458 EVT LoadVT =
Op.getValueType();
10459 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10460 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10476 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10478 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10479 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10482 case Intrinsic::amdgcn_struct_tbuffer_load:
10483 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10485 EVT LoadVT =
Op.getValueType();
10486 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10487 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10503 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10505 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10506 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10509 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10510 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10511 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10512 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10513 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10514 return lowerStructBufferAtomicIntrin(
Op, DAG,
10515 AMDGPUISD::BUFFER_ATOMIC_FADD);
10516 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10517 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10518 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10519 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10520 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10521 return lowerStructBufferAtomicIntrin(
Op, DAG,
10522 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10523 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10524 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10525 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10526 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10527 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10528 return lowerStructBufferAtomicIntrin(
Op, DAG,
10529 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10530 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10531 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10532 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10533 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10534 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10535 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10536 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10537 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10538 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10539 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10540 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10541 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10542 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10543 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10544 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10545 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10546 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10547 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10548 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10549 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10550 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10551 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10552 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10553 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10554 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10555 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10556 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10557 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10558 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10559 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10560 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10561 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10562 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10563 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10564 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10565 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10566 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10567 return lowerRawBufferAtomicIntrin(
Op, DAG,
10568 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10569 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10570 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10571 return lowerStructBufferAtomicIntrin(
Op, DAG,
10572 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10573 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10574 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10575 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10576 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10577 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10578 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10579 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10580 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10581 return lowerStructBufferAtomicIntrin(
Op, DAG,
10582 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10583 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10584 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10585 return lowerStructBufferAtomicIntrin(
Op, DAG,
10586 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10587 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10588 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10589 return lowerStructBufferAtomicIntrin(
Op, DAG,
10590 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10591 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10592 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10593 return lowerStructBufferAtomicIntrin(
Op, DAG,
10594 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10595 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10596 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10597 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10598 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10599 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10600 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10601 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10602 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10603 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10604 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10605 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10606 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10607 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10608 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10609 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10610 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10611 return lowerStructBufferAtomicIntrin(
Op, DAG,
10612 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10614 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10615 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10616 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10617 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10631 EVT VT =
Op.getValueType();
10635 Op->getVTList(),
Ops, VT,
10636 M->getMemOperand());
10638 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10640 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10641 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10655 EVT VT =
Op.getValueType();
10659 Op->getVTList(),
Ops, VT,
10660 M->getMemOperand());
10662 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10663 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10665 SDValue NodePtr =
M->getOperand(2);
10666 SDValue RayExtent =
M->getOperand(3);
10667 SDValue InstanceMask =
M->getOperand(4);
10668 SDValue RayOrigin =
M->getOperand(5);
10669 SDValue RayDir =
M->getOperand(6);
10671 SDValue TDescr =
M->getOperand(8);
10676 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10681 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10682 const unsigned NumVDataDwords = 10;
10683 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10685 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10686 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10687 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10691 Ops.push_back(NodePtr);
10694 {DAG.getBitcast(MVT::i32, RayExtent),
10695 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10696 Ops.push_back(RayOrigin);
10697 Ops.push_back(RayDir);
10698 Ops.push_back(Offsets);
10699 Ops.push_back(TDescr);
10700 Ops.push_back(
M->getChain());
10703 MachineMemOperand *MemRef =
M->getMemOperand();
10707 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10709 SDValue NodePtr =
M->getOperand(2);
10710 SDValue RayExtent =
M->getOperand(3);
10711 SDValue RayOrigin =
M->getOperand(4);
10712 SDValue RayDir =
M->getOperand(5);
10713 SDValue RayInvDir =
M->getOperand(6);
10714 SDValue TDescr =
M->getOperand(7);
10721 if (!Subtarget->hasGFX10_AEncoding()) {
10731 const unsigned NumVDataDwords = 4;
10732 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10733 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10734 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10737 const unsigned BaseOpcodes[2][2] = {
10738 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10739 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10740 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10744 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10745 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10746 : AMDGPU::MIMGEncGfx10NSA,
10747 NumVDataDwords, NumVAddrDwords);
10751 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10752 : AMDGPU::MIMGEncGfx10Default,
10753 NumVDataDwords, NumVAddrDwords);
10759 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
10762 if (Lanes[0].getValueSizeInBits() == 32) {
10763 for (
unsigned I = 0;
I < 3; ++
I)
10770 Ops.push_back(Lanes[2]);
10782 if (UseNSA && IsGFX11Plus) {
10783 Ops.push_back(NodePtr);
10785 Ops.push_back(RayOrigin);
10790 for (
unsigned I = 0;
I < 3; ++
I) {
10793 {DirLanes[I], InvDirLanes[I]})));
10797 Ops.push_back(RayDir);
10798 Ops.push_back(RayInvDir);
10805 Ops.push_back(NodePtr);
10808 packLanes(RayOrigin,
true);
10809 packLanes(RayDir,
true);
10810 packLanes(RayInvDir,
false);
10815 if (NumVAddrDwords > 12) {
10817 Ops.append(16 -
Ops.size(), Undef);
10823 Ops.push_back(MergedOps);
10826 Ops.push_back(TDescr);
10828 Ops.push_back(
M->getChain());
10831 MachineMemOperand *MemRef =
M->getMemOperand();
10835 case Intrinsic::amdgcn_global_atomic_fmin_num:
10836 case Intrinsic::amdgcn_global_atomic_fmax_num:
10837 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10838 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10845 unsigned Opcode = 0;
10847 case Intrinsic::amdgcn_global_atomic_fmin_num:
10848 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10849 Opcode = ISD::ATOMIC_LOAD_FMIN;
10852 case Intrinsic::amdgcn_global_atomic_fmax_num:
10853 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10854 Opcode = ISD::ATOMIC_LOAD_FMAX;
10860 return DAG.
getAtomic(Opcode, SDLoc(
Op),
M->getMemoryVT(),
M->getVTList(),
10861 Ops,
M->getMemOperand());
10863 case Intrinsic::amdgcn_s_get_barrier_state:
10864 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10871 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10872 BarID = (BarID >> 4) & 0x3F;
10873 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10876 Ops.push_back(Chain);
10878 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10879 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10887 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
10895 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10896 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10897 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10901 EVT VT =
Op->getValueType(0);
10907 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10909 return lowerImage(
Op, ImageDimIntr, DAG,
true);
10917SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
10924 EVT VT = VTList.
VTs[0];
10927 bool IsTFE = VTList.
NumVTs == 3;
10930 unsigned NumOpDWords = NumValueDWords + 1;
10932 SDVTList OpDWordsVTList = DAG.
getVTList(OpDWordsVT, VTList.
VTs[2]);
10933 MachineMemOperand *OpDWordsMMO =
10935 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList,
Ops,
10936 OpDWordsVT, OpDWordsMMO, DAG);
10941 NumValueDWords == 1
10950 if (!Subtarget->hasDwordx3LoadStores() &&
10951 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10955 SDVTList WidenedVTList = DAG.
getVTList(WidenedVT, VTList.
VTs[1]);
10957 WidenedMemVT, WidenedMMO);
10967 bool ImageStore)
const {
10977 if (Subtarget->hasUnpackedD16VMem()) {
10991 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11002 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
11008 if ((NumElements % 2) == 1) {
11010 unsigned I = Elts.
size() / 2;
11026 if (NumElements == 3) {
11036 return DAG.
getNode(ISD::BITCAST,
DL, WidenedStoreVT, ZExt);
11047 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
11050 switch (IntrinsicID) {
11051 case Intrinsic::amdgcn_exp_compr: {
11052 if (!Subtarget->hasCompressedExport()) {
11055 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
11067 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src0),
11068 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32, Src1),
11077 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11081 case Intrinsic::amdgcn_struct_tbuffer_store:
11082 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11084 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11086 VData = handleD16VData(VData, DAG);
11087 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11088 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11102 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11103 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11106 M->getMemoryVT(),
M->getMemOperand());
11109 case Intrinsic::amdgcn_raw_tbuffer_store:
11110 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11112 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11114 VData = handleD16VData(VData, DAG);
11115 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11116 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11130 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11131 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11134 M->getMemoryVT(),
M->getMemOperand());
11137 case Intrinsic::amdgcn_raw_buffer_store:
11138 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11139 case Intrinsic::amdgcn_raw_buffer_store_format:
11140 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11141 const bool IsFormat =
11142 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11143 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11150 VData = handleD16VData(VData, DAG);
11160 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11161 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11175 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11176 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11181 return handleByteShortBufferStores(DAG, VDataVT,
DL,
Ops, M);
11184 M->getMemoryVT(),
M->getMemOperand());
11187 case Intrinsic::amdgcn_struct_buffer_store:
11188 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11189 case Intrinsic::amdgcn_struct_buffer_store_format:
11190 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11191 const bool IsFormat =
11192 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11193 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11201 VData = handleD16VData(VData, DAG);
11211 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11212 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11226 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11227 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11231 EVT VDataType = VData.getValueType().getScalarType();
11233 return handleByteShortBufferStores(DAG, VDataType,
DL,
Ops, M);
11236 M->getMemoryVT(),
M->getMemOperand());
11238 case Intrinsic::amdgcn_raw_buffer_load_lds:
11239 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11240 case Intrinsic::amdgcn_struct_buffer_load_lds:
11241 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11242 if (!Subtarget->hasVMemToLDSLoad())
11246 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11247 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11248 unsigned OpOffset = HasVIndex ? 1 : 0;
11249 SDValue VOffset =
Op.getOperand(5 + OpOffset);
11251 unsigned Size =
Op->getConstantOperandVal(4);
11257 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11258 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11259 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11260 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11263 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11264 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11265 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11266 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11269 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11270 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11271 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11272 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11275 if (!Subtarget->hasLDSLoadB96_B128())
11277 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11278 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11279 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11280 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11283 if (!Subtarget->hasLDSLoadB96_B128())
11285 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11286 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11287 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11288 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11296 if (HasVIndex && HasVOffset)
11300 else if (HasVIndex)
11301 Ops.push_back(
Op.getOperand(5));
11302 else if (HasVOffset)
11303 Ops.push_back(VOffset);
11305 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11306 Ops.push_back(Rsrc);
11307 Ops.push_back(
Op.getOperand(6 + OpOffset));
11308 Ops.push_back(
Op.getOperand(7 + OpOffset));
11310 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
11323 MachineMemOperand *LoadMMO =
M->getMemOperand();
11328 MachinePointerInfo StorePtrI = LoadPtrI;
11352 case Intrinsic::amdgcn_load_to_lds:
11353 case Intrinsic::amdgcn_global_load_lds: {
11354 if (!Subtarget->hasVMemToLDSLoad())
11358 unsigned Size =
Op->getConstantOperandVal(4);
11363 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11366 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11369 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11372 if (!Subtarget->hasLDSLoadB96_B128())
11374 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11377 if (!Subtarget->hasLDSLoadB96_B128())
11379 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11395 if (
LHS->isDivergent())
11399 RHS.getOperand(0).getValueType() == MVT::i32) {
11402 VOffset =
RHS.getOperand(0);
11406 Ops.push_back(Addr);
11414 Ops.push_back(VOffset);
11417 Ops.push_back(
Op.getOperand(5));
11419 unsigned Aux =
Op.getConstantOperandVal(6);
11427 MachineMemOperand *LoadMMO =
M->getMemOperand();
11429 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
11430 MachinePointerInfo StorePtrI = LoadPtrI;
11449 case Intrinsic::amdgcn_end_cf:
11451 Op->getOperand(2), Chain),
11453 case Intrinsic::amdgcn_s_barrier_init:
11454 case Intrinsic::amdgcn_s_barrier_signal_var: {
11461 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11462 ? AMDGPU::S_BARRIER_INIT_M0
11463 : AMDGPU::S_BARRIER_SIGNAL_M0;
11478 constexpr unsigned ShAmt = 16;
11485 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11490 case Intrinsic::amdgcn_s_barrier_join: {
11499 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11502 unsigned BarID = (BarVal >> 4) & 0x3F;
11505 Ops.push_back(Chain);
11507 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11517 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11523 case Intrinsic::amdgcn_s_prefetch_data: {
11526 return Op.getOperand(0);
11529 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11531 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
11538 Op->getVTList(),
Ops,
M->getMemoryVT(),
11539 M->getMemOperand());
11541 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11542 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11543 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11552 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11554 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11570 return PtrVT == MVT::i64;
11584std::pair<SDValue, SDValue>
11614 unsigned Overflow = ImmOffset & ~MaxImm;
11615 ImmOffset -= Overflow;
11616 if ((int32_t)Overflow < 0) {
11617 Overflow += ImmOffset;
11622 auto OverflowVal = DAG.
getConstant(Overflow,
DL, MVT::i32);
11641void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
11643 Align Alignment)
const {
11645 SDLoc
DL(CombinedOffset);
11647 uint32_t
Imm =
C->getZExtValue();
11648 uint32_t SOffset, ImmOffset;
11649 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11659 uint32_t SOffset, ImmOffset;
11662 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
11670 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11679SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
11682 return MaybePointer;
11696 SDValue NumRecords =
Op->getOperand(3);
11702 if (Subtarget->has45BitNumRecordsBufferResource()) {
11721 SDValue ExtShiftedStrideVec =
11724 DAG.
getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11731 DAG.
getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11733 DAG.
getNode(
ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11735 DAG.
getNode(
ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11740 auto [LowHalf, HighHalf] =
11741 DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11751 NumRecords, Flags);
11754 SDValue RsrcPtr = DAG.
getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11763 bool IsTFE)
const {
11768 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11769 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11772 SDVTList VTs = DAG.
getVTList(MVT::v2i32, MVT::Other);
11784 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11785 : AMDGPUISD::BUFFER_LOAD_USHORT;
11787 SDVTList ResList = DAG.
getVTList(MVT::i32, MVT::Other);
11791 LoadVal = DAG.
getNode(ISD::BITCAST,
DL, LoadVT, LoadVal);
11801 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11805 Ops[1] = BufferStoreExt;
11806 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11807 : AMDGPUISD::BUFFER_STORE_SHORT;
11810 M->getMemOperand());
11835 DAGCombinerInfo &DCI)
const {
11836 SelectionDAG &DAG = DCI.DAG;
11851 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
11858 "unexpected vector extload");
11871 "unexpected fp extload");
11889 DCI.AddToWorklist(Cvt.
getNode());
11894 DCI.AddToWorklist(Cvt.
getNode());
11897 Cvt = DAG.
getNode(ISD::BITCAST, SL, VT, Cvt);
11905 if (
Info.isEntryFunction())
11906 return Info.getUserSGPRInfo().hasFlatScratchInit();
11914 EVT MemVT =
Load->getMemoryVT();
11915 MachineMemOperand *MMO =
Load->getMemOperand();
11927 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11955 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
11956 "Custom lowering for non-i32 vectors hasn't been implemented.");
11959 unsigned AS =
Load->getAddressSpace();
11966 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
11970 !Subtarget->hasMultiDwordFlatScratchAddressing())
11980 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
11983 Alignment >=
Align(4) && NumElements < 32) {
11985 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11997 if (NumElements > 4)
12000 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12010 switch (Subtarget->getMaxPrivateElementSize()) {
12016 if (NumElements > 2)
12021 if (NumElements > 4)
12024 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12033 auto Flags =
Load->getMemOperand()->getFlags();
12035 Load->getAlign(), Flags, &
Fast) &&
12044 MemVT, *
Load->getMemOperand())) {
12053 EVT VT =
Op.getValueType();
12080 return DAG.
getNode(ISD::BITCAST,
DL, VT, Res);
12090 EVT VT =
Op.getValueType();
12091 const SDNodeFlags
Flags =
Op->getFlags();
12093 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
12099 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12102 if (CLHS->isExactlyValue(1.0)) {
12115 return DAG.
getNode(AMDGPUISD::RCP, SL, VT,
RHS);
12119 if (CLHS->isExactlyValue(-1.0)) {
12122 return DAG.
getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12128 if (!AllowInaccurateRcp &&
12129 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12143 EVT VT =
Op.getValueType();
12144 const SDNodeFlags
Flags =
Op->getFlags();
12146 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12147 if (!AllowInaccurateDiv)
12168 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12178 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12182 return DAG.
getNode(Opcode, SL, VTList,
12191 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12201 Opcode = AMDGPUISD::FMA_W_CHAIN;
12205 return DAG.
getNode(Opcode, SL, VTList,
12211 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12212 return FastLowered;
12215 EVT VT =
Op.getValueType();
12222 if (VT == MVT::bf16) {
12245 unsigned FMADOpCode =
12247 SDValue NegRHSExt = DAG.
getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12249 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
12252 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12254 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12255 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12261 Tmp = DAG.
getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12265 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
12271 SDNodeFlags
Flags =
Op->getFlags();
12278 const APFloat K0Val(0x1p+96f);
12281 const APFloat K1Val(0x1p-32f);
12308 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12309 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
12310 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12315 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12316 return FastLowered;
12322 SDNodeFlags
Flags =
Op->getFlags();
12323 Flags.setNoFPExcept(
true);
12331 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12340 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12342 DAG.
getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12344 using namespace AMDGPU::Hwreg;
12345 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12349 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12350 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12353 const bool HasDynamicDenormals =
12359 if (!PreservesDenormals) {
12364 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12367 if (HasDynamicDenormals) {
12371 SavedDenormMode =
SDValue(GetReg, 0);
12377 SDNode *EnableDenorm;
12378 if (Subtarget->hasDenormModeInst()) {
12379 const SDValue EnableDenormValue =
12382 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12386 const SDValue EnableDenormValue =
12388 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12389 {EnableDenormValue,
BitField, Glue});
12399 ApproxRcp, One, NegDivScale0, Flags);
12402 ApproxRcp, Fma0, Flags);
12408 NumeratorScaled,
Mul, Flags);
12414 NumeratorScaled, Fma3, Flags);
12416 if (!PreservesDenormals) {
12417 SDNode *DisableDenorm;
12418 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12422 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12424 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12428 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
12429 const SDValue DisableDenormValue =
12430 HasDynamicDenormals
12435 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12446 {Fma4, Fma1, Fma3, Scale},
Flags);
12448 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas,
RHS,
LHS, Flags);
12452 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
12453 return FastLowered;
12461 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
12465 SDValue NegDivScale0 = DAG.
getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12467 SDValue Rcp = DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12485 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12494 SDValue Scale0BC = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12495 SDValue Scale1BC = DAG.
getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12515 DAG.
getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
Mul, Scale);
12517 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas,
Y,
X);
12521 EVT VT =
Op.getValueType();
12523 if (VT == MVT::f32)
12524 return LowerFDIV32(
Op, DAG);
12526 if (VT == MVT::f64)
12527 return LowerFDIV64(
Op, DAG);
12529 if (VT == MVT::f16 || VT == MVT::bf16)
12530 return LowerFDIV16(
Op, DAG);
12539 EVT ResultExpVT =
Op->getValueType(1);
12540 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12550 if (Subtarget->hasFractBug()) {
12568 EVT VT =
Store->getMemoryVT();
12570 if (VT == MVT::i1) {
12574 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12578 Store->getValue().getValueType().getScalarType() == MVT::i32);
12580 unsigned AS =
Store->getAddressSpace();
12588 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12592 !Subtarget->hasMultiDwordFlatScratchAddressing())
12599 if (NumElements > 4)
12602 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12606 VT, *
Store->getMemOperand()))
12612 switch (Subtarget->getMaxPrivateElementSize()) {
12616 if (NumElements > 2)
12620 if (NumElements > 4 ||
12621 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12629 auto Flags =
Store->getMemOperand()->getFlags();
12648 assert(!Subtarget->has16BitInsts());
12649 SDNodeFlags
Flags =
Op->getFlags();
12651 DAG.
getNode(ISD::FP_EXTEND, SL, MVT::f32,
Op.getOperand(0), Flags);
12663 SDNodeFlags
Flags =
Op->getFlags();
12664 MVT VT =
Op.getValueType().getSimpleVT();
12694 SDValue SqrtSNextDown = DAG.
getNode(ISD::BITCAST,
DL, VT, SqrtSNextDownInt);
12697 DAG.
getNode(ISD::FNEG,
DL, VT, SqrtSNextDown, Flags);
12706 SDValue NegSqrtSNextUp = DAG.
getNode(ISD::FNEG,
DL, VT, SqrtSNextUp, Flags);
12772 SDNodeFlags
Flags =
Op->getFlags();
12818 SqrtRet = DAG.
getNode(ISD::FLDEXP,
DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12835 EVT VT =
Op.getValueType();
12845 if (Subtarget->hasTrigReducedRange()) {
12847 TrigVal = DAG.
getNode(AMDGPUISD::FRACT,
DL, VT, MulVal, Flags);
12852 switch (
Op.getOpcode()) {
12854 return DAG.
getNode(AMDGPUISD::COS_HW, SDLoc(
Op), VT, TrigVal, Flags);
12856 return DAG.
getNode(AMDGPUISD::SIN_HW, SDLoc(
Op), VT, TrigVal, Flags);
12879 EVT VT =
Op.getValueType();
12887 Op->getVTList(),
Ops, VT,
12896SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
12897 DAGCombinerInfo &DCI)
const {
12898 EVT VT =
N->getValueType(0);
12900 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12903 SelectionDAG &DAG = DCI.DAG;
12907 EVT SrcVT = Src.getValueType();
12913 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12916 DCI.AddToWorklist(Cvt.
getNode());
12919 if (ScalarVT != MVT::f32) {
12931 DAGCombinerInfo &DCI)
const {
12938 if (SignOp.
getOpcode() == ISD::FP_EXTEND ||
12942 SelectionDAG &DAG = DCI.DAG;
12961 for (
unsigned I = 0;
I != NumElts; ++
I) {
12985 if (NewElts.
size() == 1)
13007 for (
unsigned I = 0;
I != NumElts; ++
I) {
13042SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
13044 DAGCombinerInfo &DCI)
const {
13061 SelectionDAG &DAG = DCI.DAG;
13074 AM.BaseOffs =
Offset.getSExtValue();
13079 EVT VT =
N->getValueType(0);
13085 Flags.setNoUnsignedWrap(
13086 N->getFlags().hasNoUnsignedWrap() &&
13098 switch (
N->getOpcode()) {
13109 DAGCombinerInfo &DCI)
const {
13110 SelectionDAG &DAG = DCI.DAG;
13117 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
13118 N->getMemoryVT(), DCI);
13122 NewOps[PtrIdx] = NewPtr;
13131 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13132 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13141SDValue SITargetLowering::splitBinaryBitConstantOp(
13145 uint32_t ValLo =
Lo_32(Val);
13146 uint32_t ValHi =
Hi_32(Val);
13153 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13167 if (V.getValueType() != MVT::i1)
13169 switch (V.getOpcode()) {
13174 case AMDGPUISD::FP_CLASS:
13186 return V.getResNo() == 1;
13188 unsigned IntrinsicID = V.getConstantOperandVal(0);
13189 switch (IntrinsicID) {
13190 case Intrinsic::amdgcn_is_shared:
13191 case Intrinsic::amdgcn_is_private:
13208 if (!(
C & 0x000000ff))
13209 ZeroByteMask |= 0x000000ff;
13210 if (!(
C & 0x0000ff00))
13211 ZeroByteMask |= 0x0000ff00;
13212 if (!(
C & 0x00ff0000))
13213 ZeroByteMask |= 0x00ff0000;
13214 if (!(
C & 0xff000000))
13215 ZeroByteMask |= 0xff000000;
13216 uint32_t NonZeroByteMask = ~ZeroByteMask;
13217 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13230 assert(V.getValueSizeInBits() == 32);
13232 if (V.getNumOperands() != 2)
13241 switch (V.getOpcode()) {
13246 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13251 return (0x03020100 & ~ConstMask) | ConstMask;
13258 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13264 return uint32_t(0x0c0c0c0c03020100ull >>
C);
13271 DAGCombinerInfo &DCI)
const {
13272 if (DCI.isBeforeLegalize())
13275 SelectionDAG &DAG = DCI.DAG;
13276 EVT VT =
N->getValueType(0);
13281 if (VT == MVT::i64 && CRHS) {
13283 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13287 if (CRHS && VT == MVT::i32) {
13297 unsigned Shift = CShift->getZExtValue();
13299 unsigned Offset = NB + Shift;
13300 if ((
Offset & (Bits - 1)) == 0) {
13303 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
13324 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13326 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13339 if (
Y.getOpcode() != ISD::FABS ||
Y.getOperand(0) !=
X ||
13344 if (
X !=
LHS.getOperand(1))
13348 const ConstantFPSDNode *C1 =
13365 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
13371 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13374 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13382 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13383 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13385 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13386 :
Mask->getZExtValue() & OrdMask;
13389 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
13407 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13410 if (LHSMask != ~0u && RHSMask != ~0u) {
13413 if (LHSMask > RHSMask) {
13420 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13421 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13424 if (!(LHSUsedLanes & RHSUsedLanes) &&
13427 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13433 uint32_t
Mask = LHSMask & RHSMask;
13434 for (
unsigned I = 0;
I < 32;
I += 8) {
13435 uint32_t ByteSel = 0xff <<
I;
13436 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13437 Mask &= (0x0c <<
I) & 0xffffffff;
13442 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
13445 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13495static const std::optional<ByteProvider<SDValue>>
13497 unsigned Depth = 0) {
13500 return std::nullopt;
13502 if (
Op.getValueSizeInBits() < 8)
13503 return std::nullopt;
13505 if (
Op.getValueType().isVector())
13508 switch (
Op->getOpcode()) {
13520 NarrowVT = VTSign->getVT();
13523 return std::nullopt;
13526 if (SrcIndex >= NarrowByteWidth)
13527 return std::nullopt;
13535 return std::nullopt;
13537 uint64_t BitShift = ShiftOp->getZExtValue();
13539 if (BitShift % 8 != 0)
13540 return std::nullopt;
13542 SrcIndex += BitShift / 8;
13560static const std::optional<ByteProvider<SDValue>>
13562 unsigned StartingIndex = 0) {
13566 return std::nullopt;
13568 unsigned BitWidth =
Op.getScalarValueSizeInBits();
13570 return std::nullopt;
13572 return std::nullopt;
13574 bool IsVec =
Op.getValueType().isVector();
13575 switch (
Op.getOpcode()) {
13578 return std::nullopt;
13583 return std::nullopt;
13587 return std::nullopt;
13590 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
13591 return std::nullopt;
13592 if (!
LHS ||
LHS->isConstantZero())
13594 if (!
RHS ||
RHS->isConstantZero())
13596 return std::nullopt;
13601 return std::nullopt;
13605 return std::nullopt;
13607 uint32_t BitMask = BitMaskOp->getZExtValue();
13609 uint32_t IndexMask = 0xFF << (Index * 8);
13611 if ((IndexMask & BitMask) != IndexMask) {
13614 if (IndexMask & BitMask)
13615 return std::nullopt;
13624 return std::nullopt;
13628 if (!ShiftOp ||
Op.getValueType().isVector())
13629 return std::nullopt;
13631 uint64_t BitsProvided =
Op.getValueSizeInBits();
13632 if (BitsProvided % 8 != 0)
13633 return std::nullopt;
13635 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13637 return std::nullopt;
13639 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13640 uint64_t ByteShift = BitShift / 8;
13642 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13643 uint64_t BytesProvided = BitsProvided / 8;
13644 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13645 NewIndex %= BytesProvided;
13652 return std::nullopt;
13656 return std::nullopt;
13658 uint64_t BitShift = ShiftOp->getZExtValue();
13660 return std::nullopt;
13662 auto BitsProvided =
Op.getScalarValueSizeInBits();
13663 if (BitsProvided % 8 != 0)
13664 return std::nullopt;
13666 uint64_t BytesProvided = BitsProvided / 8;
13667 uint64_t ByteShift = BitShift / 8;
13672 return BytesProvided - ByteShift > Index
13680 return std::nullopt;
13684 return std::nullopt;
13686 uint64_t BitShift = ShiftOp->getZExtValue();
13687 if (BitShift % 8 != 0)
13688 return std::nullopt;
13689 uint64_t ByteShift = BitShift / 8;
13695 return Index < ByteShift
13698 Depth + 1, StartingIndex);
13707 return std::nullopt;
13715 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13717 if (NarrowBitWidth % 8 != 0)
13718 return std::nullopt;
13719 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13721 if (Index >= NarrowByteWidth)
13723 ? std::optional<ByteProvider<SDValue>>(
13731 return std::nullopt;
13735 if (NarrowByteWidth >= Index) {
13740 return std::nullopt;
13747 return std::nullopt;
13753 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13754 if (NarrowBitWidth % 8 != 0)
13755 return std::nullopt;
13756 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13761 if (Index >= NarrowByteWidth) {
13763 ? std::optional<ByteProvider<SDValue>>(
13768 if (NarrowByteWidth > Index) {
13772 return std::nullopt;
13777 return std::nullopt;
13780 Depth + 1, StartingIndex);
13786 return std::nullopt;
13787 auto VecIdx = IdxOp->getZExtValue();
13788 auto ScalarSize =
Op.getScalarValueSizeInBits();
13789 if (ScalarSize < 32)
13790 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13792 StartingIndex, Index);
13795 case AMDGPUISD::PERM: {
13797 return std::nullopt;
13801 return std::nullopt;
13804 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13805 if (IdxMask > 0x07 && IdxMask != 0x0c)
13806 return std::nullopt;
13808 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13809 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13811 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13817 return std::nullopt;
13832 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13839 auto MemVT = L->getMemoryVT();
13842 return L->getMemoryVT().getSizeInBits() == 16;
13852 int Low8 = Mask & 0xff;
13853 int Hi8 = (Mask & 0xff00) >> 8;
13855 assert(Low8 < 8 && Hi8 < 8);
13857 bool IsConsecutive = (Hi8 - Low8 == 1);
13862 bool Is16Aligned = !(Low8 % 2);
13864 return IsConsecutive && Is16Aligned;
13872 int Low16 = PermMask & 0xffff;
13873 int Hi16 = (PermMask & 0xffff0000) >> 16;
13883 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13885 if (!OtherOpIs16Bit)
13893 unsigned DWordOffset) {
13898 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13903 if (Src.getValueType().isVector()) {
13904 auto ScalarTySize = Src.getScalarValueSizeInBits();
13905 auto ScalarTy = Src.getValueType().getScalarType();
13906 if (ScalarTySize == 32) {
13910 if (ScalarTySize > 32) {
13913 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13914 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13921 assert(ScalarTySize < 32);
13922 auto NumElements =
TypeSize / ScalarTySize;
13923 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13924 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13925 auto NumElementsIn32 = 32 / ScalarTySize;
13926 auto NumAvailElements = DWordOffset < Trunc32Elements
13928 : NumElements - NormalizedTrunc;
13941 auto ShiftVal = 32 * DWordOffset;
13949 [[maybe_unused]]
EVT VT =
N->getValueType(0);
13954 for (
int i = 0; i < 4; i++) {
13956 std::optional<ByteProvider<SDValue>>
P =
13959 if (!
P ||
P->isConstantZero())
13964 if (PermNodes.
size() != 4)
13967 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13968 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13970 for (
size_t i = 0; i < PermNodes.
size(); i++) {
13971 auto PermOp = PermNodes[i];
13974 int SrcByteAdjust = 4;
13978 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13979 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13981 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13982 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13986 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13987 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13990 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13992 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13995 SDValue Op = *PermNodes[FirstSrc.first].Src;
13997 assert(
Op.getValueSizeInBits() == 32);
14001 int Low16 = PermMask & 0xffff;
14002 int Hi16 = (PermMask & 0xffff0000) >> 16;
14004 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14005 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14008 if (WellFormedLow && WellFormedHi)
14012 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
14021 assert(
Op.getValueType().isByteSized() &&
14032 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
14039 DAGCombinerInfo &DCI)
const {
14040 SelectionDAG &DAG = DCI.DAG;
14044 EVT VT =
N->getValueType(0);
14045 if (VT == MVT::i1) {
14047 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14048 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14050 if (Src !=
RHS.getOperand(0))
14055 if (!CLHS || !CRHS)
14059 static const uint32_t MaxMask = 0x3ff;
14064 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
14073 LHS.getOpcode() == AMDGPUISD::PERM &&
14079 Sel |=
LHS.getConstantOperandVal(2);
14081 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14088 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14092 auto usesCombinedOperand = [](SDNode *OrUse) {
14094 if (OrUse->getOpcode() != ISD::BITCAST ||
14095 !OrUse->getValueType(0).isVector())
14099 for (
auto *VUser : OrUse->users()) {
14100 if (!VUser->getValueType(0).isVector())
14107 if (VUser->getOpcode() == VectorwiseOp)
14113 if (!
any_of(
N->users(), usesCombinedOperand))
14119 if (LHSMask != ~0u && RHSMask != ~0u) {
14122 if (LHSMask > RHSMask) {
14129 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14130 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14133 if (!(LHSUsedLanes & RHSUsedLanes) &&
14136 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14138 LHSMask &= ~RHSUsedLanes;
14139 RHSMask &= ~LHSUsedLanes;
14141 LHSMask |= LHSUsedLanes & 0x04040404;
14143 uint32_t Sel = LHSMask | RHSMask;
14146 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14151 if (LHSMask == ~0u || RHSMask == ~0u) {
14192 return IdentitySrc;
14198 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14213 if (SrcVT == MVT::i32) {
14218 DCI.AddToWorklist(LowOr.
getNode());
14219 DCI.AddToWorklist(HiBits.getNode());
14223 return DAG.
getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14230 N->getOperand(0), CRHS))
14238 DAGCombinerInfo &DCI)
const {
14239 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14246 SelectionDAG &DAG = DCI.DAG;
14248 EVT VT =
N->getValueType(0);
14249 if (CRHS && VT == MVT::i64) {
14251 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14258 unsigned Opc =
LHS.getOpcode();
14282 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(1));
14284 DAG.
getNode(ISD::BITCAST,
DL, MVT::f32,
LHS->getOperand(2));
14288 LHS->getOperand(0), FNegLHS, FNegRHS);
14289 return DAG.
getNode(ISD::BITCAST,
DL, VT, NewSelect);
14297 DAGCombinerInfo &DCI)
const {
14298 if (!Subtarget->has16BitInsts() ||
14302 EVT VT =
N->getValueType(0);
14303 if (VT != MVT::i32)
14307 if (Src.getValueType() != MVT::i16)
14314SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14315 DAGCombinerInfo &DCI)
const {
14321 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14322 VTSign->getVT() == MVT::i8) ||
14323 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14324 VTSign->getVT() == MVT::i16))) {
14325 assert(Subtarget->hasScalarSubwordLoads() &&
14326 "s_buffer_load_{u8, i8} are supported "
14327 "in GFX12 (or newer) architectures.");
14328 EVT VT = Src.getValueType();
14329 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14330 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14331 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14333 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14340 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14341 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14345 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14346 VTSign->getVT() == MVT::i8) ||
14347 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14348 VTSign->getVT() == MVT::i16)) &&
14357 Src.getOperand(6), Src.getOperand(7)};
14360 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14361 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14362 ? AMDGPUISD::BUFFER_LOAD_BYTE
14363 : AMDGPUISD::BUFFER_LOAD_SHORT;
14364 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14365 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14366 return DCI.DAG.getMergeValues(
14367 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14373 DAGCombinerInfo &DCI)
const {
14374 SelectionDAG &DAG = DCI.DAG;
14381 if (
N->getOperand(0).isUndef())
14388 DAGCombinerInfo &DCI)
const {
14389 EVT VT =
N->getValueType(0);
14399 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(
N), VT, N0,
14404 if ((VT == MVT::f16 && N0.
getOpcode() == ISD::FSQRT) &&
14406 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
14414 unsigned MaxDepth)
const {
14415 unsigned Opcode =
Op.getOpcode();
14420 const auto &
F = CFP->getValueAPF();
14421 if (
F.isNaN() &&
F.isSignaling())
14423 if (!
F.isDenormal())
14449 case ISD::FP_EXTEND:
14450 case ISD::FP16_TO_FP:
14451 case ISD::FP_TO_FP16:
14452 case ISD::BF16_TO_FP:
14453 case ISD::FP_TO_BF16:
14455 case AMDGPUISD::FMUL_LEGACY:
14456 case AMDGPUISD::FMAD_FTZ:
14457 case AMDGPUISD::RCP:
14458 case AMDGPUISD::RSQ:
14459 case AMDGPUISD::RSQ_CLAMP:
14460 case AMDGPUISD::RCP_LEGACY:
14461 case AMDGPUISD::RCP_IFLAG:
14462 case AMDGPUISD::LOG:
14463 case AMDGPUISD::EXP:
14464 case AMDGPUISD::DIV_SCALE:
14465 case AMDGPUISD::DIV_FMAS:
14466 case AMDGPUISD::DIV_FIXUP:
14467 case AMDGPUISD::FRACT:
14468 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14469 case AMDGPUISD::CVT_F32_UBYTE0:
14470 case AMDGPUISD::CVT_F32_UBYTE1:
14471 case AMDGPUISD::CVT_F32_UBYTE2:
14472 case AMDGPUISD::CVT_F32_UBYTE3:
14473 case AMDGPUISD::FP_TO_FP16:
14474 case AMDGPUISD::SIN_HW:
14475 case AMDGPUISD::COS_HW:
14486 if (
Op.getValueType() == MVT::i32) {
14492 if (RHS->getZExtValue() == 0xffff0000) {
14502 return Op.getValueType().getScalarType() != MVT::f16;
14506 case ISD::FMINNUM_IEEE:
14507 case ISD::FMAXNUM_IEEE:
14508 case ISD::FMINIMUM:
14509 case ISD::FMAXIMUM:
14510 case ISD::FMINIMUMNUM:
14511 case ISD::FMAXIMUMNUM:
14512 case AMDGPUISD::CLAMP:
14513 case AMDGPUISD::FMED3:
14514 case AMDGPUISD::FMAX3:
14515 case AMDGPUISD::FMIN3:
14516 case AMDGPUISD::FMAXIMUM3:
14517 case AMDGPUISD::FMINIMUM3: {
14523 if (Subtarget->supportsMinMaxDenormModes() ||
14533 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14545 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14572 if (
Op.getValueType() == MVT::i16) {
14575 TruncSrc.
getOpcode() == ISD::BITCAST &&
14583 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14585 switch (IntrinsicID) {
14586 case Intrinsic::amdgcn_cvt_pkrtz:
14587 case Intrinsic::amdgcn_cubeid:
14588 case Intrinsic::amdgcn_frexp_mant:
14589 case Intrinsic::amdgcn_fdot2:
14590 case Intrinsic::amdgcn_rcp:
14591 case Intrinsic::amdgcn_rsq:
14592 case Intrinsic::amdgcn_rsq_clamp:
14593 case Intrinsic::amdgcn_rcp_legacy:
14594 case Intrinsic::amdgcn_rsq_legacy:
14595 case Intrinsic::amdgcn_trig_preop:
14596 case Intrinsic::amdgcn_tanh:
14597 case Intrinsic::amdgcn_log:
14598 case Intrinsic::amdgcn_exp2:
14599 case Intrinsic::amdgcn_sqrt:
14617 unsigned MaxDepth)
const {
14620 unsigned Opcode =
MI->getOpcode();
14622 if (Opcode == AMDGPU::G_FCANONICALIZE)
14625 std::optional<FPValueAndVReg> FCR;
14628 if (FCR->Value.isSignaling())
14630 if (!FCR->Value.isDenormal())
14641 case AMDGPU::G_FADD:
14642 case AMDGPU::G_FSUB:
14643 case AMDGPU::G_FMUL:
14644 case AMDGPU::G_FCEIL:
14645 case AMDGPU::G_FFLOOR:
14646 case AMDGPU::G_FRINT:
14647 case AMDGPU::G_FNEARBYINT:
14648 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14649 case AMDGPU::G_INTRINSIC_TRUNC:
14650 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14651 case AMDGPU::G_FMA:
14652 case AMDGPU::G_FMAD:
14653 case AMDGPU::G_FSQRT:
14654 case AMDGPU::G_FDIV:
14655 case AMDGPU::G_FREM:
14656 case AMDGPU::G_FPOW:
14657 case AMDGPU::G_FPEXT:
14658 case AMDGPU::G_FLOG:
14659 case AMDGPU::G_FLOG2:
14660 case AMDGPU::G_FLOG10:
14661 case AMDGPU::G_FPTRUNC:
14662 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14663 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14664 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14665 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14666 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14668 case AMDGPU::G_FNEG:
14669 case AMDGPU::G_FABS:
14670 case AMDGPU::G_FCOPYSIGN:
14672 case AMDGPU::G_FMINNUM:
14673 case AMDGPU::G_FMAXNUM:
14674 case AMDGPU::G_FMINNUM_IEEE:
14675 case AMDGPU::G_FMAXNUM_IEEE:
14676 case AMDGPU::G_FMINIMUM:
14677 case AMDGPU::G_FMAXIMUM:
14678 case AMDGPU::G_FMINIMUMNUM:
14679 case AMDGPU::G_FMAXIMUMNUM: {
14680 if (Subtarget->supportsMinMaxDenormModes() ||
14687 case AMDGPU::G_BUILD_VECTOR:
14692 case AMDGPU::G_INTRINSIC:
14693 case AMDGPU::G_INTRINSIC_CONVERGENT:
14695 case Intrinsic::amdgcn_fmul_legacy:
14696 case Intrinsic::amdgcn_fmad_ftz:
14697 case Intrinsic::amdgcn_sqrt:
14698 case Intrinsic::amdgcn_fmed3:
14699 case Intrinsic::amdgcn_sin:
14700 case Intrinsic::amdgcn_cos:
14701 case Intrinsic::amdgcn_log:
14702 case Intrinsic::amdgcn_exp2:
14703 case Intrinsic::amdgcn_log_clamp:
14704 case Intrinsic::amdgcn_rcp:
14705 case Intrinsic::amdgcn_rcp_legacy:
14706 case Intrinsic::amdgcn_rsq:
14707 case Intrinsic::amdgcn_rsq_clamp:
14708 case Intrinsic::amdgcn_rsq_legacy:
14709 case Intrinsic::amdgcn_div_scale:
14710 case Intrinsic::amdgcn_div_fmas:
14711 case Intrinsic::amdgcn_div_fixup:
14712 case Intrinsic::amdgcn_fract:
14713 case Intrinsic::amdgcn_cvt_pkrtz:
14714 case Intrinsic::amdgcn_cubeid:
14715 case Intrinsic::amdgcn_cubema:
14716 case Intrinsic::amdgcn_cubesc:
14717 case Intrinsic::amdgcn_cubetc:
14718 case Intrinsic::amdgcn_frexp_mant:
14719 case Intrinsic::amdgcn_fdot2:
14720 case Intrinsic::amdgcn_trig_preop:
14721 case Intrinsic::amdgcn_tanh:
14740 if (
C.isDenormal()) {
14754 if (
C.isSignaling()) {
14777SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14778 DAGCombinerInfo &DCI)
const {
14779 SelectionDAG &DAG = DCI.DAG;
14781 EVT VT =
N->getValueType(0);
14790 EVT VT =
N->getValueType(0);
14791 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14807 EVT EltVT =
Lo.getValueType();
14810 for (
unsigned I = 0;
I != 2; ++
I) {
14814 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14815 }
else if (
Op.isUndef()) {
14849 case ISD::FMAXNUM_IEEE:
14850 case ISD::FMAXIMUMNUM:
14851 return AMDGPUISD::FMAX3;
14852 case ISD::FMAXIMUM:
14853 return AMDGPUISD::FMAXIMUM3;
14855 return AMDGPUISD::SMAX3;
14857 return AMDGPUISD::UMAX3;
14859 case ISD::FMINNUM_IEEE:
14860 case ISD::FMINIMUMNUM:
14861 return AMDGPUISD::FMIN3;
14862 case ISD::FMINIMUM:
14863 return AMDGPUISD::FMINIMUM3;
14865 return AMDGPUISD::SMIN3;
14867 return AMDGPUISD::UMIN3;
14888 if (!MinK || !MaxK)
14900 unsigned Med3Opc =
Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14901 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14902 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14961 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
14967 if (
Info->getMode().DX10Clamp) {
14976 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15004 case ISD::FMINNUM_IEEE:
15005 case ISD::FMAXNUM_IEEE:
15006 case ISD::FMINIMUMNUM:
15007 case ISD::FMAXIMUMNUM:
15008 case AMDGPUISD::FMIN_LEGACY:
15009 case AMDGPUISD::FMAX_LEGACY:
15010 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
15012 case ISD::FMINIMUM:
15013 case ISD::FMAXIMUM:
15021 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
15030 DAGCombinerInfo &DCI)
const {
15031 SelectionDAG &DAG = DCI.DAG;
15063 if (
SDValue Med3 = performIntMed3ImmCombine(
15068 if (
SDValue Med3 = performIntMed3ImmCombine(
15074 if (
SDValue Med3 = performIntMed3ImmCombine(
15079 if (
SDValue Med3 = performIntMed3ImmCombine(
15089 if (((
Opc == ISD::FMINNUM && Op0.
getOpcode() == ISD::FMAXNUM) ||
15090 (
Opc == ISD::FMINNUM_IEEE && Op0.
getOpcode() == ISD::FMAXNUM_IEEE) ||
15091 (
Opc == ISD::FMINIMUMNUM && Op0.
getOpcode() == ISD::FMAXIMUMNUM) ||
15092 (
Opc == AMDGPUISD::FMIN_LEGACY &&
15093 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15094 (VT == MVT::f32 || VT == MVT::f64 ||
15095 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15096 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15097 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15098 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15100 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
15107 const SDNodeFlags
Flags =
N->getFlags();
15108 if ((
Opc == ISD::FMINIMUM ||
Opc == ISD::FMAXIMUM) &&
15109 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
15111 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15112 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15122 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15123 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15132 DAGCombinerInfo &DCI)
const {
15133 EVT VT =
N->getValueType(0);
15137 SelectionDAG &DAG = DCI.DAG;
15148 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15152 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15156 if (
Info->getMode().DX10Clamp) {
15169 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15176 DAGCombinerInfo &DCI)
const {
15180 return DCI.DAG.getUNDEF(
N->getValueType(0));
15188 bool IsDivergentIdx,
15193 unsigned VecSize = EltSize * NumElem;
15196 if (VecSize <= 64 && EltSize < 32)
15205 if (IsDivergentIdx)
15209 unsigned NumInsts = NumElem +
15210 ((EltSize + 31) / 32) * NumElem ;
15214 if (Subtarget->useVGPRIndexMode())
15215 return NumInsts <= 16;
15219 if (Subtarget->hasMovrel())
15220 return NumInsts <= 15;
15226 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15241SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15242 DAGCombinerInfo &DCI)
const {
15248 EVT ResVT =
N->getValueType(0);
15272 if (!
C ||
C->getZExtValue() != 0x1f)
15288 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15307 case ISD::FMAXNUM_IEEE:
15308 case ISD::FMINNUM_IEEE:
15309 case ISD::FMAXIMUM:
15310 case ISD::FMINIMUM: {
15316 DCI.AddToWorklist(Elt0.
getNode());
15317 DCI.AddToWorklist(Elt1.
getNode());
15339 if (!DCI.isBeforeLegalize())
15347 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15350 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15351 unsigned EltIdx = BitIndex / 32;
15352 unsigned LeftoverBitIdx = BitIndex % 32;
15356 DCI.AddToWorklist(Cast.
getNode());
15360 DCI.AddToWorklist(Elt.
getNode());
15363 DCI.AddToWorklist(Srl.
getNode());
15367 DCI.AddToWorklist(Trunc.
getNode());
15369 if (VecEltVT == ResVT) {
15370 return DAG.
getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15381SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15382 DAGCombinerInfo &DCI)
const {
15393 SelectionDAG &DAG = DCI.DAG;
15412 if (Src.getOpcode() == ISD::FP_EXTEND &&
15413 Src.getOperand(0).getValueType() == MVT::f16) {
15414 return Src.getOperand(0);
15418 APFloat Val = CFP->getValueAPF();
15419 bool LosesInfo =
true;
15429 DAGCombinerInfo &DCI)
const {
15430 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15431 "combine only useful on gfx8");
15433 SDValue TruncSrc =
N->getOperand(0);
15434 EVT VT =
N->getValueType(0);
15435 if (VT != MVT::f16)
15438 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
15442 SelectionDAG &DAG = DCI.DAG;
15470 return DAG.
getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15473unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15475 const SDNode *N1)
const {
15480 if (((VT == MVT::f32 &&
15482 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15502 EVT VT =
N->getValueType(0);
15503 if (VT != MVT::i32 && VT != MVT::i64)
15509 unsigned Opc =
N->getOpcode();
15564 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
15583 DAGCombinerInfo &DCI)
const {
15586 SelectionDAG &DAG = DCI.DAG;
15587 EVT VT =
N->getValueType(0);
15597 if (!
N->isDivergent() && Subtarget->hasSMulHi())
15601 if (NumBits <= 32 || NumBits > 64)
15612 if (!Subtarget->hasFullRate64Ops()) {
15613 unsigned NumUsers = 0;
15614 for (SDNode *User :
LHS->
users()) {
15617 if (!
User->isAnyAdd())
15641 bool MulSignedLo =
false;
15642 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15651 if (VT != MVT::i64) {
15674 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15676 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15677 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15679 if (!MulLHSUnsigned32) {
15686 if (!MulRHSUnsigned32) {
15697 if (VT != MVT::i64)
15703SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
15704 DAGCombinerInfo &DCI)
const {
15714 SelectionDAG &DAG = DCI.DAG;
15729 unsigned Opcode =
N->getOpcode();
15730 if (Opcode == ISD::PTRADD)
15733 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
15744static std::optional<ByteProvider<SDValue>>
15747 if (!Byte0 || Byte0->isConstantZero()) {
15748 return std::nullopt;
15751 if (Byte1 && !Byte1->isConstantZero()) {
15752 return std::nullopt;
15758 unsigned FirstCs =
First & 0x0c0c0c0c;
15759 unsigned SecondCs = Second & 0x0c0c0c0c;
15760 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
15761 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15763 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15764 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15765 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15766 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15768 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15792 for (
int BPI = 0; BPI < 2; BPI++) {
15795 BPP = {Src1, Src0};
15797 unsigned ZeroMask = 0x0c0c0c0c;
15798 unsigned FMask = 0xFF << (8 * (3 - Step));
15800 unsigned FirstMask =
15801 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15802 unsigned SecondMask =
15803 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15807 int FirstGroup = -1;
15808 for (
int I = 0;
I < 2;
I++) {
15810 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15811 return IterElt.SrcOp == *BPP.first.Src &&
15812 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15816 if (Match != Srcs.
end()) {
15817 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15822 if (FirstGroup != -1) {
15824 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15825 return IterElt.SrcOp == *BPP.second.Src &&
15826 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15829 if (Match != Srcs.
end()) {
15830 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15832 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15840 unsigned ZeroMask = 0x0c0c0c0c;
15841 unsigned FMask = 0xFF << (8 * (3 - Step));
15845 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15849 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15858 if (Srcs.
size() == 1) {
15859 auto *Elt = Srcs.
begin();
15863 if (Elt->PermMask == 0x3020100)
15866 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15870 auto *FirstElt = Srcs.
begin();
15871 auto *SecondElt = std::next(FirstElt);
15878 auto FirstMask = FirstElt->PermMask;
15879 auto SecondMask = SecondElt->PermMask;
15881 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15882 unsigned FirstPlusFour = FirstMask | 0x04040404;
15885 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15897 FirstElt = std::next(SecondElt);
15898 if (FirstElt == Srcs.
end())
15901 SecondElt = std::next(FirstElt);
15904 if (SecondElt == Srcs.
end()) {
15909 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15910 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
15916 return Perms.
size() == 2
15922 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15923 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15924 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15925 EntryMask += ZeroMask;
15930 auto Opcode =
Op.getOpcode();
15932 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15933 Opcode == AMDGPUISD::MUL_I24);
15936static std::optional<bool>
15947 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15950 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15952 assert(!(S0IsUnsigned && S0IsSigned));
15953 assert(!(S1IsUnsigned && S1IsSigned));
15961 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15967 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15968 return std::nullopt;
15980 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15981 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15986 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15992 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15993 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15994 return std::nullopt;
16000 DAGCombinerInfo &DCI)
const {
16001 SelectionDAG &DAG = DCI.DAG;
16002 EVT VT =
N->getValueType(0);
16008 if (Subtarget->hasMad64_32()) {
16009 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16014 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
16018 if (VT == MVT::i64) {
16019 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16024 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16026 std::optional<bool> IsSigned;
16032 int ChainLength = 0;
16033 for (
int I = 0;
I < 4;
I++) {
16037 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16040 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16045 TempNode->getOperand(MulIdx), *Src0, *Src1,
16046 TempNode->getOperand(MulIdx)->getOperand(0),
16047 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16051 IsSigned = *IterIsSigned;
16052 if (*IterIsSigned != *IsSigned)
16055 auto AddIdx = 1 - MulIdx;
16058 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
16059 Src2s.
push_back(TempNode->getOperand(AddIdx));
16069 TempNode->getOperand(AddIdx), *Src0, *Src1,
16070 TempNode->getOperand(AddIdx)->getOperand(0),
16071 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16075 if (*IterIsSigned != *IsSigned)
16079 ChainLength =
I + 2;
16083 TempNode = TempNode->getOperand(AddIdx);
16085 ChainLength =
I + 1;
16086 if (TempNode->getNumOperands() < 2)
16088 LHS = TempNode->getOperand(0);
16089 RHS = TempNode->getOperand(1);
16092 if (ChainLength < 2)
16098 if (ChainLength < 4) {
16108 bool UseOriginalSrc =
false;
16109 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
16110 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
16111 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
16112 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
16113 SmallVector<unsigned, 4> SrcBytes;
16114 auto Src0Mask = Src0s.
begin()->PermMask;
16115 SrcBytes.
push_back(Src0Mask & 0xFF000000);
16116 bool UniqueEntries =
true;
16117 for (
auto I = 1;
I < 4;
I++) {
16118 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16121 UniqueEntries =
false;
16127 if (UniqueEntries) {
16128 UseOriginalSrc =
true;
16130 auto *FirstElt = Src0s.
begin();
16134 auto *SecondElt = Src1s.
begin();
16136 SecondElt->DWordOffset);
16145 if (!UseOriginalSrc) {
16152 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16155 : Intrinsic::amdgcn_udot4,
16165 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16170 unsigned Opc =
LHS.getOpcode();
16182 auto Cond =
RHS.getOperand(0);
16187 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16204 DAGCombinerInfo &DCI)
const {
16205 SelectionDAG &DAG = DCI.DAG;
16207 EVT VT =
N->getValueType(0);
16220 SDNodeFlags ShlFlags = N1->
getFlags();
16224 SDNodeFlags NewShlFlags =
16229 DCI.AddToWorklist(Inner.
getNode());
16236 if (Subtarget->hasMad64_32()) {
16237 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16246 if (VT == MVT::i64) {
16247 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16260 if (!YIsConstant && !ZIsConstant && !
X->isDivergent() &&
16261 Y->isDivergent() !=
Z->isDivergent()) {
16270 if (
Y->isDivergent())
16273 SDNodeFlags ReassocFlags =
16276 DCI.AddToWorklist(UniformInner.
getNode());
16284 DAGCombinerInfo &DCI)
const {
16285 SelectionDAG &DAG = DCI.DAG;
16286 EVT VT =
N->getValueType(0);
16288 if (VT == MVT::i64) {
16289 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16293 if (VT != MVT::i32)
16302 unsigned Opc =
RHS.getOpcode();
16309 auto Cond =
RHS.getOperand(0);
16314 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16332SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
16333 DAGCombinerInfo &DCI)
const {
16335 if (
N->getValueType(0) != MVT::i32)
16341 SelectionDAG &DAG = DCI.DAG;
16346 unsigned LHSOpc =
LHS.getOpcode();
16347 unsigned Opc =
N->getOpcode();
16351 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
16357 DAGCombinerInfo &DCI)
const {
16361 SelectionDAG &DAG = DCI.DAG;
16362 EVT VT =
N->getValueType(0);
16374 if (
A ==
LHS.getOperand(1)) {
16375 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16376 if (FusedOp != 0) {
16378 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
16386 if (
A ==
RHS.getOperand(1)) {
16387 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16388 if (FusedOp != 0) {
16390 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
16399 DAGCombinerInfo &DCI)
const {
16403 SelectionDAG &DAG = DCI.DAG;
16405 EVT VT =
N->getValueType(0);
16418 if (
A ==
LHS.getOperand(1)) {
16419 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16420 if (FusedOp != 0) {
16424 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
16433 if (
A ==
RHS.getOperand(1)) {
16434 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16435 if (FusedOp != 0) {
16437 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16446 DAGCombinerInfo &DCI)
const {
16447 SelectionDAG &DAG = DCI.DAG;
16449 EVT VT =
N->getValueType(0);
16450 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16456 SDNodeFlags
Flags =
N->getFlags();
16457 SDNodeFlags RHSFlags =
RHS->getFlags();
16463 bool IsNegative =
false;
16464 if (CLHS->isExactlyValue(1.0) ||
16465 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16468 if (
RHS.getOpcode() == ISD::FSQRT) {
16471 DAG.
getNode(AMDGPUISD::RSQ, SL, VT,
RHS.getOperand(0), Flags);
16472 return IsNegative ? DAG.
getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16481 DAGCombinerInfo &DCI)
const {
16482 SelectionDAG &DAG = DCI.DAG;
16483 EVT VT =
N->getValueType(0);
16487 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16488 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16503 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16508 const ConstantFPSDNode *FalseNode =
16518 if (ScalarVT == MVT::f32 &&
16524 if (TrueNodeExpVal == INT_MIN)
16527 if (FalseNodeExpVal == INT_MIN)
16540 return DAG.
getNode(ISD::FLDEXP, SL, VT,
LHS, SelectNode,
N->getFlags());
16547 DAGCombinerInfo &DCI)
const {
16548 SelectionDAG &DAG = DCI.DAG;
16549 EVT VT =
N->getValueType(0);
16552 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16570 (
N->getFlags().hasAllowContract() &&
16571 FMA->getFlags().hasAllowContract())) {
16586 if (FMAOp1.
getOpcode() != ISD::FP_EXTEND ||
16605 if (Vec1 == Vec2 || Vec3 == Vec4)
16611 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16612 return DAG.
getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16620 DAGCombinerInfo &DCI)
const {
16621 SelectionDAG &DAG = DCI.DAG;
16626 EVT VT =
LHS.getValueType();
16655 return LHS.getOperand(0);
16663 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
16670 const APInt &CT =
LHS.getConstantOperandAPInt(1);
16671 const APInt &CF =
LHS.getConstantOperandAPInt(2);
16679 return LHS.getOperand(0);
16711 DAG.
getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16716 {Op0Hi, Op1Hi, CarryInHi});
16726 DCI.CombineTo(
LHS.getNode(), Result);
16730 if (VT != MVT::f32 && VT != MVT::f64 &&
16731 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16739 LHS.getOpcode() == ISD::FABS) {
16746 const unsigned IsInfMask =
16748 const unsigned IsFiniteMask =
16753 return DAG.
getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
LHS.getOperand(0),
16762SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
16763 DAGCombinerInfo &DCI)
const {
16764 SelectionDAG &DAG = DCI.DAG;
16766 unsigned Offset =
N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16785 unsigned ShiftOffset = 8 *
Offset;
16787 ShiftOffset -=
C->getZExtValue();
16789 ShiftOffset +=
C->getZExtValue();
16791 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16792 return DAG.
getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16793 MVT::f32, Shifted);
16804 DCI.AddToWorklist(
N);
16811 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
16817 DAGCombinerInfo &DCI)
const {
16822 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16826 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16827 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
16830 APFloat One(
F.getSemantics(),
"1.0");
16832 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
16838 DAGCombinerInfo &DCI)
const {
16859 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
16860 bool isInteger =
LHS.getValueType().isInteger();
16863 if (!isFloatingPoint && !isInteger)
16868 if (!isEquality && !isNonEquality)
16885 if (isFloatingPoint) {
16887 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16898 if (!(isEquality && TrueVal == ConstVal) &&
16899 !(isNonEquality && FalseVal == ConstVal))
16906 SelectLHS, SelectRHS);
16911 switch (
N->getOpcode()) {
16927 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
16937 switch (
N->getOpcode()) {
16939 return performAddCombine(
N, DCI);
16941 return performPtrAddCombine(
N, DCI);
16943 return performSubCombine(
N, DCI);
16946 return performAddCarrySubCarryCombine(
N, DCI);
16948 return performFAddCombine(
N, DCI);
16950 return performFSubCombine(
N, DCI);
16952 return performFDivCombine(
N, DCI);
16954 return performFMulCombine(
N, DCI);
16956 return performSetCCCombine(
N, DCI);
16958 if (
auto Res = performSelectCombine(
N, DCI))
16963 case ISD::FMAXNUM_IEEE:
16964 case ISD::FMINNUM_IEEE:
16965 case ISD::FMAXIMUM:
16966 case ISD::FMINIMUM:
16967 case ISD::FMAXIMUMNUM:
16968 case ISD::FMINIMUMNUM:
16973 case AMDGPUISD::FMIN_LEGACY:
16974 case AMDGPUISD::FMAX_LEGACY:
16975 return performMinMaxCombine(
N, DCI);
16977 return performFMACombine(
N, DCI);
16979 return performAndCombine(
N, DCI);
16981 return performOrCombine(
N, DCI);
16984 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
16985 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16991 return performXorCombine(
N, DCI);
16993 return performZeroExtendCombine(
N, DCI);
16995 return performSignExtendInRegCombine(
N, DCI);
16996 case AMDGPUISD::FP_CLASS:
16997 return performClassCombine(
N, DCI);
16999 return performFCanonicalizeCombine(
N, DCI);
17000 case AMDGPUISD::RCP:
17001 return performRcpCombine(
N, DCI);
17003 case AMDGPUISD::FRACT:
17004 case AMDGPUISD::RSQ:
17005 case AMDGPUISD::RCP_LEGACY:
17006 case AMDGPUISD::RCP_IFLAG:
17007 case AMDGPUISD::RSQ_CLAMP: {
17016 return performUCharToFloatCombine(
N, DCI);
17018 return performFCopySignCombine(
N, DCI);
17019 case AMDGPUISD::CVT_F32_UBYTE0:
17020 case AMDGPUISD::CVT_F32_UBYTE1:
17021 case AMDGPUISD::CVT_F32_UBYTE2:
17022 case AMDGPUISD::CVT_F32_UBYTE3:
17023 return performCvtF32UByteNCombine(
N, DCI);
17024 case AMDGPUISD::FMED3:
17025 return performFMed3Combine(
N, DCI);
17026 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17027 return performCvtPkRTZCombine(
N, DCI);
17028 case AMDGPUISD::CLAMP:
17029 return performClampCombine(
N, DCI);
17032 EVT VT =
N->getValueType(0);
17035 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17038 EVT EltVT = Src.getValueType();
17039 if (EltVT != MVT::i16)
17040 Src = DAG.
getNode(ISD::BITCAST, SL, MVT::i16, Src);
17043 return DAG.
getNode(ISD::BITCAST, SL, VT, Ext);
17049 return performExtractVectorEltCombine(
N, DCI);
17051 return performInsertVectorEltCombine(
N, DCI);
17053 return performFPRoundCombine(
N, DCI);
17062 return performMemSDNodeCombine(MemNode, DCI);
17093 unsigned Opcode =
Node->getMachineOpcode();
17096 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17097 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
17100 SDNode *
Users[5] = {
nullptr};
17102 unsigned DmaskIdx =
17103 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17104 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
17105 unsigned NewDmask = 0;
17106 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17107 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17108 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
17109 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
17110 unsigned TFCLane = 0;
17111 bool HasChain =
Node->getNumValues() > 1;
17113 if (OldDmask == 0) {
17121 TFCLane = OldBitsSet;
17125 for (SDUse &Use :
Node->uses()) {
17128 if (
Use.getResNo() != 0)
17131 SDNode *
User =
Use.getUser();
17134 if (!
User->isMachineOpcode() ||
17135 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17147 if (UsesTFC && Lane == TFCLane) {
17152 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17154 Dmask &= ~(1 << Comp);
17162 NewDmask |= 1 << Comp;
17167 bool NoChannels = !NewDmask;
17174 if (OldBitsSet == 1)
17180 if (NewDmask == OldDmask)
17189 unsigned NewChannels = BitsSet + UsesTFC;
17193 assert(NewOpcode != -1 &&
17194 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
17195 "failed to find equivalent MIMG op");
17203 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
17205 MVT ResultVT = NewChannels == 1
17208 : NewChannels == 5 ? 8
17210 SDVTList NewVTList =
17213 MachineSDNode *NewNode =
17222 if (NewChannels == 1) {
17232 for (
unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17237 if (i || !NoChannels)
17242 if (NewUser != User) {
17252 Idx = AMDGPU::sub1;
17255 Idx = AMDGPU::sub2;
17258 Idx = AMDGPU::sub3;
17261 Idx = AMDGPU::sub4;
17272 Op =
Op.getOperand(0);
17293 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17297 Node->getOperand(0), SL, VReg, SrcVal,
17303 return ToResultReg.
getNode();
17308 for (
unsigned i = 0; i <
Node->getNumOperands(); ++i) {
17310 Ops.push_back(
Node->getOperand(i));
17316 Node->getOperand(i).getValueType(),
17317 Node->getOperand(i)),
17329 unsigned Opcode =
Node->getMachineOpcode();
17331 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
17332 !
TII->isGather4(Opcode) &&
17334 return adjustWritemask(
Node, DAG);
17337 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17343 case AMDGPU::V_DIV_SCALE_F32_e64:
17344 case AMDGPU::V_DIV_SCALE_F64_e64: {
17354 (Src0 == Src1 || Src0 == Src2))
17410 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
17411 unsigned InitIdx = 0;
17413 if (
TII->isImage(
MI)) {
17421 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
17422 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
17423 unsigned D16Val = D16 ? D16->getImm() : 0;
17425 if (!TFEVal && !LWEVal)
17436 assert(MO_Dmask &&
"Expected dmask operand in instruction");
17438 unsigned dmask = MO_Dmask->
getImm();
17443 bool Packed = !Subtarget->hasUnpackedD16VMem();
17445 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17452 uint32_t DstSize =
TRI.getRegSizeInBits(*DstRC) / 32;
17453 if (DstSize < InitIdx)
17457 InitIdx =
TRI.getRegSizeInBits(*DstRC) / 32;
17465 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
17466 unsigned NewDst = 0;
17471 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17472 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17475 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17476 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
17496 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
17508 if (
TII->isVOP3(
MI.getOpcode())) {
17510 TII->legalizeOperandsVOP3(
MRI,
MI);
17512 if (
TII->isMAI(
MI)) {
17517 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
17518 AMDGPU::OpName::scale_src0);
17519 if (Src0Idx != -1) {
17520 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
17521 AMDGPU::OpName::scale_src1);
17522 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
17523 TII->usesConstantBus(
MRI,
MI, Src1Idx))
17524 TII->legalizeOpWithMove(
MI, Src1Idx);
17531 if (
TII->isImage(
MI))
17532 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
17606std::pair<unsigned, const TargetRegisterClass *>
17613 if (Constraint.
size() == 1) {
17617 if (VT == MVT::Other)
17620 switch (Constraint[0]) {
17627 RC = &AMDGPU::SReg_32RegClass;
17630 RC = &AMDGPU::SGPR_64RegClass;
17635 return std::pair(0U,
nullptr);
17642 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17643 : &AMDGPU::VGPR_32_Lo256RegClass;
17646 RC = Subtarget->has1024AddressableVGPRs()
17647 ?
TRI->getAlignedLo256VGPRClassForBitWidth(
BitWidth)
17650 return std::pair(0U,
nullptr);
17655 if (!Subtarget->hasMAIInsts())
17659 RC = &AMDGPU::AGPR_32RegClass;
17664 return std::pair(0U,
nullptr);
17669 }
else if (Constraint ==
"VA" && Subtarget->hasGFX90AInsts()) {
17673 RC = &AMDGPU::AV_32RegClass;
17676 RC =
TRI->getVectorSuperClassForBitWidth(
BitWidth);
17678 return std::pair(0U,
nullptr);
17687 return std::pair(0U, RC);
17690 if (Kind !=
'\0') {
17692 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17693 }
else if (Kind ==
's') {
17694 RC = &AMDGPU::SGPR_32RegClass;
17695 }
else if (Kind ==
'a') {
17696 RC = &AMDGPU::AGPR_32RegClass;
17702 return std::pair(0U,
nullptr);
17708 return std::pair(0U,
nullptr);
17712 RC =
TRI->getVGPRClassForBitWidth(Width);
17714 RC =
TRI->getSGPRClassForBitWidth(Width);
17716 RC =
TRI->getAGPRClassForBitWidth(Width);
17718 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17723 return std::pair(0U,
nullptr);
17725 return std::pair(Reg, RC);
17731 return std::pair(0U,
nullptr);
17732 if (Idx < RC->getNumRegs())
17734 return std::pair(0U,
nullptr);
17740 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
17746 if (Constraint.
size() == 1) {
17747 switch (Constraint[0]) {
17757 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
17765 if (Constraint.
size() == 1) {
17766 switch (Constraint[0]) {
17774 }
else if (Constraint.
size() == 2) {
17775 if (Constraint ==
"VA")
17793 std::vector<SDValue> &
Ops,
17808 unsigned Size =
Op.getScalarValueSizeInBits();
17812 if (
Size == 16 && !Subtarget->has16BitInsts())
17816 Val =
C->getSExtValue();
17820 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
17824 if (
Size != 16 ||
Op.getNumOperands() != 2)
17826 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
17829 Val =
C->getSExtValue();
17833 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
17843 if (Constraint.
size() == 1) {
17844 switch (Constraint[0]) {
17859 }
else if (Constraint.
size() == 2) {
17860 if (Constraint ==
"DA") {
17861 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
17862 int64_t LoBits =
static_cast<int32_t
>(Val);
17866 if (Constraint ==
"DB") {
17874 unsigned MaxSize)
const {
17875 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
17876 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17878 MVT VT =
Op.getSimpleValueType();
17903 switch (UnalignedClassID) {
17904 case AMDGPU::VReg_64RegClassID:
17905 return AMDGPU::VReg_64_Align2RegClassID;
17906 case AMDGPU::VReg_96RegClassID:
17907 return AMDGPU::VReg_96_Align2RegClassID;
17908 case AMDGPU::VReg_128RegClassID:
17909 return AMDGPU::VReg_128_Align2RegClassID;
17910 case AMDGPU::VReg_160RegClassID:
17911 return AMDGPU::VReg_160_Align2RegClassID;
17912 case AMDGPU::VReg_192RegClassID:
17913 return AMDGPU::VReg_192_Align2RegClassID;
17914 case AMDGPU::VReg_224RegClassID:
17915 return AMDGPU::VReg_224_Align2RegClassID;
17916 case AMDGPU::VReg_256RegClassID:
17917 return AMDGPU::VReg_256_Align2RegClassID;
17918 case AMDGPU::VReg_288RegClassID:
17919 return AMDGPU::VReg_288_Align2RegClassID;
17920 case AMDGPU::VReg_320RegClassID:
17921 return AMDGPU::VReg_320_Align2RegClassID;
17922 case AMDGPU::VReg_352RegClassID:
17923 return AMDGPU::VReg_352_Align2RegClassID;
17924 case AMDGPU::VReg_384RegClassID:
17925 return AMDGPU::VReg_384_Align2RegClassID;
17926 case AMDGPU::VReg_512RegClassID:
17927 return AMDGPU::VReg_512_Align2RegClassID;
17928 case AMDGPU::VReg_1024RegClassID:
17929 return AMDGPU::VReg_1024_Align2RegClassID;
17930 case AMDGPU::AReg_64RegClassID:
17931 return AMDGPU::AReg_64_Align2RegClassID;
17932 case AMDGPU::AReg_96RegClassID:
17933 return AMDGPU::AReg_96_Align2RegClassID;
17934 case AMDGPU::AReg_128RegClassID:
17935 return AMDGPU::AReg_128_Align2RegClassID;
17936 case AMDGPU::AReg_160RegClassID:
17937 return AMDGPU::AReg_160_Align2RegClassID;
17938 case AMDGPU::AReg_192RegClassID:
17939 return AMDGPU::AReg_192_Align2RegClassID;
17940 case AMDGPU::AReg_256RegClassID:
17941 return AMDGPU::AReg_256_Align2RegClassID;
17942 case AMDGPU::AReg_512RegClassID:
17943 return AMDGPU::AReg_512_Align2RegClassID;
17944 case AMDGPU::AReg_1024RegClassID:
17945 return AMDGPU::AReg_1024_Align2RegClassID;
17961 if (Info->isEntryFunction()) {
17968 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17970 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17971 :
TRI->getAlignedHighSGPRForRC(MF, 2,
17972 &AMDGPU::SGPR_64RegClass);
17973 Info->setSGPRForEXECCopy(SReg);
17975 assert(!
TRI->isSubRegister(Info->getScratchRSrcReg(),
17976 Info->getStackPtrOffsetReg()));
17977 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17978 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17982 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17983 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17985 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17986 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17988 Info->limitOccupancy(MF);
17990 if (ST.isWave32() && !MF.
empty()) {
17991 for (
auto &
MBB : MF) {
17992 for (
auto &
MI :
MBB) {
17993 TII->fixImplicitOperands(
MI);
18003 if (ST.needsAlignedVGPRs()) {
18004 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
18010 if (NewClassID != -1)
18011 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
18020 const APInt &DemandedElts,
18022 unsigned Depth)
const {
18024 unsigned Opc =
Op.getOpcode();
18027 unsigned IID =
Op.getConstantOperandVal(0);
18029 case Intrinsic::amdgcn_mbcnt_lo:
18030 case Intrinsic::amdgcn_mbcnt_hi: {
18036 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18046 Op, Known, DemandedElts, DAG,
Depth);
18062 unsigned MaxValue =
18069 unsigned BFEWidth,
bool SExt,
unsigned Depth) {
18073 unsigned Src1Cst = 0;
18074 if (Src1.
isImm()) {
18075 Src1Cst = Src1.
getImm();
18076 }
else if (Src1.
isReg()) {
18080 Src1Cst = Cst->Value.getZExtValue();
18091 if (Width >= BFEWidth)
18100 Known = Known.
sext(BFEWidth);
18102 Known = Known.
zext(BFEWidth);
18108 unsigned Depth)
const {
18111 switch (
MI->getOpcode()) {
18112 case AMDGPU::S_BFE_I32:
18115 case AMDGPU::S_BFE_U32:
18118 case AMDGPU::S_BFE_I64:
18121 case AMDGPU::S_BFE_U64:
18124 case AMDGPU::G_INTRINSIC:
18125 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18128 case Intrinsic::amdgcn_workitem_id_x:
18131 case Intrinsic::amdgcn_workitem_id_y:
18134 case Intrinsic::amdgcn_workitem_id_z:
18137 case Intrinsic::amdgcn_mbcnt_lo:
18138 case Intrinsic::amdgcn_mbcnt_hi: {
18150 case Intrinsic::amdgcn_groupstaticsize: {
18161 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18164 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18167 case AMDGPU::G_AMDGPU_SMED3:
18168 case AMDGPU::G_AMDGPU_UMED3: {
18169 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
18196 unsigned Depth)
const {
18203 AttributeList Attrs =
18205 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
18232 if (Header->getAlignment() != PrefAlign)
18233 return Header->getAlignment();
18235 unsigned LoopSize = 0;
18240 LoopSize +=
MBB->getAlignment().value() / 2;
18243 LoopSize +=
TII->getInstSizeInBytes(
MI);
18244 if (LoopSize > 192)
18249 if (LoopSize <= 64)
18252 if (LoopSize <= 128)
18253 return CacheLineAlign;
18259 auto I = Exit->getFirstNonDebugInstr();
18260 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18261 return CacheLineAlign;
18270 if (PreTerm == Pre->
begin() ||
18271 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18275 auto ExitHead = Exit->getFirstNonDebugInstr();
18276 if (ExitHead == Exit->end() ||
18277 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18282 return CacheLineAlign;
18290 N =
N->getOperand(0).getNode();
18291 if (
N->getOpcode() == ISD::INLINEASM ||
N->getOpcode() == ISD::INLINEASM_BR)
18300 switch (
N->getOpcode()) {
18308 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
18309 return !
TRI->isSGPRReg(
MRI, Reg);
18315 return !
TRI->isSGPRReg(
MRI, Reg);
18319 unsigned AS = L->getAddressSpace();
18323 case ISD::CALLSEQ_END:
18329 case AMDGPUISD::ATOMIC_CMP_SWAP:
18330 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18331 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18332 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18333 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18334 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18335 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18336 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18337 case AMDGPUISD::BUFFER_ATOMIC_AND:
18338 case AMDGPUISD::BUFFER_ATOMIC_OR:
18339 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18340 case AMDGPUISD::BUFFER_ATOMIC_INC:
18341 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18342 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18343 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
18344 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18345 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18346 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18352 return A->readMem() &&
A->writeMem();
18373 switch (Ty.getScalarSizeInBits()) {
18385 const APInt &DemandedElts,
18388 unsigned Depth)
const {
18389 if (
Op.getOpcode() == AMDGPUISD::CLAMP) {
18393 if (Info->getMode().DX10Clamp)
18405 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
18425 <<
"Hardware instruction generated for atomic "
18427 <<
" operation at memory scope " << MemScope;
18432 Type *EltTy = VT->getElementType();
18433 return VT->getNumElements() == 2 &&
18453 unsigned BW =
IT->getBitWidth();
18454 return BW == 32 || BW == 64;
18468 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
18469 return BW == 32 || BW == 64;
18472 if (Ty->isFloatTy() || Ty->isDoubleTy())
18476 return VT->getNumElements() == 2 &&
18477 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18487 bool HasSystemScope) {
18494 if (HasSystemScope) {
18503 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
18516 const MDNode *MD =
I->getMetadata(LLVMContext::MD_noalias_addrspace);
18542 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
18555 bool HasSystemScope =
18581 if (Subtarget->hasEmulatedSystemScopeAtomics())
18597 if (!HasSystemScope &&
18598 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18610 if (RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory"))
18618 ConstVal && ConstVal->isNullValue())
18656 if (Ty->isFloatTy()) {
18661 if (Ty->isDoubleTy()) {
18682 if (Ty->isFloatTy() &&
18683 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18696 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
18700 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() &&
isV2BF16(Ty))
18704 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
18709 if (Subtarget->hasAtomicBufferPkAddBF16Inst() &&
isV2BF16(Ty))
18714 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18718 if (Ty->isFloatTy()) {
18721 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18724 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18729 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18737 if (Subtarget->hasFlatAtomicFaddF32Inst())
18746 if (Subtarget->hasLDSFPAtomicAddF32()) {
18747 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18749 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18777 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18779 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18783 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18785 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18838 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18839 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18840 : &AMDGPU::SReg_32RegClass;
18841 if (!
TRI->isSGPRClass(RC) && !isDivergent)
18842 return TRI->getEquivalentSGPRClass(RC);
18843 if (
TRI->isSGPRClass(RC) && isDivergent) {
18844 if (Subtarget->hasGFX90AInsts())
18845 return TRI->getEquivalentAVClass(RC);
18846 return TRI->getEquivalentVGPRClass(RC);
18859 unsigned WaveSize) {
18864 if (!
IT ||
IT->getBitWidth() != WaveSize)
18869 if (!Visited.
insert(V).second)
18871 bool Result =
false;
18872 for (
const auto *U : V->users()) {
18874 if (V == U->getOperand(1)) {
18879 case Intrinsic::amdgcn_if_break:
18880 case Intrinsic::amdgcn_if:
18881 case Intrinsic::amdgcn_else:
18886 if (V == U->getOperand(0)) {
18891 case Intrinsic::amdgcn_end_cf:
18892 case Intrinsic::amdgcn_loop:
18898 Result =
hasCFUser(U, Visited, WaveSize);
18907 const Value *V)
const {
18909 if (CI->isInlineAsm()) {
18918 for (
auto &TC : TargetConstraints) {
18932 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18960 return MRI.hasOneNonDBGUse(N0);
18967 if (
I.getMetadata(
"amdgpu.noclobber"))
18969 if (
I.getMetadata(
"amdgpu.last.use"))
19033 Alignment = RMW->getAlign();
19046 bool FullFlatEmulation =
19048 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19049 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19050 RMW->getType()->isDoubleTy()));
19053 bool ReturnValueIsUsed = !AI->
use_empty();
19062 if (FullFlatEmulation) {
19073 std::prev(BB->
end())->eraseFromParent();
19074 Builder.SetInsertPoint(BB);
19076 Value *LoadedShared =
nullptr;
19077 if (FullFlatEmulation) {
19078 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19079 {Addr},
nullptr,
"is.shared");
19080 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19081 Builder.SetInsertPoint(SharedBB);
19082 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19088 LoadedShared = Clone;
19090 Builder.CreateBr(PhiBB);
19091 Builder.SetInsertPoint(CheckPrivateBB);
19094 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19095 {Addr},
nullptr,
"is.private");
19096 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19098 Builder.SetInsertPoint(PrivateBB);
19100 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19103 Value *LoadedPrivate;
19105 LoadedPrivate = Builder.CreateAlignedLoad(
19106 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
19109 LoadedPrivate, RMW->getValOperand());
19111 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19113 auto [ResultLoad, Equal] =
19119 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19122 Builder.CreateBr(PhiBB);
19124 Builder.SetInsertPoint(GlobalBB);
19128 if (FullFlatEmulation) {
19129 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19138 if (!FullFlatEmulation) {
19143 MDNode *RangeNotPrivate =
19146 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
19150 Builder.CreateBr(PhiBB);
19152 Builder.SetInsertPoint(PhiBB);
19154 if (ReturnValueIsUsed) {
19157 if (FullFlatEmulation)
19164 Builder.CreateBr(ExitBB);
19168 unsigned PtrOpIdx) {
19169 Value *PtrOp =
I->getOperand(PtrOpIdx);
19176 I->setOperand(PtrOpIdx, ASCast);
19188 ConstVal && ConstVal->isNullValue()) {
19218 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19226 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19241 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known to never be any NaN; if SNaN is true, returns whether it is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.stacksave/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
@ BRCOND
X86 conditional branches.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a `BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const