43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
55#define DEBUG_TYPE "si-lower"
61 cl::desc(
"Do not align and prefetch loops"),
65 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
66 cl::desc(
"Use indirect register addressing for divergent indexes"),
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (
unsigned Reg = 0;
Reg < NumSGPRs; ++
Reg) {
83 return AMDGPU::SGPR0 +
Reg;
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
208 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
209 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
210 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
211 MVT::i1, MVT::v32i32},
215 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
216 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
217 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
218 MVT::i1, MVT::v32i32},
286 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
293 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
294 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
295 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
298 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
299 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
300 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
305 MVT::v3i16, MVT::v4i16, MVT::Other},
310 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
326 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
327 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
328 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
329 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
330 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
331 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
332 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
333 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
365 for (
MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
379 for (
MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
393 for (
MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
407 for (
MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
421 for (
MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
436 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
437 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
440 if (Subtarget->hasPkMovB32()) {
461 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
462 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
467 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
471 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
472 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
473 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
474 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
498 if (Subtarget->hasSMemRealTime() ||
503 if (Subtarget->has16BitInsts()) {
513 if (Subtarget->hasMadMacF32Insts())
530 if (Subtarget->hasIntClamp())
533 if (Subtarget->hasAddNoCarryInsts())
539 {MVT::f32, MVT::f64},
Custom);
545 {MVT::f32, MVT::f64},
Legal);
547 if (Subtarget->haveRoundOpsF64())
577 if (Subtarget->has16BitInsts()) {
630 if (Subtarget->hasBF16TransInsts())
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
790 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
791 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
792 MVT::v32f16, MVT::v32bf16},
802 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
806 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
810 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
811 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
819 if (Subtarget->hasVOP3PInsts()) {
830 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
833 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
834 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
835 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
838 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
846 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
853 {MVT::v2f16, MVT::v4f16},
Custom);
859 if (Subtarget->hasBF16PackedInsts()) {
860 for (
MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
866 if (Subtarget->hasPackedFP32Ops()) {
870 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
877 if (Subtarget->has16BitInsts()) {
890 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
891 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
892 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
893 MVT::v32f16, MVT::v32bf16},
898 if (Subtarget->hasVectorMulU64())
900 else if (Subtarget->hasScalarSMulU64())
903 if (Subtarget->hasMad64_32())
906 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
909 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
911 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
914 if (Subtarget->hasMinimum3Maximum3F32())
917 if (Subtarget->hasMinimum3Maximum3PKF16()) {
921 if (!Subtarget->hasMinimum3Maximum3F16())
926 if (Subtarget->hasVOP3PInsts()) {
929 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
933 if (Subtarget->hasIntMinMax64())
938 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
939 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
944 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
945 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
946 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
947 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
951 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
952 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
953 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
954 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
969 if (Subtarget->hasBF16ConversionInsts()) {
974 if (Subtarget->hasBF16PackedInsts()) {
980 if (Subtarget->hasBF16TransInsts()) {
984 if (Subtarget->hasCvtPkF16F32Inst()) {
986 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1037 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1078 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1091 EVT DestVT,
EVT SrcVT)
const {
1093 ((((Opcode ==
ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1094 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1096 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1103 LLT DestTy,
LLT SrcTy)
const {
1104 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1105 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1107 SrcTy.getScalarSizeInBits() == 16 &&
1128 return Subtarget->has16BitInsts()
1134 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1138 if (!Subtarget->has16BitInsts() && VT.
getSizeInBits() == 16)
1160 return (NumElts + 1) / 2;
1166 return NumElts * ((
Size + 31) / 32);
1175 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1184 MVT SimpleIntermediateVT =
1186 IntermediateVT = SimpleIntermediateVT;
1187 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1188 NumIntermediates = (NumElts + 1) / 2;
1189 return (NumElts + 1) / 2;
1194 IntermediateVT = RegisterVT;
1195 NumIntermediates = NumElts;
1196 return NumIntermediates;
1201 RegisterVT = MVT::i16;
1202 IntermediateVT = ScalarVT;
1203 NumIntermediates = NumElts;
1204 return NumIntermediates;
1208 RegisterVT = MVT::i32;
1209 IntermediateVT = ScalarVT;
1210 NumIntermediates = NumElts;
1211 return NumIntermediates;
1215 RegisterVT = MVT::i32;
1216 IntermediateVT = RegisterVT;
1217 NumIntermediates = NumElts * ((
Size + 31) / 32);
1218 return NumIntermediates;
1223 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1228 unsigned MaxNumLanes) {
1229 assert(MaxNumLanes != 0);
1233 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1244 unsigned MaxNumLanes) {
1250 assert(ST->getNumContainedTypes() == 2 &&
1251 ST->getContainedType(1)->isIntegerTy(32));
1265 return MVT::amdgpuBufferFatPointer;
1267 DL.getPointerSizeInBits(AS) == 192)
1268 return MVT::amdgpuBufferStridedPointer;
1277 DL.getPointerSizeInBits(AS) == 160) ||
1279 DL.getPointerSizeInBits(AS) == 192))
1286 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1287 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1288 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1295 case Intrinsic::amdgcn_flat_load_monitor_b32:
1296 case Intrinsic::amdgcn_global_load_monitor_b32:
1298 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1299 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1300 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1301 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1302 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1303 case Intrinsic::amdgcn_flat_load_monitor_b64:
1304 case Intrinsic::amdgcn_global_load_monitor_b64:
1306 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1307 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1308 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1309 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1310 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1311 case Intrinsic::amdgcn_flat_load_monitor_b128:
1312 case Intrinsic::amdgcn_global_load_monitor_b128:
1348 unsigned IntrID)
const {
1350 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1364 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1377 if (RsrcIntr->IsImage) {
1392 Info.ptrVal = RsrcArg;
1396 if (RsrcIntr->IsImage) {
1397 unsigned MaxNumLanes = 4;
1412 std::numeric_limits<unsigned>::max());
1422 if (RsrcIntr->IsImage) {
1442 if ((RsrcIntr->IsImage && BaseOpcode->
NoReturn) || IsSPrefetch) {
1444 Info.memVT = MVT::i32;
1451 case Intrinsic::amdgcn_raw_buffer_load_lds:
1452 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1453 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1454 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1455 case Intrinsic::amdgcn_struct_buffer_load_lds:
1456 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1457 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1458 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1472 CI.
getContext(), Width * 8 * Subtarget->getWavefrontSize());
1481 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1482 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1483 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1484 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1487 std::numeric_limits<unsigned>::max());
1500 case Intrinsic::amdgcn_ds_ordered_add:
1501 case Intrinsic::amdgcn_ds_ordered_swap: {
1515 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1516 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1519 Info.ptrVal =
nullptr;
1525 case Intrinsic::amdgcn_ds_append:
1526 case Intrinsic::amdgcn_ds_consume: {
1540 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1541 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1542 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1547 Info.memVT = MVT::i64;
1554 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1555 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1556 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1559 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1562 ->getElementType(0));
1571 case Intrinsic::amdgcn_global_atomic_fmin_num:
1572 case Intrinsic::amdgcn_global_atomic_fmax_num:
1573 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1574 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1575 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1586 case Intrinsic::amdgcn_cluster_load_b32:
1587 case Intrinsic::amdgcn_cluster_load_b64:
1588 case Intrinsic::amdgcn_cluster_load_b128:
1589 case Intrinsic::amdgcn_ds_load_tr6_b96:
1590 case Intrinsic::amdgcn_ds_load_tr4_b64:
1591 case Intrinsic::amdgcn_ds_load_tr8_b64:
1592 case Intrinsic::amdgcn_ds_load_tr16_b128:
1593 case Intrinsic::amdgcn_global_load_tr6_b96:
1594 case Intrinsic::amdgcn_global_load_tr4_b64:
1595 case Intrinsic::amdgcn_global_load_tr_b64:
1596 case Intrinsic::amdgcn_global_load_tr_b128:
1597 case Intrinsic::amdgcn_ds_read_tr4_b64:
1598 case Intrinsic::amdgcn_ds_read_tr6_b96:
1599 case Intrinsic::amdgcn_ds_read_tr8_b64:
1600 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1609 case Intrinsic::amdgcn_flat_load_monitor_b32:
1610 case Intrinsic::amdgcn_flat_load_monitor_b64:
1611 case Intrinsic::amdgcn_flat_load_monitor_b128:
1612 case Intrinsic::amdgcn_global_load_monitor_b32:
1613 case Intrinsic::amdgcn_global_load_monitor_b64:
1614 case Intrinsic::amdgcn_global_load_monitor_b128: {
1625 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1626 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1627 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1638 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1639 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1640 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1651 case Intrinsic::amdgcn_ds_gws_init:
1652 case Intrinsic::amdgcn_ds_gws_barrier:
1653 case Intrinsic::amdgcn_ds_gws_sema_v:
1654 case Intrinsic::amdgcn_ds_gws_sema_br:
1655 case Intrinsic::amdgcn_ds_gws_sema_p:
1656 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1666 Info.memVT = MVT::i32;
1668 Info.align =
Align(4);
1670 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1677 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1678 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1679 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1680 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1681 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1682 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1683 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1684 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1699 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1700 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1701 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1702 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1717 case Intrinsic::amdgcn_load_to_lds:
1718 case Intrinsic::amdgcn_load_async_to_lds:
1719 case Intrinsic::amdgcn_global_load_lds:
1720 case Intrinsic::amdgcn_global_load_async_lds: {
1739 Width * 8 * Subtarget->getWavefrontSize());
1745 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1746 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1747 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1748 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1758 Info.memVT = MVT::i32;
1760 Info.align =
Align(4);
1766 case Intrinsic::amdgcn_s_prefetch_data:
1767 case Intrinsic::amdgcn_flat_prefetch:
1768 case Intrinsic::amdgcn_global_prefetch: {
1784 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1787 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1788 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1800 Type *&AccessTy)
const {
1801 Value *Ptr =
nullptr;
1802 switch (
II->getIntrinsicID()) {
1803 case Intrinsic::amdgcn_cluster_load_b128:
1804 case Intrinsic::amdgcn_cluster_load_b64:
1805 case Intrinsic::amdgcn_cluster_load_b32:
1806 case Intrinsic::amdgcn_ds_append:
1807 case Intrinsic::amdgcn_ds_consume:
1808 case Intrinsic::amdgcn_ds_load_tr8_b64:
1809 case Intrinsic::amdgcn_ds_load_tr16_b128:
1810 case Intrinsic::amdgcn_ds_load_tr4_b64:
1811 case Intrinsic::amdgcn_ds_load_tr6_b96:
1812 case Intrinsic::amdgcn_ds_read_tr4_b64:
1813 case Intrinsic::amdgcn_ds_read_tr6_b96:
1814 case Intrinsic::amdgcn_ds_read_tr8_b64:
1815 case Intrinsic::amdgcn_ds_read_tr16_b64:
1816 case Intrinsic::amdgcn_ds_ordered_add:
1817 case Intrinsic::amdgcn_ds_ordered_swap:
1818 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1819 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1820 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1821 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1822 case Intrinsic::amdgcn_global_atomic_fmax_num:
1823 case Intrinsic::amdgcn_global_atomic_fmin_num:
1824 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1825 case Intrinsic::amdgcn_global_load_tr_b64:
1826 case Intrinsic::amdgcn_global_load_tr_b128:
1827 case Intrinsic::amdgcn_global_load_tr4_b64:
1828 case Intrinsic::amdgcn_global_load_tr6_b96:
1829 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1830 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1831 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1832 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1833 Ptr =
II->getArgOperand(0);
1835 case Intrinsic::amdgcn_load_to_lds:
1836 case Intrinsic::amdgcn_load_async_to_lds:
1837 case Intrinsic::amdgcn_global_load_lds:
1838 case Intrinsic::amdgcn_global_load_async_lds:
1839 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1840 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1841 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1842 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1843 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1844 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1845 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1846 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1847 Ptr =
II->getArgOperand(1);
1852 AccessTy =
II->getType();
1858 unsigned AddrSpace)
const {
1859 if (!Subtarget->hasFlatInstOffsets()) {
1870 return AM.
Scale == 0 &&
1871 (AM.
BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1872 AM.
BaseOffs, AddrSpace, FlatVariant));
1876 if (Subtarget->hasFlatGlobalInsts())
1879 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1892 return isLegalMUBUFAddressingMode(AM);
1895bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1906 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1918 if (AM.HasBaseReg) {
1950 return isLegalMUBUFAddressingMode(AM);
1952 if (!Subtarget->hasScalarSubwordLoads()) {
1957 if (Ty->isSized() &&
DL.getTypeStoreSize(Ty) < 4)
2005 return Subtarget->hasFlatScratchEnabled()
2007 : isLegalMUBUFAddressingMode(AM);
2054 unsigned Size,
unsigned AddrSpace,
Align Alignment,
2063 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment <
Align(4))
2066 Align RequiredAlignment(
2068 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
Size > 32 &&
2069 Alignment < RequiredAlignment)
2084 if (!Subtarget->hasUsableDSOffset() && Alignment <
Align(8))
2090 RequiredAlignment =
Align(4);
2092 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2108 *IsFast = (Alignment >= RequiredAlignment) ? 64
2109 : (Alignment <
Align(4)) ? 32
2116 if (!Subtarget->hasDS96AndDS128())
2122 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2131 *IsFast = (Alignment >= RequiredAlignment) ? 96
2132 : (Alignment <
Align(4)) ? 32
2139 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2145 RequiredAlignment =
Align(8);
2147 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2156 *IsFast = (Alignment >= RequiredAlignment) ? 128
2157 : (Alignment <
Align(4)) ? 32
2174 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
2176 return Alignment >= RequiredAlignment ||
2177 Subtarget->hasUnalignedDSAccessEnabled();
2185 bool AlignedBy4 = Alignment >=
Align(4);
2186 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2188 *IsFast = AlignedBy4 ?
Size : 1;
2193 *IsFast = AlignedBy4;
2204 return Alignment >=
Align(4) ||
2205 Subtarget->hasUnalignedBufferAccessEnabled();
2217 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2232 return Size >= 32 && Alignment >=
Align(4);
2237 unsigned *IsFast)
const {
2239 Alignment, Flags, IsFast);
2244 const AttributeList &FuncAttributes)
const {
2250 if (
Op.size() >= 16 &&
2254 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
2272 unsigned DestAS)
const {
2275 Subtarget->hasGloballyAddressableScratch()) {
2305 unsigned Index)
const {
2317 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2322 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2357 auto [InputPtrReg, RC, ArgTy] =
2367 Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2373 const SDLoc &SL)
const {
2380 const SDLoc &SL)
const {
2383 std::optional<uint32_t> KnownSize =
2385 if (KnownSize.has_value())
2412 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2427SDValue SITargetLowering::lowerKernargMemParameter(
2432 MachinePointerInfo PtrInfo =
2441 int64_t OffsetDiff =
Offset - AlignDownOffset;
2447 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2458 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2463 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain,
Offset);
2468 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2477 const SDLoc &SL)
const {
2546 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2549 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2550 if (ConvertedVal == ArgValue)
2551 return ConvertedVal;
2556SDValue SITargetLowering::lowerWorkGroupId(
2561 if (!Subtarget->hasClusters())
2562 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2570 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2571 SDLoc SL(ClusterIdXYZ);
2572 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2575 SDValue ClusterWorkGroupIdXYZ =
2576 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2586 return ClusterIdXYZ;
2588 using namespace AMDGPU::Hwreg;
2592 DAG.
getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2603SDValue SITargetLowering::getPreloadedValue(
2606 const ArgDescriptor *
Reg =
nullptr;
2607 const TargetRegisterClass *RC;
2611 const ArgDescriptor WorkGroupIDX =
2619 const ArgDescriptor WorkGroupIDZ =
2621 const ArgDescriptor ClusterWorkGroupIDX =
2623 const ArgDescriptor ClusterWorkGroupIDY =
2625 const ArgDescriptor ClusterWorkGroupIDZ =
2627 const ArgDescriptor ClusterWorkGroupMaxIDX =
2629 const ArgDescriptor ClusterWorkGroupMaxIDY =
2631 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2633 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2636 auto LoadConstant = [&](
unsigned N) {
2640 if (Subtarget->hasArchitectedSGPRs() &&
2647 Reg = &WorkGroupIDX;
2648 RC = &AMDGPU::SReg_32RegClass;
2652 Reg = &WorkGroupIDY;
2653 RC = &AMDGPU::SReg_32RegClass;
2657 Reg = &WorkGroupIDZ;
2658 RC = &AMDGPU::SReg_32RegClass;
2662 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
2663 return LoadConstant(0);
2664 Reg = &ClusterWorkGroupIDX;
2665 RC = &AMDGPU::SReg_32RegClass;
2669 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
2670 return LoadConstant(0);
2671 Reg = &ClusterWorkGroupIDY;
2672 RC = &AMDGPU::SReg_32RegClass;
2676 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
2677 return LoadConstant(0);
2678 Reg = &ClusterWorkGroupIDZ;
2679 RC = &AMDGPU::SReg_32RegClass;
2684 return LoadConstant(ClusterDims.
getDims()[0] - 1);
2685 Reg = &ClusterWorkGroupMaxIDX;
2686 RC = &AMDGPU::SReg_32RegClass;
2691 return LoadConstant(ClusterDims.
getDims()[1] - 1);
2692 Reg = &ClusterWorkGroupMaxIDY;
2693 RC = &AMDGPU::SReg_32RegClass;
2698 return LoadConstant(ClusterDims.
getDims()[2] - 1);
2699 Reg = &ClusterWorkGroupMaxIDZ;
2700 RC = &AMDGPU::SReg_32RegClass;
2704 Reg = &ClusterWorkGroupMaxFlatID;
2705 RC = &AMDGPU::SReg_32RegClass;
2736 for (
unsigned I = 0,
E = Ins.
size(), PSInputNum = 0;
I !=
E; ++
I) {
2740 "vector type argument should have been split");
2745 bool SkipArg = !Arg->
Used && !Info->isPSInputAllocated(PSInputNum);
2753 "unexpected vector split in ps argument type");
2767 Info->markPSInputAllocated(PSInputNum);
2769 Info->markPSInputEnabled(PSInputNum);
2785 if (Info.hasWorkItemIDX()) {
2791 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2795 if (Info.hasWorkItemIDY()) {
2796 assert(Info.hasWorkItemIDX());
2797 if (Subtarget->hasPackedTID()) {
2798 Info.setWorkItemIDY(
2801 unsigned Reg = AMDGPU::VGPR1;
2809 if (Info.hasWorkItemIDZ()) {
2810 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2811 if (Subtarget->hasPackedTID()) {
2812 Info.setWorkItemIDZ(
2815 unsigned Reg = AMDGPU::VGPR2;
2835 if (RegIdx == ArgVGPRs.
size()) {
2842 unsigned Reg = ArgVGPRs[RegIdx];
2854 unsigned NumArgRegs) {
2857 if (RegIdx == ArgSGPRs.
size())
2860 unsigned Reg = ArgSGPRs[RegIdx];
2902 const unsigned Mask = 0x3ff;
2905 if (Info.hasWorkItemIDX()) {
2907 Info.setWorkItemIDX(Arg);
2910 if (Info.hasWorkItemIDY()) {
2912 Info.setWorkItemIDY(Arg);
2915 if (Info.hasWorkItemIDZ())
2927 const unsigned Mask = 0x3ff;
2936 auto &
ArgInfo = Info.getArgInfo();
2948 if (Info.hasImplicitArgPtr())
2956 if (Info.hasWorkGroupIDX())
2959 if (Info.hasWorkGroupIDY())
2962 if (Info.hasWorkGroupIDZ())
2965 if (Info.hasLDSKernelId())
2976 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(
TRI);
2977 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2983 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(
TRI);
2984 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2989 Register DispatchPtrReg = Info.addDispatchPtr(
TRI);
2990 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2996 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3002 Register InputPtrReg = Info.addKernargSegmentPtr(
TRI);
3011 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3016 Register FlatScratchInitReg = Info.addFlatScratchInit(
TRI);
3017 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3022 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(
TRI);
3023 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3038 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3040 bool InPreloadSequence =
true;
3042 bool AlignedForImplictArgs =
false;
3043 unsigned ImplicitArgOffset = 0;
3044 for (
auto &Arg :
F.args()) {
3045 if (!InPreloadSequence || !Arg.hasInRegAttr())
3048 unsigned ArgIdx = Arg.getArgNo();
3051 if (InIdx < Ins.
size() &&
3052 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3055 for (; InIdx < Ins.
size() && Ins[InIdx].isOrigArg() &&
3056 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3058 assert(ArgLocs[ArgIdx].isMemLoc());
3059 auto &ArgLoc = ArgLocs[InIdx];
3061 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3063 unsigned NumAllocSGPRs =
3064 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3067 if (Arg.hasAttribute(
"amdgpu-hidden-argument")) {
3068 if (!AlignedForImplictArgs) {
3070 alignTo(LastExplicitArgOffset,
3071 Subtarget->getAlignmentForImplicitArgPtr()) -
3072 LastExplicitArgOffset;
3073 AlignedForImplictArgs =
true;
3075 ArgOffset += ImplicitArgOffset;
3079 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3080 assert(InIdx >= 1 &&
"No previous SGPR");
3081 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3082 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3086 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3087 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
3090 InPreloadSequence =
false;
3096 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3098 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3100 if (PreloadRegs->
size() > 1)
3101 RC = &AMDGPU::SGPR_32RegClass;
3102 for (
auto &Reg : *PreloadRegs) {
3108 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3117 if (Info.hasLDSKernelId()) {
3118 Register Reg = Info.addLDSKernelId();
3119 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3128 bool IsShader)
const {
3129 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3130 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3136 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
3138 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3142 unsigned NumRequiredSystemSGPRs =
3143 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3144 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3145 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3146 Register Reg = Info.addReservedUserSGPR();
3147 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3152 if (!HasArchitectedSGPRs) {
3153 if (Info.hasWorkGroupIDX()) {
3154 Register Reg = Info.addWorkGroupIDX();
3155 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3159 if (Info.hasWorkGroupIDY()) {
3160 Register Reg = Info.addWorkGroupIDY();
3161 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3165 if (Info.hasWorkGroupIDZ()) {
3166 Register Reg = Info.addWorkGroupIDZ();
3167 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3172 if (Info.hasWorkGroupInfo()) {
3173 Register Reg = Info.addWorkGroupInfo();
3174 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3178 if (Info.hasPrivateSegmentWaveByteOffset()) {
3180 unsigned PrivateSegmentWaveByteOffsetReg;
3183 PrivateSegmentWaveByteOffsetReg =
3184 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3188 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3190 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3193 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3195 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3196 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
3199 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3200 Info.getNumPreloadedSGPRs() >= 16);
3215 if (HasStackObjects)
3216 Info.setHasNonSpillStackObjects(
true);
3221 HasStackObjects =
true;
3225 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
3227 if (!ST.hasFlatScratchEnabled()) {
3228 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
3235 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3237 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
3247 Info.setScratchRSrcReg(ReservedBufferReg);
3266 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
3267 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3274 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
3275 if (!
MRI.isLiveIn(
Reg)) {
3276 Info.setStackPtrOffsetReg(
Reg);
3281 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3288 if (ST.getFrameLowering()->hasFP(MF)) {
3289 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3305 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3314 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3315 RC = &AMDGPU::SGPR_64RegClass;
3316 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3317 RC = &AMDGPU::SGPR_32RegClass;
3323 Entry->addLiveIn(*
I);
3328 for (
auto *Exit : Exits)
3330 TII->get(TargetOpcode::COPY), *
I)
3345 bool IsError =
false;
3349 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc()));
3367 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3368 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3370 if (!Subtarget->hasFlatScratchEnabled())
3375 !Subtarget->hasArchitectedSGPRs())
3376 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3377 !Info->hasWorkGroupIDZ());
3380 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3398 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3399 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3402 Info->markPSInputAllocated(0);
3403 Info->markPSInputEnabled(0);
3405 if (Subtarget->isAmdPalOS()) {
3414 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3415 if ((PsInputBits & 0x7F) == 0 ||
3416 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3419 }
else if (IsKernel) {
3420 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3432 if (IsKernel && Subtarget->hasKernargPreload())
3436 }
else if (!IsGraphics) {
3441 if (!Subtarget->hasFlatScratchEnabled())
3453 Info->setNumWaveDispatchSGPRs(
3455 Info->setNumWaveDispatchVGPRs(
3457 }
else if (Info->getNumKernargPreloadedSGPRs()) {
3458 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3463 if (IsWholeWaveFunc) {
3465 {MVT::i1, MVT::Other}, Chain);
3477 for (
unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.
size(), ArgIdx = 0; i != e;
3488 if (IsEntryFunc && VA.
isMemLoc()) {
3511 if (Arg.
isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3515 int64_t OffsetDiff =
Offset - AlignDownOffset;
3522 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3533 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
3534 Ins[i].Flags.isSExt(), &Ins[i]);
3542 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3545 if (PreloadRegs.
size() == 1) {
3546 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
3551 TRI->getRegSizeInBits(*RC)));
3559 for (
auto Reg : PreloadRegs) {
3566 PreloadRegs.size()),
3583 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3584 Ins[i].Flags.isSExt(), &Ins[i]);
3596 "hidden argument in kernel signature was not preloaded",
3602 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3603 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3623 if (!IsEntryFunc && VA.
isMemLoc()) {
3624 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3635 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3636 RC = &AMDGPU::VGPR_32RegClass;
3637 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3638 RC = &AMDGPU::SGPR_32RegClass;
3644 if (Arg.
Flags.
isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3650 ReadFirstLane, Val);
3666 Val = convertABITypeToValueType(DAG, Val, VA,
DL);
3675 Info->setBytesInStackArgArea(StackArgSize);
3677 return Chains.
empty() ? Chain
3686 const Type *RetTy)
const {
3694 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3699 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3700 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3701 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3702 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3725 Info->setIfReturnsVoid(Outs.
empty());
3726 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3745 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3746 ++
I, ++RealRVLocIdx) {
3750 SDValue Arg = OutVals[RealRVLocIdx];
3773 ReadFirstLane, Arg);
3780 if (!Info->isEntryFunction()) {
3786 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3788 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3801 unsigned Opc = AMDGPUISD::ENDPGM;
3803 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3804 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3805 : AMDGPUISD::RET_GLUE;
3910 const auto [OutgoingArg, ArgRC, ArgTy] =
3915 const auto [IncomingArg, IncomingArgRC, Ty] =
3917 assert(IncomingArgRC == ArgRC);
3920 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3928 InputReg = getImplicitArgPtr(DAG,
DL);
3930 std::optional<uint32_t> Id =
3932 if (Id.has_value()) {
3943 if (OutgoingArg->isRegister()) {
3944 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3945 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
3948 unsigned SpecialArgOffset =
3959 auto [OutgoingArg, ArgRC, Ty] =
3962 std::tie(OutgoingArg, ArgRC, Ty) =
3965 std::tie(OutgoingArg, ArgRC, Ty) =
3980 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3981 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3982 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3987 if (Subtarget->getMaxWorkitemID(
F, 0) != 0) {
3995 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(
F, 1) != 0) {
4005 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(
F, 2) != 0) {
4014 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4015 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4026 : IncomingArgY ? *IncomingArgY
4033 if (OutgoingArg->isRegister()) {
4035 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4061 if (Callee->isDivergent())
4068 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
4072 if (!CallerPreserved)
4075 bool CCMatch = CallerCC == CalleeCC;
4088 if (Arg.hasByValAttr())
4102 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
4103 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4112 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4125 for (
const auto &[CCVA, ArgVal] :
zip_equal(ArgLocs, OutVals)) {
4127 if (!CCVA.isRegLoc())
4132 if (ArgVal->
isDivergent() &&
TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4134 dbgs() <<
"Cannot tail call due to divergent outgoing argument in "
4158enum ChainCallArgIdx {
4180 bool UsesDynamicVGPRs =
false;
4181 if (IsChainCallConv) {
4186 auto RequestedExecIt =
4188 return Arg.OrigArgIndex == 2;
4190 assert(RequestedExecIt != CLI.
Outs.end() &&
"No node for EXEC");
4192 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.
Outs.begin();
4195 CLI.
Outs.erase(RequestedExecIt, CLI.
Outs.end());
4198 "Haven't popped all the special args");
4201 CLI.
Args[ChainCallArgIdx::Exec];
4202 if (!RequestedExecArg.
Ty->
isIntegerTy(Subtarget->getWavefrontSize()))
4210 ArgNode->getAPIntValue(),
DL, ArgNode->getValueType(0)));
4212 ChainCallSpecialArgs.
push_back(Arg.Node);
4215 PushNodeOrTargetConstant(RequestedExecArg);
4221 if (FlagsValue.
isZero()) {
4222 if (CLI.
Args.size() > ChainCallArgIdx::Flags + 1)
4224 "no additional args allowed if flags == 0");
4226 if (CLI.
Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4230 if (!Subtarget->isWave32()) {
4232 CLI, InVals,
"dynamic VGPR mode is only supported for wave32");
4235 UsesDynamicVGPRs =
true;
4236 std::for_each(CLI.
Args.begin() + ChainCallArgIdx::NumVGPRs,
4237 CLI.
Args.end(), PushNodeOrTargetConstant);
4246 bool IsSibCall =
false;
4260 "unsupported call to variadic function ");
4268 "unsupported required tail call to function ");
4273 Outs, OutVals, Ins, DAG);
4277 "site marked musttail or on llvm.amdgcn.cs.chain");
4284 if (!TailCallOpt && IsTailCall)
4308 if (!Subtarget->hasFlatScratchEnabled())
4329 auto *
TRI = Subtarget->getRegisterInfo();
4336 if (!IsSibCall || IsChainCallConv) {
4337 if (!Subtarget->hasFlatScratchEnabled()) {
4343 RegsToPass.emplace_back(IsChainCallConv
4344 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4345 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4352 const unsigned NumSpecialInputs = RegsToPass.size();
4354 MVT PtrVT = MVT::i32;
4357 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
4385 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
4393 int32_t
Offset = LocMemOffset;
4400 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4406 ? Flags.getNonZeroByValAlign()
4433 if (Outs[i].Flags.isByVal()) {
4435 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
4438 Outs[i].Flags.getNonZeroByValAlign(),
4440 nullptr, std::nullopt, DstInfo,
4446 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
4452 if (!MemOpChains.
empty())
4468 unsigned ArgIdx = 0;
4469 for (
auto [Reg, Val] : RegsToPass) {
4470 if (ArgIdx++ >= NumSpecialInputs &&
4471 (IsChainCallConv || !Val->
isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
4497 if (IsTailCall && !IsSibCall) {
4502 std::vector<SDValue>
Ops({Chain});
4508 Ops.push_back(Callee);
4525 Ops.push_back(Callee);
4536 if (IsChainCallConv)
4541 for (
auto &[Reg, Val] : RegsToPass)
4545 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
4546 assert(Mask &&
"Missing call preserved mask for calling convention");
4556 MVT::Glue, GlueOps),
4561 Ops.push_back(InGlue);
4567 unsigned OPC = AMDGPUISD::TC_RETURN;
4570 OPC = AMDGPUISD::TC_RETURN_GFX;
4574 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4575 : AMDGPUISD::TC_RETURN_CHAIN;
4581 if (Info->isWholeWaveFunction())
4582 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4589 Chain =
Call.getValue(0);
4590 InGlue =
Call.getValue(1);
4592 uint64_t CalleePopBytes = NumBytes;
4613 EVT VT =
Op.getValueType();
4627 "Stack grows upwards for AMDGPU");
4629 Chain = BaseAddr.getValue(1);
4631 if (Alignment > StackAlign) {
4633 << Subtarget->getWavefrontSizeLog2();
4634 uint64_t StackAlignMask = ScaledAlignment - 1;
4641 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4647 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4658 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4674 if (
Op.getValueType() != MVT::i32)
4693 assert(
Op.getValueType() == MVT::i32);
4702 Op.getOperand(0), IntrinID, GetRoundBothImm);
4736 SDValue RoundModeTimesNumBits =
4756 TableEntry, EnumOffset);
4772 static_cast<uint32_t>(ConstMode->getZExtValue()),
4784 if (UseReducedTable) {
4790 SDValue RoundModeTimesNumBits =
4810 SDValue RoundModeTimesNumBits =
4819 NewMode = TruncTable;
4828 ReadFirstLaneID, NewMode);
4841 IntrinID, RoundBothImm, NewMode);
4847 if (
Op->isDivergent() &&
4848 (!Subtarget->hasVmemPrefInsts() || !
Op.getConstantOperandVal(4)))
4858 if (Subtarget->hasSafeSmemPrefetch())
4866 if (!Subtarget->hasSafeSmemPrefetch() && !
Op.getConstantOperandVal(4))
4875 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4876 EVT SrcVT = Src.getValueType();
4885 EVT DstVT =
Op.getValueType();
4894 if (
Op.getValueType() != MVT::i64)
4908 Op.getOperand(0), IntrinID, ModeHwRegImm);
4910 Op.getOperand(0), IntrinID, TrapHwRegImm);
4924 if (
Op.getOperand(1).getValueType() != MVT::i64)
4936 ReadFirstLaneID, NewModeReg);
4938 ReadFirstLaneID, NewTrapReg);
4940 unsigned ModeHwReg =
4943 unsigned TrapHwReg =
4951 IntrinID, ModeHwRegImm, NewModeReg);
4954 IntrinID, TrapHwRegImm, NewTrapReg);
4963 .
Case(
"m0", AMDGPU::M0)
4964 .
Case(
"exec", AMDGPU::EXEC)
4965 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4966 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4967 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4968 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4969 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4974 if (!Subtarget->hasFlatScrRegister() &&
4975 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4977 "\" for subtarget."));
4982 case AMDGPU::EXEC_LO:
4983 case AMDGPU::EXEC_HI:
4984 case AMDGPU::FLAT_SCR_LO:
4985 case AMDGPU::FLAT_SCR_HI:
4990 case AMDGPU::FLAT_SCR:
5009 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
5018static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5040 auto Next = std::next(
I);
5051 MBB.addSuccessor(LoopBB);
5053 return std::pair(LoopBB, RemainderBB);
5060 auto I =
MI.getIterator();
5061 auto E = std::next(
I);
5083 Src->setIsKill(
false);
5093 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
5099 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5102 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
5126 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
5127 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
5137 Register PhiExec =
MRI.createVirtualRegister(BoolRC);
5138 Register NewExec =
MRI.createVirtualRegister(BoolRC);
5140 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5141 Register CondReg =
MRI.createVirtualRegister(BoolRC);
5149 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
5156 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5160 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5168 MRI.setSimpleHint(NewExec, CondReg);
5170 if (UseGPRIdxMode) {
5172 SGPRIdxReg = CurrentIdxReg;
5174 SGPRIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5175 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5185 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5216 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
5217 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
5225 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
5227 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
5228 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
5244 InitResultReg, DstReg, PhiReg, TmpExec,
5245 Offset, UseGPRIdxMode, SGPRIdxReg);
5251 LoopBB->removeSuccessor(RemainderBB);
5253 LoopBB->addSuccessor(LandingPad);
5264static std::pair<unsigned, int>
5268 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
5273 return std::pair(AMDGPU::sub0,
Offset);
5313 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5330 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
5331 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5340 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5343 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5347 if (UseGPRIdxMode) {
5354 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5367 MI.eraseFromParent();
5376 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5377 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5383 UseGPRIdxMode, SGPRIdxReg);
5387 if (UseGPRIdxMode) {
5389 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5391 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5396 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5401 MI.eraseFromParent();
5418 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5428 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5430 if (Idx->
getReg() == AMDGPU::NoRegister) {
5441 MI.eraseFromParent();
5446 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5450 if (UseGPRIdxMode) {
5454 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5463 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5464 TRI.getRegSizeInBits(*VecRC), 32,
false);
5470 MI.eraseFromParent();
5480 Register PhiReg =
MRI.createVirtualRegister(VecRC);
5484 UseGPRIdxMode, SGPRIdxReg);
5487 if (UseGPRIdxMode) {
5489 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5491 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5497 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5498 TRI.getRegSizeInBits(*VecRC), 32,
false);
5499 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
5505 MI.eraseFromParent();
5521 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5522 if (ST.hasScalarAddSub64()) {
5523 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5533 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5534 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5537 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5539 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5542 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5544 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5546 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5547 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5556 MI.eraseFromParent();
5562 case AMDGPU::S_MIN_U32:
5563 return std::numeric_limits<uint32_t>::max();
5564 case AMDGPU::S_MIN_I32:
5565 return std::numeric_limits<int32_t>::max();
5566 case AMDGPU::S_MAX_U32:
5567 return std::numeric_limits<uint32_t>::min();
5568 case AMDGPU::S_MAX_I32:
5569 return std::numeric_limits<int32_t>::min();
5570 case AMDGPU::V_ADD_F32_e64:
5572 case AMDGPU::V_SUB_F32_e64:
5574 case AMDGPU::S_ADD_I32:
5575 case AMDGPU::S_SUB_I32:
5576 case AMDGPU::S_OR_B32:
5577 case AMDGPU::S_XOR_B32:
5578 return std::numeric_limits<uint32_t>::min();
5579 case AMDGPU::S_AND_B32:
5580 return std::numeric_limits<uint32_t>::max();
5581 case AMDGPU::V_MIN_F32_e64:
5582 case AMDGPU::V_MAX_F32_e64:
5586 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5592 case AMDGPU::V_CMP_LT_U64_e64:
5593 return std::numeric_limits<uint64_t>::max();
5594 case AMDGPU::V_CMP_LT_I64_e64:
5595 return std::numeric_limits<int64_t>::max();
5596 case AMDGPU::V_CMP_GT_U64_e64:
5597 return std::numeric_limits<uint64_t>::min();
5598 case AMDGPU::V_CMP_GT_I64_e64:
5599 return std::numeric_limits<int64_t>::min();
5600 case AMDGPU::V_MIN_F64_e64:
5601 case AMDGPU::V_MAX_F64_e64:
5602 case AMDGPU::V_MIN_NUM_F64_e64:
5603 case AMDGPU::V_MAX_NUM_F64_e64:
5604 return 0x7FF8000000000000;
5605 case AMDGPU::S_ADD_U64_PSEUDO:
5606 case AMDGPU::S_SUB_U64_PSEUDO:
5607 case AMDGPU::S_OR_B64:
5608 case AMDGPU::S_XOR_B64:
5609 return std::numeric_limits<uint64_t>::min();
5610 case AMDGPU::S_AND_B64:
5611 return std::numeric_limits<uint64_t>::max();
5612 case AMDGPU::V_ADD_F64_e64:
5613 case AMDGPU::V_ADD_F64_pseudo_e64:
5614 return 0x8000000000000000;
5617 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5622 return Opc == AMDGPU::S_MIN_U32 ||
Opc == AMDGPU::S_MIN_I32 ||
5623 Opc == AMDGPU::S_MAX_U32 ||
Opc == AMDGPU::S_MAX_I32 ||
5624 Opc == AMDGPU::S_ADD_I32 ||
Opc == AMDGPU::S_SUB_I32 ||
5625 Opc == AMDGPU::S_AND_B32 ||
Opc == AMDGPU::S_OR_B32 ||
5626 Opc == AMDGPU::S_XOR_B32 ||
Opc == AMDGPU::V_MIN_F32_e64 ||
5627 Opc == AMDGPU::V_MAX_F32_e64 ||
Opc == AMDGPU::V_ADD_F32_e64 ||
5628 Opc == AMDGPU::V_SUB_F32_e64;
5632 return Opc == AMDGPU::V_MIN_F32_e64 ||
Opc == AMDGPU::V_MAX_F32_e64 ||
5633 Opc == AMDGPU::V_ADD_F32_e64 ||
Opc == AMDGPU::V_SUB_F32_e64 ||
5634 Opc == AMDGPU::V_MIN_F64_e64 ||
Opc == AMDGPU::V_MAX_F64_e64 ||
5635 Opc == AMDGPU::V_MIN_NUM_F64_e64 ||
Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5636 Opc == AMDGPU::V_ADD_F64_e64 ||
Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5650 bool isSGPR =
TRI->isSGPRClass(
MRI.getRegClass(SrcReg));
5655 case AMDGPU::S_MIN_U32:
5656 case AMDGPU::S_MIN_I32:
5657 case AMDGPU::V_MIN_F32_e64:
5658 case AMDGPU::S_MAX_U32:
5659 case AMDGPU::S_MAX_I32:
5660 case AMDGPU::V_MAX_F32_e64:
5661 case AMDGPU::S_AND_B32:
5662 case AMDGPU::S_OR_B32: {
5668 case AMDGPU::V_CMP_LT_U64_e64:
5669 case AMDGPU::V_CMP_LT_I64_e64:
5670 case AMDGPU::V_CMP_GT_U64_e64:
5671 case AMDGPU::V_CMP_GT_I64_e64:
5672 case AMDGPU::V_MIN_F64_e64:
5673 case AMDGPU::V_MIN_NUM_F64_e64:
5674 case AMDGPU::V_MAX_F64_e64:
5675 case AMDGPU::V_MAX_NUM_F64_e64:
5676 case AMDGPU::S_AND_B64:
5677 case AMDGPU::S_OR_B64: {
5683 case AMDGPU::S_XOR_B32:
5684 case AMDGPU::S_XOR_B64:
5685 case AMDGPU::S_ADD_I32:
5686 case AMDGPU::S_ADD_U64_PSEUDO:
5687 case AMDGPU::V_ADD_F32_e64:
5688 case AMDGPU::V_ADD_F64_e64:
5689 case AMDGPU::V_ADD_F64_pseudo_e64:
5690 case AMDGPU::S_SUB_I32:
5691 case AMDGPU::S_SUB_U64_PSEUDO:
5692 case AMDGPU::V_SUB_F32_e64: {
5695 Register ExecMask =
MRI.createVirtualRegister(WaveMaskRegClass);
5697 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5699 bool IsWave32 = ST.isWave32();
5700 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5701 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5702 unsigned BitCountOpc =
5703 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5707 auto NewAccumulator =
5712 case AMDGPU::S_XOR_B32:
5713 case AMDGPU::S_XOR_B64: {
5719 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5722 .
addReg(NewAccumulator->getOperand(0).getReg())
5725 if (
Opc == AMDGPU::S_XOR_B32) {
5731 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5733 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5737 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5740 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5742 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5752 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5760 case AMDGPU::S_SUB_I32: {
5761 Register NegatedVal =
MRI.createVirtualRegister(DstRegClass);
5769 .
addReg(NewAccumulator->getOperand(0).getReg());
5772 case AMDGPU::S_ADD_I32: {
5775 .
addReg(NewAccumulator->getOperand(0).getReg());
5778 case AMDGPU::S_ADD_U64_PSEUDO:
5779 case AMDGPU::S_SUB_U64_PSEUDO: {
5780 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5781 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5783 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5785 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5786 Register CarryReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5787 Register AddReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5789 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5791 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5795 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5798 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5800 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5802 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5805 .
addReg(NewAccumulator->getOperand(0).getReg())
5815 Register LowOpcode =
Opc == AMDGPU::S_SUB_U64_PSEUDO
5817 : NewAccumulator->getOperand(0).getReg();
5828 Register HiVal =
Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5834 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5840 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5847 case AMDGPU::V_ADD_F32_e64:
5848 case AMDGPU::V_ADD_F64_e64:
5849 case AMDGPU::V_ADD_F64_pseudo_e64:
5850 case AMDGPU::V_SUB_F32_e64: {
5853 Register ActiveLanesVreg =
MRI.createVirtualRegister(VregRC);
5854 Register DstVreg =
MRI.createVirtualRegister(VregRC);
5857 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5858 : AMDGPU::V_CVT_F64_I32_e64),
5860 .
addReg(NewAccumulator->getOperand(0).getReg())
5866 (
Opc == AMDGPU::V_SUB_F32_e64 ||
5867 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5870 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5872 ? AMDGPU::V_MUL_F64_pseudo_e64
5873 : AMDGPU::V_MUL_F64_e64;
5883 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5887 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5889 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5891 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5893 TII->buildExtractSubRegOrImm(
MI,
MRI, DestVregInst->getOperand(0),
5894 VregRC, AMDGPU::sub0, VregSubRC);
5896 TII->buildExtractSubRegOrImm(
MI,
MRI, DestVregInst->getOperand(0),
5897 VregRC, AMDGPU::sub1, VregSubRC);
5906 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5939 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
5940 Register IdentityValReg =
MRI.createVirtualRegister(DstRegClass);
5941 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
5942 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5943 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5944 Register FF1Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5945 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
5947 bool IsWave32 = ST.isWave32();
5948 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5949 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5956 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5960 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5963 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5972 I = ComputeLoop->begin();
5974 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
5978 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
5982 I = ComputeLoop->end();
5985 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5989 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5995 MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5996 Register DstVreg =
MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5998 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_MOV_B32_e32),
6008 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6009 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6018 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6020 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6021 Register LaneValReg =
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6024 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6026 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
6028 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
6030 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6034 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6038 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
6039 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
6045 case AMDGPU::S_OR_B64:
6046 case AMDGPU::S_AND_B64:
6047 case AMDGPU::S_XOR_B64: {
6050 .
addReg(LaneValue->getOperand(0).getReg())
6054 case AMDGPU::V_CMP_GT_I64_e64:
6055 case AMDGPU::V_CMP_GT_U64_e64:
6056 case AMDGPU::V_CMP_LT_I64_e64:
6057 case AMDGPU::V_CMP_LT_U64_e64: {
6058 Register LaneMaskReg =
MRI.createVirtualRegister(WaveMaskRegClass);
6060 MRI.createVirtualRegister(WaveMaskRegClass);
6062 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6064 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6066 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6067 Register AccumulatorVReg =
MRI.createVirtualRegister(VregClass);
6070 VregClass, AMDGPU::sub0, VSubRegClass);
6073 VregClass, AMDGPU::sub1, VSubRegClass);
6074 BuildMI(*ComputeLoop,
I,
DL,
TII->get(TargetOpcode::REG_SEQUENCE),
6081 .
addReg(LaneValue->getOperand(0).getReg())
6082 .
addReg(AccumulatorVReg);
6084 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6085 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
6089 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6090 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6091 .
addReg(LaneValue->getOperand(0).getReg())
6095 case AMDGPU::V_MIN_F64_e64:
6096 case AMDGPU::V_MIN_NUM_F64_e64:
6097 case AMDGPU::V_MAX_F64_e64:
6098 case AMDGPU::V_MAX_NUM_F64_e64:
6099 case AMDGPU::V_ADD_F64_e64:
6100 case AMDGPU::V_ADD_F64_pseudo_e64: {
6102 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6104 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6106 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6107 Register AccumulatorVReg =
MRI.createVirtualRegister(VregRC);
6108 Register DstVreg =
MRI.createVirtualRegister(VregRC);
6110 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6112 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6113 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::COPY), AccumulatorVReg)
6116 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6121 .
addReg(LaneValue->getOperand(0).getReg())
6127 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32),
6130 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32),
6134 TII->buildExtractSubRegOrImm(Iters,
MRI, DstVregInst->getOperand(0),
6135 VregRC, AMDGPU::sub0, VregSubRC);
6137 TII->buildExtractSubRegOrImm(Iters,
MRI, DstVregInst->getOperand(0),
6138 VregRC, AMDGPU::sub1, VregSubRC);
6139 ReadLaneLo.add(Op1L);
6140 ReadLaneHi.add(Op1H);
6141 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6142 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
6149 case AMDGPU::S_ADD_U64_PSEUDO:
6150 case AMDGPU::S_SUB_U64_PSEUDO: {
6153 .
addReg(LaneValue->getOperand(0).getReg());
6160 unsigned BITSETOpc =
6161 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6162 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
6168 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6171 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6173 .
addReg(NewActiveBitsReg)
6175 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
6180 MI.eraseFromParent();
6195 switch (
MI.getOpcode()) {
6196 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6198 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6200 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6202 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6204 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6206 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6209 ? AMDGPU::V_MIN_NUM_F64_e64
6210 : AMDGPU::V_MIN_F64_e64);
6211 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6213 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6215 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6217 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6219 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6221 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6224 ? AMDGPU::V_MAX_NUM_F64_e64
6225 : AMDGPU::V_MAX_F64_e64);
6226 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6228 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6230 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6232 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6235 ? AMDGPU::V_ADD_F64_pseudo_e64
6236 : AMDGPU::V_ADD_F64_e64);
6237 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6239 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6241 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6243 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6248 ? AMDGPU::V_ADD_F64_pseudo_e64
6249 : AMDGPU::V_ADD_F64_e64);
6250 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6252 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6254 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6256 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6258 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6260 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6262 case AMDGPU::S_UADDO_PSEUDO:
6263 case AMDGPU::S_USUBO_PSEUDO: {
6269 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6271 : AMDGPU::S_SUB_U32;
6279 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6282 MI.eraseFromParent();
6285 case AMDGPU::S_ADD_U64_PSEUDO:
6286 case AMDGPU::S_SUB_U64_PSEUDO: {
6289 case AMDGPU::V_ADD_U64_PSEUDO:
6290 case AMDGPU::V_SUB_U64_PSEUDO: {
6291 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6297 if (ST.hasAddSubU64Insts()) {
6299 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6300 : AMDGPU::V_SUB_U64_e64),
6305 TII->legalizeOperands(*
I);
6306 MI.eraseFromParent();
6310 if (IsAdd && ST.hasLshlAddU64Inst()) {
6316 TII->legalizeOperands(*
Add);
6317 MI.eraseFromParent();
6321 const auto *CarryRC =
TRI->getWaveMaskRegClass();
6323 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6324 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6326 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
6327 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
6331 : &AMDGPU::VReg_64RegClass;
6334 : &AMDGPU::VReg_64RegClass;
6337 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6339 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6342 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6344 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6347 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6349 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6352 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6359 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6373 TII->legalizeOperands(*LoHalf);
6374 TII->legalizeOperands(*HiHalf);
6375 MI.eraseFromParent();
6378 case AMDGPU::S_ADD_CO_PSEUDO:
6379 case AMDGPU::S_SUB_CO_PSEUDO: {
6390 Register RegOp0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6391 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6396 Register RegOp1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6397 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6401 Register RegOp2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6403 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6408 if (ST.isWave64()) {
6409 if (ST.hasScalarCompareEq64()) {
6416 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6418 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6420 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6421 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6423 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
6437 unsigned Opc =
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6438 ? AMDGPU::S_ADDC_U32
6439 : AMDGPU::S_SUBB_U32;
6444 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6450 MI.eraseFromParent();
6453 case AMDGPU::SI_INIT_M0: {
6456 TII->get(M0Init.
isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6459 MI.eraseFromParent();
6462 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6465 TII->get(AMDGPU::S_CMP_EQ_U32))
6470 case AMDGPU::GET_GROUPSTATICSIZE: {
6474 .
add(
MI.getOperand(0))
6476 MI.eraseFromParent();
6479 case AMDGPU::GET_SHADERCYCLESHILO: {
6492 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6494 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6495 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6497 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6498 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6500 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6504 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6509 .
add(
MI.getOperand(0))
6514 MI.eraseFromParent();
6517 case AMDGPU::SI_INDIRECT_SRC_V1:
6518 case AMDGPU::SI_INDIRECT_SRC_V2:
6519 case AMDGPU::SI_INDIRECT_SRC_V3:
6520 case AMDGPU::SI_INDIRECT_SRC_V4:
6521 case AMDGPU::SI_INDIRECT_SRC_V5:
6522 case AMDGPU::SI_INDIRECT_SRC_V6:
6523 case AMDGPU::SI_INDIRECT_SRC_V7:
6524 case AMDGPU::SI_INDIRECT_SRC_V8:
6525 case AMDGPU::SI_INDIRECT_SRC_V9:
6526 case AMDGPU::SI_INDIRECT_SRC_V10:
6527 case AMDGPU::SI_INDIRECT_SRC_V11:
6528 case AMDGPU::SI_INDIRECT_SRC_V12:
6529 case AMDGPU::SI_INDIRECT_SRC_V16:
6530 case AMDGPU::SI_INDIRECT_SRC_V32:
6532 case AMDGPU::SI_INDIRECT_DST_V1:
6533 case AMDGPU::SI_INDIRECT_DST_V2:
6534 case AMDGPU::SI_INDIRECT_DST_V3:
6535 case AMDGPU::SI_INDIRECT_DST_V4:
6536 case AMDGPU::SI_INDIRECT_DST_V5:
6537 case AMDGPU::SI_INDIRECT_DST_V6:
6538 case AMDGPU::SI_INDIRECT_DST_V7:
6539 case AMDGPU::SI_INDIRECT_DST_V8:
6540 case AMDGPU::SI_INDIRECT_DST_V9:
6541 case AMDGPU::SI_INDIRECT_DST_V10:
6542 case AMDGPU::SI_INDIRECT_DST_V11:
6543 case AMDGPU::SI_INDIRECT_DST_V12:
6544 case AMDGPU::SI_INDIRECT_DST_V16:
6545 case AMDGPU::SI_INDIRECT_DST_V32:
6547 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6548 case AMDGPU::SI_KILL_I1_PSEUDO:
6550 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6554 Register SrcCond =
MI.getOperand(3).getReg();
6556 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6557 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6558 const auto *CondRC =
TRI->getWaveMaskRegClass();
6559 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
6563 : &AMDGPU::VReg_64RegClass;
6566 : &AMDGPU::VReg_64RegClass;
6569 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6571 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6574 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6576 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6579 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6581 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6602 MI.eraseFromParent();
6605 case AMDGPU::SI_BR_UNDEF: {
6607 .
add(
MI.getOperand(0));
6609 MI.eraseFromParent();
6612 case AMDGPU::ADJCALLSTACKUP:
6613 case AMDGPU::ADJCALLSTACKDOWN: {
6620 case AMDGPU::SI_CALL_ISEL: {
6621 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
6624 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6630 MI.eraseFromParent();
6633 case AMDGPU::V_ADD_CO_U32_e32:
6634 case AMDGPU::V_SUB_CO_U32_e32:
6635 case AMDGPU::V_SUBREV_CO_U32_e32: {
6637 unsigned Opc =
MI.getOpcode();
6639 bool NeedClampOperand =
false;
6640 if (
TII->pseudoToMCOpcode(
Opc) == -1) {
6642 NeedClampOperand =
true;
6646 if (
TII->isVOP3(*
I)) {
6649 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
6650 if (NeedClampOperand)
6653 TII->legalizeOperands(*
I);
6655 MI.eraseFromParent();
6658 case AMDGPU::V_ADDC_U32_e32:
6659 case AMDGPU::V_SUBB_U32_e32:
6660 case AMDGPU::V_SUBBREV_U32_e32:
6663 TII->legalizeOperands(
MI);
6665 case AMDGPU::DS_GWS_INIT:
6666 case AMDGPU::DS_GWS_SEMA_BR:
6667 case AMDGPU::DS_GWS_BARRIER:
6668 case AMDGPU::DS_GWS_SEMA_V:
6669 case AMDGPU::DS_GWS_SEMA_P:
6670 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6678 case AMDGPU::S_SETREG_B32: {
6694 const unsigned SetMask = WidthMask <<
Offset;
6697 unsigned SetDenormOp = 0;
6698 unsigned SetRoundOp = 0;
6706 SetRoundOp = AMDGPU::S_ROUND_MODE;
6707 SetDenormOp = AMDGPU::S_DENORM_MODE;
6709 SetRoundOp = AMDGPU::S_ROUND_MODE;
6711 SetDenormOp = AMDGPU::S_DENORM_MODE;
6714 if (SetRoundOp || SetDenormOp) {
6716 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6717 unsigned ImmVal = Def->getOperand(1).getImm();
6731 MI.eraseFromParent();
6740 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
6744 case AMDGPU::S_INVERSE_BALLOT_U32:
6745 case AMDGPU::S_INVERSE_BALLOT_U64:
6748 MI.setDesc(
TII->get(AMDGPU::COPY));
6750 case AMDGPU::ENDPGM_TRAP: {
6752 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
6772 MI.eraseFromParent();
6775 case AMDGPU::SIMULATED_TRAP: {
6776 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6778 TII->insertSimulatedTrap(
MRI, *BB,
MI,
MI.getDebugLoc());
6779 MI.eraseFromParent();
6782 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6783 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6789 assert(Setup &&
"Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6790 Register OriginalExec = Setup->getOperand(0).getReg();
6792 MI.getOperand(0).setReg(OriginalExec);
6829 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6833 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6860 if (!Subtarget->hasMadMacF32Insts())
6861 return Subtarget->hasFastFMAF32();
6867 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6870 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6886 switch (Ty.getScalarSizeInBits()) {
6904 if (Ty.getScalarSizeInBits() == 16)
6906 if (Ty.getScalarSizeInBits() == 32)
6907 return Subtarget->hasMadMacF32Insts() &&
6917 EVT VT =
N->getValueType(0);
6919 return Subtarget->hasMadMacF32Insts() &&
6921 if (VT == MVT::f16) {
6922 return Subtarget->hasMadF16() &&
6937 unsigned Opc =
Op.getOpcode();
6938 EVT VT =
Op.getValueType();
6939 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6940 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6941 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6942 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6943 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6944 VT == MVT::v32bf16);
6960 [[maybe_unused]]
EVT VT =
Op.getValueType();
6962 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6963 VT == MVT::v16i32) &&
6964 "Unexpected ValueType.");
6973 unsigned Opc =
Op.getOpcode();
6974 EVT VT =
Op.getValueType();
6975 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6976 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6977 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6978 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6979 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6980 VT == MVT::v32bf16);
6988 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
6990 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
6997 unsigned Opc =
Op.getOpcode();
6998 EVT VT =
Op.getValueType();
6999 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7000 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7001 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7002 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7003 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7004 VT == MVT::v32bf16);
7009 : std::pair(Op0, Op0);
7018 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
7020 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
7026 switch (
Op.getOpcode()) {
7030 return LowerBRCOND(
Op, DAG);
7032 return LowerRETURNADDR(
Op, DAG);
7034 return LowerSPONENTRY(
Op, DAG);
7037 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7038 "Load should return a value and a chain");
7042 EVT VT =
Op.getValueType();
7044 return lowerFSQRTF32(
Op, DAG);
7046 return lowerFSQRTF64(
Op, DAG);
7051 return LowerTrig(
Op, DAG);
7053 return LowerSELECT(
Op, DAG);
7055 return LowerFDIV(
Op, DAG);
7057 return LowerFFREXP(
Op, DAG);
7059 return LowerATOMIC_CMP_SWAP(
Op, DAG);
7061 return LowerSTORE(
Op, DAG);
7065 return LowerGlobalAddress(MFI,
Op, DAG);
7068 return LowerExternalSymbol(
Op, DAG);
7070 return LowerINTRINSIC_WO_CHAIN(
Op, DAG);
7072 return LowerINTRINSIC_W_CHAIN(
Op, DAG);
7074 return LowerINTRINSIC_VOID(
Op, DAG);
7076 return lowerADDRSPACECAST(
Op, DAG);
7078 return lowerINSERT_SUBVECTOR(
Op, DAG);
7080 return lowerINSERT_VECTOR_ELT(
Op, DAG);
7082 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
7084 return lowerVECTOR_SHUFFLE(
Op, DAG);
7086 return lowerSCALAR_TO_VECTOR(
Op, DAG);
7088 return lowerBUILD_VECTOR(
Op, DAG);
7091 return lowerFP_ROUND(
Op, DAG);
7093 return lowerTRAP(
Op, DAG);
7095 return lowerDEBUGTRAP(
Op, DAG);
7104 return lowerFMINNUM_FMAXNUM(
Op, DAG);
7107 return lowerFMINIMUMNUM_FMAXIMUMNUM(
Op, DAG);
7110 return lowerFMINIMUM_FMAXIMUM(
Op, DAG);
7113 return lowerFLDEXP(
Op, DAG);
7119 Op.getValueType() == MVT::i16 &&
7120 Op.getOperand(0).getValueType() == MVT::f32) {
7144 return lowerFCOPYSIGN(
Op, DAG);
7146 return lowerMUL(
Op, DAG);
7149 return lowerXMULO(
Op, DAG);
7152 return lowerXMUL_LOHI(
Op, DAG);
7187 EVT FittingLoadVT = LoadVT;
7219SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
7222 bool IsIntrinsic)
const {
7225 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7226 EVT LoadVT =
M->getValueType(0);
7228 EVT EquivLoadVT = LoadVT;
7242 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
7246 M->getMemoryVT(),
M->getMemOperand());
7257 EVT LoadVT =
M->getValueType(0);
7263 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
7264 bool IsTFE =
M->getNumValues() == 3;
7266 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7267 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7268 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7269 : AMDGPUISD::BUFFER_LOAD;
7272 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7277 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7281 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7282 M->getMemOperand(), DAG);
7286 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7288 M->getMemOperand(), DAG);
7296 EVT VT =
N->getValueType(0);
7297 unsigned CondCode =
N->getConstantOperandVal(3);
7308 EVT CmpVT =
LHS.getValueType();
7309 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7310 unsigned PromoteOp =
7330 EVT VT =
N->getValueType(0);
7332 unsigned CondCode =
N->getConstantOperandVal(3);
7341 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7350 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7359 EVT VT =
N->getValueType(0);
7383 Exec = AMDGPU::EXEC_LO;
7385 Exec = AMDGPU::EXEC;
7402 EVT VT =
N->getValueType(0);
7404 unsigned IID =
N->getConstantOperandVal(0);
7405 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7406 IID == Intrinsic::amdgcn_permlanex16;
7407 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7408 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7412 unsigned SplitSize = 32;
7413 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7414 ST->hasDPALU_DPP() &&
7422 case Intrinsic::amdgcn_permlane16:
7423 case Intrinsic::amdgcn_permlanex16:
7424 case Intrinsic::amdgcn_update_dpp:
7429 case Intrinsic::amdgcn_writelane:
7432 case Intrinsic::amdgcn_readlane:
7433 case Intrinsic::amdgcn_set_inactive:
7434 case Intrinsic::amdgcn_set_inactive_chain_arg:
7435 case Intrinsic::amdgcn_mov_dpp8:
7438 case Intrinsic::amdgcn_readfirstlane:
7439 case Intrinsic::amdgcn_permlane64:
7447 std::reverse(Operands.
begin(), Operands.
end());
7449 if (
SDNode *GL =
N->getGluedNode()) {
7451 GL = GL->getOperand(0).getNode();
7461 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7462 IID == Intrinsic::amdgcn_mov_dpp8 ||
7463 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7464 Src1 =
N->getOperand(2);
7465 if (IID == Intrinsic::amdgcn_writelane ||
7466 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7467 Src2 =
N->getOperand(3);
7470 if (ValSize == SplitSize) {
7480 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7485 if (IID == Intrinsic::amdgcn_writelane) {
7490 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7492 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7495 if (ValSize % SplitSize != 0)
7499 EVT VT =
N->getValueType(0);
7503 unsigned NumOperands =
N->getNumOperands();
7505 SDNode *GL =
N->getGluedNode();
7510 for (
unsigned i = 0; i != NE; ++i) {
7511 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7513 SDValue Operand =
N->getOperand(j);
7522 Operands[j] = Operand;
7527 Operands[NumOperands - 1] =
7543 if (SplitSize == 32) {
7545 return unrollLaneOp(LaneOp.
getNode());
7551 unsigned SubVecNumElt =
7555 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7556 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7560 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7565 if (IID == Intrinsic::amdgcn_writelane)
7570 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7571 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7572 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7573 EltIdx += SubVecNumElt;
7587 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7590 if (IID == Intrinsic::amdgcn_writelane)
7593 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7600 EVT VT =
N->getValueType(0);
7618 auto MakeIntrinsic = [&DAG, &SL](
unsigned IID,
MVT RetVT,
7622 Operands.
append(IntrinArgs);
7628 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7629 {ShiftedIndex, ValueI32});
7639 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7640 {ValueI32, PoisonVal});
7641 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7642 {ShiftedIndex, PoisonVal});
7645 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7648 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7649 {WWMIndex, WWMValue});
7650 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7651 MVT::i32, {WWMIndex, Swapped});
7653 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7661 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7669 DAG.
getSetCC(SL, MVT::i1, SameOrOtherHalf,
7679 switch (
N->getOpcode()) {
7691 unsigned IID =
N->getConstantOperandVal(0);
7693 case Intrinsic::amdgcn_make_buffer_rsrc:
7694 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7696 case Intrinsic::amdgcn_cvt_pkrtz: {
7701 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7705 case Intrinsic::amdgcn_cvt_pknorm_i16:
7706 case Intrinsic::amdgcn_cvt_pknorm_u16:
7707 case Intrinsic::amdgcn_cvt_pk_i16:
7708 case Intrinsic::amdgcn_cvt_pk_u16: {
7714 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7715 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7716 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7717 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7718 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7719 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7721 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7723 EVT VT =
N->getValueType(0);
7732 case Intrinsic::amdgcn_s_buffer_load: {
7738 if (!Subtarget->hasScalarSubwordLoads())
7744 EVT VT =
Op.getValueType();
7745 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7757 if (!
Offset->isDivergent()) {
7776 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7781 case Intrinsic::amdgcn_dead: {
7782 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7793 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7794 Results.push_back(Res.getOperand(
I));
7798 Results.push_back(Res.getValue(1));
7807 EVT VT =
N->getValueType(0);
7812 EVT SelectVT = NewVT;
7813 if (NewVT.
bitsLT(MVT::i32)) {
7816 SelectVT = MVT::i32;
7822 if (NewVT != SelectVT)
7828 if (
N->getValueType(0) != MVT::v2f16)
7840 if (
N->getValueType(0) != MVT::v2f16)
7852 if (
N->getValueType(0) != MVT::f16)
7867 if (U.get() !=
Value)
7870 if (U.getUser()->getOpcode() == Opcode)
7876unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7879 case Intrinsic::amdgcn_if:
7880 return AMDGPUISD::IF;
7881 case Intrinsic::amdgcn_else:
7882 return AMDGPUISD::ELSE;
7883 case Intrinsic::amdgcn_loop:
7884 return AMDGPUISD::LOOP;
7885 case Intrinsic::amdgcn_end_cf:
7905 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7932 SDNode *Intr = BRCOND.getOperand(1).getNode();
7949 Intr =
LHS.getNode();
7957 assert(BR &&
"brcond missing unconditional branch user");
7962 unsigned CFNode = isCFIntrinsic(Intr);
7982 Ops.push_back(Target);
8005 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
8024 MVT VT =
Op.getSimpleValueType();
8027 if (
Op.getConstantOperandVal(0) != 0)
8031 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8033 if (
Info->isEntryFunction())
8050 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
8064 return Op.getValueType().bitsLE(VT)
8072 EVT DstVT =
Op.getValueType();
8079 unsigned Opc =
Op.getOpcode();
8091 EVT SrcVT = Src.getValueType();
8092 EVT DstVT =
Op.getValueType();
8095 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
8098 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
8105 if (DstVT == MVT::f16) {
8110 if (!Subtarget->has16BitInsts()) {
8115 if (
Op->getFlags().hasApproximateFuncs()) {
8126 "custom lower FP_ROUND for f16 or bf16");
8127 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
8139 EVT VT =
Op.getValueType();
8141 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8142 bool IsIEEEMode =
Info->getMode().IEEE;
8151 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8158SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
8160 EVT VT =
Op.getValueType();
8162 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8163 bool IsIEEEMode =
Info->getMode().IEEE;
8168 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8176 EVT VT =
Op.getValueType();
8180 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8181 !Subtarget->hasMinimum3Maximum3F16() &&
8182 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8183 "should not need to widen f16 minimum/maximum to v2f16");
8197 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8205 EVT VT =
Op.getValueType();
8209 EVT ExpVT =
Exp.getValueType();
8210 if (ExpVT == MVT::i16)
8231 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
8238 switch (
Op->getOpcode()) {
8268 DAGCombinerInfo &DCI)
const {
8269 const unsigned Opc =
Op.getOpcode();
8277 :
Op->getOperand(0).getValueType();
8278 auto &DAG = DCI.DAG;
8281 if (DCI.isBeforeLegalizeOps() ||
8289 LHS =
Op->getOperand(1);
8290 RHS =
Op->getOperand(2);
8292 LHS =
Op->getOperand(0);
8293 RHS =
Op->getOperand(1);
8332 if (MagVT == SignVT)
8349 EVT VT =
Op.getValueType();
8355 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
8382 if (
Op->isDivergent())
8395 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8397 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8400 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8402 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8408 EVT VT =
Op.getValueType();
8415 const APInt &
C = RHSC->getAPIntValue();
8417 if (
C.isPowerOf2()) {
8419 bool UseArithShift = isSigned && !
C.isMinSignedValue();
8446 if (
Op->isDivergent()) {
8450 if (Subtarget->hasSMulHi()) {
8461 if (!Subtarget->hasTrapHandler() ||
8463 return lowerTrapEndpgm(
Op, DAG);
8465 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8466 : lowerTrapHsaQueuePtr(
Op, DAG);
8472 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8476SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8478 ImplicitParameter Param)
const {
8482 MachinePointerInfo PtrInfo =
8499 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8502 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8505 if (UserSGPR == AMDGPU::NoRegister) {
8522 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8531 if (Subtarget->hasPrivEnabledTrap2NopBug())
8532 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8536 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8544 if (!Subtarget->hasTrapHandler() ||
8548 "debugtrap handler not supported",
8556 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8559SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8561 if (Subtarget->hasApertureRegs()) {
8563 ? AMDGPU::SRC_SHARED_BASE
8564 : AMDGPU::SRC_PRIVATE_BASE;
8565 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8566 !Subtarget->hasGloballyAddressableScratch()) &&
8567 "Cannot use src_private_base with globally addressable scratch!");
8588 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8592 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8594 if (UserSGPR == AMDGPU::NoRegister) {
8639 const AMDGPUTargetMachine &TM =
8642 unsigned DestAS, SrcAS;
8644 bool IsNonNull =
false;
8646 SrcAS = ASC->getSrcAddressSpace();
8647 Src = ASC->getOperand(0);
8648 DestAS = ASC->getDestAddressSpace();
8651 Op.getConstantOperandVal(0) ==
8652 Intrinsic::amdgcn_addrspacecast_nonnull);
8653 Src =
Op->getOperand(1);
8654 SrcAS =
Op->getConstantOperandVal(2);
8655 DestAS =
Op->getConstantOperandVal(3);
8668 Subtarget->hasGloballyAddressableScratch()) {
8673 AMDGPU::S_MOV_B32, SL, MVT::i32,
8674 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8682 unsigned NullVal = TM.getNullPointerValue(DestAS);
8697 Subtarget->hasGloballyAddressableScratch()) {
8706 if (Subtarget->isWave64())
8712 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8720 AMDGPU::S_MOV_B64, SL, MVT::i64,
8721 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8723 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8725 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8733 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8745 Op.getValueType() == MVT::i64) {
8746 const SIMachineFunctionInfo *
Info =
8748 if (
Info->get32BitAddressHighBits() == 0)
8757 Src.getValueType() == MVT::i64)
8785 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8790 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8792 MVT::i32, InsNumElts / 2);
8797 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8799 if (InsNumElts == 2) {
8812 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8835 if (NumElts == 4 && EltSize == 16 && KIdx) {
8846 unsigned Idx = KIdx->getZExtValue();
8847 bool InsertLo = Idx < 2;
8851 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8857 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8870 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8905 EVT ResultVT =
Op.getValueType();
8918 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8921 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8925 if (VecSize == 128) {
8933 }
else if (VecSize == 256) {
8936 for (
unsigned P = 0;
P < 4; ++
P) {
8942 Parts[0], Parts[1]));
8944 Parts[2], Parts[3]));
8950 for (
unsigned P = 0;
P < 8; ++
P) {
8957 Parts[0], Parts[1], Parts[2], Parts[3]));
8960 Parts[4], Parts[5], Parts[6], Parts[7]));
8980 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8995 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9005 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9010 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9011 !(Mask[Elt + 1] & 1);
9017 EVT ResultVT =
Op.getValueType();
9020 const int NewSrcNumElts = 2;
9022 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
9038 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
9060 if (ShouldUseConsecutiveExtract &&
9063 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9064 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9076 if (Idx0 >= SrcNumElts) {
9081 if (Idx1 >= SrcNumElts) {
9086 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9087 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9095 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9096 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9101 if (SubVec0 != SubVec1) {
9102 NewMaskIdx1 += NewSrcNumElts;
9109 {NewMaskIdx0, NewMaskIdx1});
9114 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9115 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9116 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9117 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9136 EVT ResultVT =
Op.getValueType();
9152 EVT VT =
Op.getValueType();
9154 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9155 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
9189 for (
unsigned P = 0;
P < NumParts; ++
P) {
9191 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
9210 if (!Subtarget->isAmdHsaOS())
9253 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9262 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9270 EVT PtrVT =
Op.getValueType();
9272 const GlobalValue *GV = GSD->
getGlobal();
9286 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9301 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9304 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9305 if (Subtarget->has64BitLiterals()) {
9336 MachinePointerInfo PtrInfo =
9349 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
9373 SDValue Param = lowerKernargMemParameter(
9384 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9392 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9400 unsigned NumElts = Elts.
size();
9402 if (NumElts <= 12) {
9411 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9417 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9427 EVT SrcVT = Src.getValueType();
9448 bool Unpacked,
bool IsD16,
int DMaskPop,
9449 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9453 EVT ReqRetVT = ResultTypes[0];
9455 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9456 ? (ReqRetNumElts + 1) / 2
9459 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9470 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9481 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9483 NumDataDwords - MaskPopDwords);
9488 EVT LegalReqRetVT = ReqRetVT;
9490 if (!
Data.getValueType().isInteger())
9492 Data.getValueType().changeTypeToInteger(),
Data);
9513 if (Result->getNumValues() == 1)
9520 SDValue *LWE,
bool &IsTexFail) {
9540 unsigned DimIdx,
unsigned EndIdx,
9541 unsigned NumGradients) {
9543 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9551 if (((
I + 1) >= EndIdx) ||
9552 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9553 I == DimIdx + NumGradients - 1))) {
9575 !
Op.getNode()->hasAnyUseOfValue(0))
9577 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9587 ResultTypes.erase(&ResultTypes[0]);
9593 int NumVDataDwords = 0;
9594 bool AdjustRetType =
false;
9595 bool IsAtomicPacked16Bit =
false;
9598 const unsigned ArgOffset = WithChain ? 2 : 1;
9601 unsigned DMaskLanes = 0;
9603 if (BaseOpcode->
Atomic) {
9604 VData =
Op.getOperand(2);
9606 IsAtomicPacked16Bit =
9607 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9608 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9609 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9610 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9621 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9623 DMask = Is64Bit ? 0xf : 0x3;
9624 NumVDataDwords = Is64Bit ? 4 : 2;
9626 DMask = Is64Bit ? 0x3 : 0x1;
9627 NumVDataDwords = Is64Bit ? 2 : 1;
9630 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9633 if (BaseOpcode->
Store) {
9634 VData =
Op.getOperand(2);
9638 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9642 VData = handleD16VData(VData, DAG,
true);
9645 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9646 }
else if (!BaseOpcode->
NoReturn) {
9651 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9659 (!LoadVT.
isVector() && DMaskLanes > 1))
9665 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9666 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9667 NumVDataDwords = (DMaskLanes + 1) / 2;
9669 NumVDataDwords = DMaskLanes;
9671 AdjustRetType =
true;
9675 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9682 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9683 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9685 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9687 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9688 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9692 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9698 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9702 "Bias needs to be converted to 16 bit in A16 mode");
9707 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9711 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9712 "require 16 bit args for both gradients and addresses");
9717 if (!
ST->hasA16()) {
9718 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9719 "support 16 bit addresses\n");
9729 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9731 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9733 IntrOpcode = G16MappingInfo->
G16;
9756 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9774 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9775 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9776 const bool UseNSA =
ST->hasNSAEncoding() &&
9777 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9778 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9779 const bool UsePartialNSA =
9780 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9783 if (UsePartialNSA) {
9785 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9786 }
else if (!UseNSA) {
9796 uint64_t UnormConst =
9797 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9799 Unorm = UnormConst ? True : False;
9805 bool IsTexFail =
false;
9806 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9817 NumVDataDwords += 1;
9818 AdjustRetType =
true;
9823 if (AdjustRetType) {
9826 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9835 MVT::i32, NumVDataDwords)
9838 ResultTypes[0] = NewVT;
9839 if (ResultTypes.size() == 3) {
9843 ResultTypes.erase(&ResultTypes[1]);
9857 Ops.push_back(VData);
9858 if (UsePartialNSA) {
9860 Ops.push_back(VAddr);
9864 Ops.push_back(VAddr);
9867 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9869 Ops.push_back(Rsrc);
9874 Ops.push_back(Samp);
9879 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9880 Ops.push_back(Unorm);
9882 Ops.push_back(IsA16 &&
9883 ST->hasFeature(AMDGPU::FeatureR128A16)
9887 Ops.push_back(IsA16 ? True : False);
9889 if (!Subtarget->hasGFX90AInsts())
9894 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9897 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9900 Ops.push_back(DimInfo->
DA ? True : False);
9902 Ops.push_back(IsD16 ? True : False);
9904 Ops.push_back(
Op.getOperand(0));
9906 int NumVAddrDwords =
9912 NumVDataDwords, NumVAddrDwords);
9913 }
else if (IsGFX11Plus) {
9915 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9916 : AMDGPU::MIMGEncGfx11Default,
9917 NumVDataDwords, NumVAddrDwords);
9918 }
else if (IsGFX10Plus) {
9920 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9921 : AMDGPU::MIMGEncGfx10Default,
9922 NumVDataDwords, NumVAddrDwords);
9924 if (Subtarget->hasGFX90AInsts()) {
9926 NumVDataDwords, NumVAddrDwords);
9930 "requested image instruction is not supported on this GPU",
9935 for (EVT VT : OrigResultTypes) {
9936 if (VT == MVT::Other)
9937 RetValues[Idx++] =
Op.getOperand(0);
9948 NumVDataDwords, NumVAddrDwords);
9951 NumVDataDwords, NumVAddrDwords);
9958 MachineMemOperand *MemRef = MemOp->getMemOperand();
9977 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9978 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9991 MachinePointerInfo(),
9996 if (!
Offset->isDivergent()) {
10003 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10012 !Subtarget->hasScalarDwordx3Loads()) {
10016 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
10039 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10041 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
10045 unsigned NumLoads = 1;
10051 if (NumElts == 8 || NumElts == 16) {
10052 NumLoads = NumElts / 4;
10056 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
10061 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
10063 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
10064 for (
unsigned i = 0; i < NumLoads; ++i) {
10066 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
10067 LoadVT, MMO, DAG));
10070 if (NumElts == 8 || NumElts == 16)
10078 if (!Subtarget->hasArchitectedSGPRs())
10083 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10090 unsigned Width)
const {
10092 using namespace AMDGPU::Hwreg;
10094 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10133 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
10135 EVT VT =
Op.getValueType();
10137 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
10141 switch (IntrinsicID) {
10142 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10145 return getPreloadedValue(DAG, *MFI, VT,
10148 case Intrinsic::amdgcn_dispatch_ptr:
10149 case Intrinsic::amdgcn_queue_ptr: {
10150 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
10152 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
10153 DL.getDebugLoc()));
10157 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10160 return getPreloadedValue(DAG, *MFI, VT, RegID);
10162 case Intrinsic::amdgcn_implicitarg_ptr: {
10164 return getImplicitArgPtr(DAG,
DL);
10165 return getPreloadedValue(DAG, *MFI, VT,
10168 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10174 return getPreloadedValue(DAG, *MFI, VT,
10177 case Intrinsic::amdgcn_dispatch_id: {
10180 case Intrinsic::amdgcn_rcp:
10181 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
10182 case Intrinsic::amdgcn_rsq:
10183 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
10184 case Intrinsic::amdgcn_rsq_legacy:
10188 case Intrinsic::amdgcn_rcp_legacy:
10191 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
10192 case Intrinsic::amdgcn_rsq_clamp: {
10194 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
10206 case Intrinsic::r600_read_ngroups_x:
10207 if (Subtarget->isAmdHsaOS())
10210 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10213 case Intrinsic::r600_read_ngroups_y:
10214 if (Subtarget->isAmdHsaOS())
10217 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10220 case Intrinsic::r600_read_ngroups_z:
10221 if (Subtarget->isAmdHsaOS())
10224 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10227 case Intrinsic::r600_read_local_size_x:
10228 if (Subtarget->isAmdHsaOS())
10231 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10233 case Intrinsic::r600_read_local_size_y:
10234 if (Subtarget->isAmdHsaOS())
10237 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10239 case Intrinsic::r600_read_local_size_z:
10240 if (Subtarget->isAmdHsaOS())
10243 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10245 case Intrinsic::amdgcn_workgroup_id_x:
10246 return lowerWorkGroupId(DAG, *MFI, VT,
10250 case Intrinsic::amdgcn_workgroup_id_y:
10251 return lowerWorkGroupId(DAG, *MFI, VT,
10255 case Intrinsic::amdgcn_workgroup_id_z:
10256 return lowerWorkGroupId(DAG, *MFI, VT,
10260 case Intrinsic::amdgcn_cluster_id_x:
10261 return Subtarget->hasClusters()
10262 ? getPreloadedValue(DAG, *MFI, VT,
10264 : DAG.getPOISON(VT);
10265 case Intrinsic::amdgcn_cluster_id_y:
10266 return Subtarget->hasClusters()
10267 ? getPreloadedValue(DAG, *MFI, VT,
10270 case Intrinsic::amdgcn_cluster_id_z:
10271 return Subtarget->hasClusters()
10272 ? getPreloadedValue(DAG, *MFI, VT,
10275 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10276 return Subtarget->hasClusters()
10277 ? getPreloadedValue(
10281 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10282 return Subtarget->hasClusters()
10283 ? getPreloadedValue(
10287 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10288 return Subtarget->hasClusters()
10289 ? getPreloadedValue(
10293 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10294 return Subtarget->hasClusters()
10297 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10298 return Subtarget->hasClusters()
10299 ? getPreloadedValue(
10303 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10304 return Subtarget->hasClusters()
10305 ? getPreloadedValue(
10309 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10310 return Subtarget->hasClusters()
10311 ? getPreloadedValue(
10315 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10316 return Subtarget->hasClusters()
10317 ? getPreloadedValue(
10321 case Intrinsic::amdgcn_wave_id:
10322 return lowerWaveID(DAG,
Op);
10323 case Intrinsic::amdgcn_lds_kernel_id: {
10325 return getLDSKernelId(DAG,
DL);
10326 return getPreloadedValue(DAG, *MFI, VT,
10329 case Intrinsic::amdgcn_workitem_id_x:
10330 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10331 case Intrinsic::amdgcn_workitem_id_y:
10332 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10333 case Intrinsic::amdgcn_workitem_id_z:
10334 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10335 case Intrinsic::amdgcn_wavefrontsize:
10337 SDLoc(
Op), MVT::i32);
10338 case Intrinsic::amdgcn_s_buffer_load: {
10339 unsigned CPol =
Op.getConstantOperandVal(3);
10346 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10347 Op.getOperand(3), DAG);
10349 case Intrinsic::amdgcn_fdiv_fast:
10350 return lowerFDIV_FAST(
Op, DAG);
10351 case Intrinsic::amdgcn_sin:
10352 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10354 case Intrinsic::amdgcn_cos:
10355 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10357 case Intrinsic::amdgcn_mul_u24:
10358 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10360 case Intrinsic::amdgcn_mul_i24:
10361 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10364 case Intrinsic::amdgcn_log_clamp: {
10370 case Intrinsic::amdgcn_fract:
10371 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10373 case Intrinsic::amdgcn_class:
10374 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10376 case Intrinsic::amdgcn_div_fmas:
10377 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10378 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10380 case Intrinsic::amdgcn_div_fixup:
10381 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10382 Op.getOperand(2),
Op.getOperand(3));
10384 case Intrinsic::amdgcn_div_scale: {
10390 SDValue Denominator =
Op.getOperand(2);
10397 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10399 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10400 Denominator, Numerator);
10402 case Intrinsic::amdgcn_icmp: {
10404 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10405 Op.getConstantOperandVal(2) == 0 &&
10410 case Intrinsic::amdgcn_fcmp: {
10413 case Intrinsic::amdgcn_ballot:
10415 case Intrinsic::amdgcn_fmed3:
10416 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10417 Op.getOperand(2),
Op.getOperand(3));
10418 case Intrinsic::amdgcn_fdot2:
10419 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10420 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10421 case Intrinsic::amdgcn_fmul_legacy:
10422 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10424 case Intrinsic::amdgcn_sffbh:
10425 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10426 case Intrinsic::amdgcn_sbfe:
10427 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10428 Op.getOperand(2),
Op.getOperand(3));
10429 case Intrinsic::amdgcn_ubfe:
10430 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10431 Op.getOperand(2),
Op.getOperand(3));
10432 case Intrinsic::amdgcn_cvt_pkrtz:
10433 case Intrinsic::amdgcn_cvt_pknorm_i16:
10434 case Intrinsic::amdgcn_cvt_pknorm_u16:
10435 case Intrinsic::amdgcn_cvt_pk_i16:
10436 case Intrinsic::amdgcn_cvt_pk_u16: {
10438 EVT VT =
Op.getValueType();
10441 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10442 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10443 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10444 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10445 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10446 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10447 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10448 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10450 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10453 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10456 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10459 case Intrinsic::amdgcn_fmad_ftz:
10460 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10461 Op.getOperand(2),
Op.getOperand(3));
10463 case Intrinsic::amdgcn_if_break:
10465 Op->getOperand(1),
Op->getOperand(2)),
10468 case Intrinsic::amdgcn_groupstaticsize: {
10474 const GlobalValue *GV =
10480 case Intrinsic::amdgcn_is_shared:
10481 case Intrinsic::amdgcn_is_private: {
10488 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10492 Subtarget->hasGloballyAddressableScratch()) {
10495 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10496 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10505 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10508 case Intrinsic::amdgcn_perm:
10509 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10510 Op.getOperand(2),
Op.getOperand(3));
10511 case Intrinsic::amdgcn_reloc_constant: {
10521 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10522 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10524 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10525 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10529 if (
Op.getOperand(4).getValueType() == MVT::i32)
10535 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10536 Op.getOperand(3), IndexKeyi32);
10538 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10539 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10540 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10541 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10542 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10543 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10544 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10545 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10546 if (
Op.getOperand(4).getValueType() == MVT::i64)
10551 Op.getOperand(4).getValueType() == MVT::v2i32
10555 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10556 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10557 Op.getOperand(6)});
10559 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10560 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10561 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10562 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10563 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10564 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10565 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10568 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10573 Op.getOperand(6).getValueType().isVector()
10577 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10578 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
10579 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
10580 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10581 Args.push_back(
Op.getOperand(9));
10584 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10585 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10586 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10587 if (
Op.getOperand(6).getValueType() == MVT::i32)
10593 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10594 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10595 IndexKeyi32, Op.getOperand(7)});
10597 case Intrinsic::amdgcn_addrspacecast_nonnull:
10598 return lowerADDRSPACECAST(
Op, DAG);
10599 case Intrinsic::amdgcn_readlane:
10600 case Intrinsic::amdgcn_readfirstlane:
10601 case Intrinsic::amdgcn_writelane:
10602 case Intrinsic::amdgcn_permlane16:
10603 case Intrinsic::amdgcn_permlanex16:
10604 case Intrinsic::amdgcn_permlane64:
10605 case Intrinsic::amdgcn_set_inactive:
10606 case Intrinsic::amdgcn_set_inactive_chain_arg:
10607 case Intrinsic::amdgcn_mov_dpp8:
10608 case Intrinsic::amdgcn_update_dpp:
10610 case Intrinsic::amdgcn_dead: {
10612 for (
const EVT ValTy :
Op.getNode()->values())
10616 case Intrinsic::amdgcn_wave_shuffle:
10619 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10621 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10631 if (Subtarget->hasRestrictedSOffset() &&
isNullConstant(SOffset))
10632 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10638 unsigned NewOpcode)
const {
10642 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10643 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10661 M->getMemOperand());
10666 unsigned NewOpcode)
const {
10670 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10671 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10689 M->getMemOperand());
10694 unsigned IntrID =
Op.getConstantOperandVal(1);
10698 case Intrinsic::amdgcn_ds_ordered_add:
10699 case Intrinsic::amdgcn_ds_ordered_swap: {
10704 unsigned IndexOperand =
M->getConstantOperandVal(7);
10705 unsigned WaveRelease =
M->getConstantOperandVal(8);
10706 unsigned WaveDone =
M->getConstantOperandVal(9);
10708 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10709 IndexOperand &= ~0x3f;
10710 unsigned CountDw = 0;
10713 CountDw = (IndexOperand >> 24) & 0xf;
10714 IndexOperand &= ~(0xf << 24);
10716 if (CountDw < 1 || CountDw > 4) {
10719 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10720 DL.getDebugLoc()));
10725 if (IndexOperand) {
10728 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10731 if (WaveDone && !WaveRelease) {
10735 Fn,
"ds_ordered_count: wave_done requires wave_release",
10736 DL.getDebugLoc()));
10739 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10740 unsigned ShaderType =
10742 unsigned Offset0 = OrderedCountIndex << 2;
10743 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10746 Offset1 |= (CountDw - 1) << 6;
10749 Offset1 |= ShaderType << 2;
10751 unsigned Offset = Offset0 | (Offset1 << 8);
10758 M->getVTList(),
Ops,
M->getMemoryVT(),
10759 M->getMemOperand());
10761 case Intrinsic::amdgcn_raw_buffer_load:
10762 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10763 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10764 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10765 case Intrinsic::amdgcn_raw_buffer_load_format:
10766 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10767 const bool IsFormat =
10768 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10769 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10771 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10772 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10786 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10788 case Intrinsic::amdgcn_struct_buffer_load:
10789 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10790 case Intrinsic::amdgcn_struct_buffer_load_format:
10791 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10792 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10793 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10794 const bool IsFormat =
10795 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10796 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10798 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10799 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10814 case Intrinsic::amdgcn_raw_tbuffer_load:
10815 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10817 EVT LoadVT =
Op.getValueType();
10818 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10819 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10835 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10837 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10838 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10841 case Intrinsic::amdgcn_struct_tbuffer_load:
10842 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10844 EVT LoadVT =
Op.getValueType();
10845 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10846 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10862 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10864 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10865 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10868 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10869 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10870 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10871 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10872 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10873 return lowerStructBufferAtomicIntrin(
Op, DAG,
10874 AMDGPUISD::BUFFER_ATOMIC_FADD);
10875 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10876 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10877 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10878 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10880 return lowerStructBufferAtomicIntrin(
Op, DAG,
10881 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10882 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10883 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10884 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10885 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10886 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10887 return lowerStructBufferAtomicIntrin(
Op, DAG,
10888 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10889 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10890 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10891 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10892 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10893 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10894 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10895 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10896 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10897 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10898 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10900 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10901 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10903 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10904 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10905 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10906 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10907 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10908 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10909 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10910 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10911 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10912 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10913 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10915 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10916 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10918 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10919 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10920 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10921 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10922 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10923 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10924 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10925 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10926 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10927 return lowerStructBufferAtomicIntrin(
Op, DAG,
10928 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10929 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10930 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10931 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10932 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10933 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10934 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10935 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10937 return lowerStructBufferAtomicIntrin(
Op, DAG,
10938 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10939 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10941 return lowerStructBufferAtomicIntrin(
Op, DAG,
10942 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10943 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10945 return lowerStructBufferAtomicIntrin(
Op, DAG,
10946 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10947 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10949 return lowerStructBufferAtomicIntrin(
Op, DAG,
10950 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10951 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10952 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10953 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10954 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10955 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10956 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10957 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10959 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10960 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10961 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10962 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10963 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10965 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10966 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10968 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10969 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10970 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10971 return lowerStructBufferAtomicIntrin(
Op, DAG,
10972 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10973 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10974 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10975 return lowerRawBufferAtomicIntrin(
Op, DAG,
10976 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10977 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10978 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10979 return lowerStructBufferAtomicIntrin(
Op, DAG,
10980 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10981 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10982 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10983 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10984 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10998 EVT VT =
Op.getValueType();
11002 Op->getVTList(),
Ops, VT,
11003 M->getMemOperand());
11005 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11006 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11007 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
11008 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
11022 EVT VT =
Op.getValueType();
11026 Op->getVTList(),
Ops, VT,
11027 M->getMemOperand());
11029 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11030 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11032 SDValue NodePtr =
M->getOperand(2);
11033 SDValue RayExtent =
M->getOperand(3);
11034 SDValue InstanceMask =
M->getOperand(4);
11035 SDValue RayOrigin =
M->getOperand(5);
11036 SDValue RayDir =
M->getOperand(6);
11038 SDValue TDescr =
M->getOperand(8);
11043 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11048 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11049 const unsigned NumVDataDwords = 10;
11050 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11052 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11053 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11054 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11058 Ops.push_back(NodePtr);
11061 {DAG.getBitcast(MVT::i32, RayExtent),
11062 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11063 Ops.push_back(RayOrigin);
11064 Ops.push_back(RayDir);
11065 Ops.push_back(Offsets);
11066 Ops.push_back(TDescr);
11067 Ops.push_back(
M->getChain());
11070 MachineMemOperand *MemRef =
M->getMemOperand();
11074 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11076 SDValue NodePtr =
M->getOperand(2);
11077 SDValue RayExtent =
M->getOperand(3);
11078 SDValue RayOrigin =
M->getOperand(4);
11079 SDValue RayDir =
M->getOperand(5);
11080 SDValue RayInvDir =
M->getOperand(6);
11081 SDValue TDescr =
M->getOperand(7);
11088 if (!Subtarget->hasGFX10_AEncoding()) {
11098 const unsigned NumVDataDwords = 4;
11099 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11100 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11101 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11104 const unsigned BaseOpcodes[2][2] = {
11105 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11106 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11107 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11111 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11112 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11113 : AMDGPU::MIMGEncGfx10NSA,
11114 NumVDataDwords, NumVAddrDwords);
11118 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11119 : AMDGPU::MIMGEncGfx10Default,
11120 NumVDataDwords, NumVAddrDwords);
11126 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
11129 if (Lanes[0].getValueSizeInBits() == 32) {
11130 for (
unsigned I = 0;
I < 3; ++
I)
11137 Ops.push_back(Lanes[2]);
11149 if (UseNSA && IsGFX11Plus) {
11150 Ops.push_back(NodePtr);
11152 Ops.push_back(RayOrigin);
11157 for (
unsigned I = 0;
I < 3; ++
I) {
11160 {DirLanes[I], InvDirLanes[I]})));
11164 Ops.push_back(RayDir);
11165 Ops.push_back(RayInvDir);
11172 Ops.push_back(NodePtr);
11175 packLanes(RayOrigin,
true);
11176 packLanes(RayDir,
true);
11177 packLanes(RayInvDir,
false);
11182 if (NumVAddrDwords > 12) {
11190 Ops.push_back(MergedOps);
11193 Ops.push_back(TDescr);
11195 Ops.push_back(
M->getChain());
11198 MachineMemOperand *MemRef =
M->getMemOperand();
11202 case Intrinsic::amdgcn_global_atomic_fmin_num:
11203 case Intrinsic::amdgcn_global_atomic_fmax_num:
11204 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11205 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11212 unsigned Opcode = 0;
11214 case Intrinsic::amdgcn_global_atomic_fmin_num:
11215 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11219 case Intrinsic::amdgcn_global_atomic_fmax_num:
11220 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11227 return DAG.
getAtomic(Opcode, SDLoc(
Op),
M->getMemoryVT(),
M->getVTList(),
11228 Ops,
M->getMemOperand());
11230 case Intrinsic::amdgcn_s_alloc_vgpr: {
11238 ReadFirstLaneID, NumVGPRs);
11241 Op.getOperand(0),
Op.getOperand(1), NumVGPRs);
11243 case Intrinsic::amdgcn_s_get_barrier_state:
11244 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11251 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11252 BarID = (BarID >> 4) & 0x3F;
11253 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11256 Ops.push_back(Chain);
11258 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11259 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11267 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11275 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11276 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11277 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11281 EVT VT =
Op->getValueType(0);
11285 case Intrinsic::amdgcn_flat_load_monitor_b32:
11286 case Intrinsic::amdgcn_flat_load_monitor_b64:
11287 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11292 Op->getVTList(), {Chain, Ptr},
11295 case Intrinsic::amdgcn_global_load_monitor_b32:
11296 case Intrinsic::amdgcn_global_load_monitor_b64:
11297 case Intrinsic::amdgcn_global_load_monitor_b128: {
11302 Op->getVTList(), {Chain, Ptr},
11307 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11309 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11317SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
11324 EVT VT = VTList.
VTs[0];
11327 bool IsTFE = VTList.
NumVTs == 3;
11330 unsigned NumOpDWords = NumValueDWords + 1;
11332 SDVTList OpDWordsVTList = DAG.
getVTList(OpDWordsVT, VTList.
VTs[2]);
11333 MachineMemOperand *OpDWordsMMO =
11335 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList,
Ops,
11336 OpDWordsVT, OpDWordsMMO, DAG);
11341 NumValueDWords == 1
11350 if (!Subtarget->hasDwordx3LoadStores() &&
11351 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11355 SDVTList WidenedVTList = DAG.
getVTList(WidenedVT, VTList.
VTs[1]);
11357 WidenedMemVT, WidenedMMO);
11367 bool ImageStore)
const {
11377 if (Subtarget->hasUnpackedD16VMem()) {
11391 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11402 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
11408 if ((NumElements % 2) == 1) {
11410 unsigned I = Elts.
size() / 2;
11426 if (NumElements == 3) {
11445 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11446 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11447 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11448 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11449 case Intrinsic::amdgcn_load_async_to_lds:
11450 case Intrinsic::amdgcn_global_load_async_lds:
11460 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
11462 switch (IntrinsicID) {
11463 case Intrinsic::amdgcn_exp_compr: {
11464 if (!Subtarget->hasCompressedExport()) {
11467 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
11489 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11493 case Intrinsic::amdgcn_struct_tbuffer_store:
11494 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11496 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11498 VData = handleD16VData(VData, DAG);
11499 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11500 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11514 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11515 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11518 M->getMemoryVT(),
M->getMemOperand());
11521 case Intrinsic::amdgcn_raw_tbuffer_store:
11522 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11524 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11526 VData = handleD16VData(VData, DAG);
11527 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11528 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11542 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11543 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11546 M->getMemoryVT(),
M->getMemOperand());
11549 case Intrinsic::amdgcn_raw_buffer_store:
11550 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11551 case Intrinsic::amdgcn_raw_buffer_store_format:
11552 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11553 const bool IsFormat =
11554 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11555 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11562 VData = handleD16VData(VData, DAG);
11572 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11573 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11587 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11588 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11593 return handleByteShortBufferStores(DAG, VDataVT,
DL,
Ops, M);
11596 M->getMemoryVT(),
M->getMemOperand());
11599 case Intrinsic::amdgcn_struct_buffer_store:
11600 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11601 case Intrinsic::amdgcn_struct_buffer_store_format:
11602 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11603 const bool IsFormat =
11604 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11605 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11613 VData = handleD16VData(VData, DAG);
11623 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11624 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11638 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11639 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11643 EVT VDataType = VData.getValueType().getScalarType();
11645 return handleByteShortBufferStores(DAG, VDataType,
DL,
Ops, M);
11648 M->getMemoryVT(),
M->getMemOperand());
11650 case Intrinsic::amdgcn_raw_buffer_load_lds:
11651 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11652 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11653 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11654 case Intrinsic::amdgcn_struct_buffer_load_lds:
11655 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11656 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
11657 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
11658 if (!Subtarget->hasVMemToLDSLoad())
11662 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11663 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
11664 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
11665 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
11666 unsigned OpOffset = HasVIndex ? 1 : 0;
11667 SDValue VOffset =
Op.getOperand(5 + OpOffset);
11669 unsigned Size =
Op->getConstantOperandVal(4);
11675 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11676 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11677 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11678 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11681 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11682 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11683 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11684 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11687 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11688 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11689 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11690 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11693 if (!Subtarget->hasLDSLoadB96_B128())
11695 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11696 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11697 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11698 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11701 if (!Subtarget->hasLDSLoadB96_B128())
11703 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11704 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11705 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11706 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11714 if (HasVIndex && HasVOffset)
11718 else if (HasVIndex)
11719 Ops.push_back(
Op.getOperand(5));
11720 else if (HasVOffset)
11721 Ops.push_back(VOffset);
11723 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11724 Ops.push_back(Rsrc);
11725 Ops.push_back(
Op.getOperand(6 + OpOffset));
11726 Ops.push_back(
Op.getOperand(7 + OpOffset));
11728 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
11751 case Intrinsic::amdgcn_load_to_lds:
11752 case Intrinsic::amdgcn_load_async_to_lds:
11753 case Intrinsic::amdgcn_global_load_lds:
11754 case Intrinsic::amdgcn_global_load_async_lds: {
11755 if (!Subtarget->hasVMemToLDSLoad())
11759 unsigned Size =
Op->getConstantOperandVal(4);
11764 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11767 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11770 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11773 if (!Subtarget->hasLDSLoadB96_B128())
11775 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11778 if (!Subtarget->hasLDSLoadB96_B128())
11780 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11796 if (
LHS->isDivergent())
11800 RHS.getOperand(0).getValueType() == MVT::i32) {
11803 VOffset =
RHS.getOperand(0);
11807 Ops.push_back(Addr);
11815 Ops.push_back(VOffset);
11818 Ops.push_back(
Op.getOperand(5));
11820 unsigned Aux =
Op.getConstantOperandVal(6);
11835 case Intrinsic::amdgcn_end_cf:
11837 Op->getOperand(2), Chain),
11839 case Intrinsic::amdgcn_s_barrier_init:
11840 case Intrinsic::amdgcn_s_barrier_signal_var: {
11847 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11848 ? AMDGPU::S_BARRIER_INIT_M0
11849 : AMDGPU::S_BARRIER_SIGNAL_M0;
11864 constexpr unsigned ShAmt = 16;
11871 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11876 case Intrinsic::amdgcn_s_wakeup_barrier: {
11877 if (!Subtarget->hasSWakeupBarrier())
11881 case Intrinsic::amdgcn_s_barrier_join: {
11890 switch (IntrinsicID) {
11893 case Intrinsic::amdgcn_s_barrier_join:
11894 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11896 case Intrinsic::amdgcn_s_wakeup_barrier:
11897 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11901 unsigned BarID = (BarVal >> 4) & 0x3F;
11904 Ops.push_back(Chain);
11906 switch (IntrinsicID) {
11909 case Intrinsic::amdgcn_s_barrier_join:
11910 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11912 case Intrinsic::amdgcn_s_wakeup_barrier:
11913 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11924 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11930 case Intrinsic::amdgcn_s_prefetch_data: {
11933 return Op.getOperand(0);
11936 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11938 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
11945 Op->getVTList(),
Ops,
M->getMemoryVT(),
11946 M->getMemOperand());
11948 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11949 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11950 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11959 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11961 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11977 return PtrVT == MVT::i64;
11991std::pair<SDValue, SDValue>
12004 bool CheckNUW = Subtarget->hasGFX1250Insts();
12021 unsigned Overflow = ImmOffset & ~MaxImm;
12022 ImmOffset -= Overflow;
12023 if ((int32_t)Overflow < 0) {
12024 Overflow += ImmOffset;
12029 auto OverflowVal = DAG.
getConstant(Overflow,
DL, MVT::i32);
12048void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
12050 Align Alignment)
const {
12052 SDLoc
DL(CombinedOffset);
12054 uint32_t
Imm =
C->getZExtValue();
12055 uint32_t SOffset, ImmOffset;
12056 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12067 bool CheckNUW = Subtarget->hasGFX1250Insts();
12070 uint32_t SOffset, ImmOffset;
12073 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
12081 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12090SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
12093 return MaybePointer;
12107 SDValue NumRecords =
Op->getOperand(3);
12113 if (Subtarget->has45BitNumRecordsBufferResource()) {
12132 SDValue ExtShiftedStrideVec =
12144 DAG.
getNode(
ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12146 DAG.
getNode(
ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12151 auto [LowHalf, HighHalf] =
12152 DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12162 NumRecords, Flags);
12174 bool IsTFE)
const {
12179 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12180 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12183 SDVTList VTs = DAG.
getVTList(MVT::v2i32, MVT::Other);
12195 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12196 : AMDGPUISD::BUFFER_LOAD_USHORT;
12198 SDVTList ResList = DAG.
getVTList(MVT::i32, MVT::Other);
12212 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12216 Ops[1] = BufferStoreExt;
12217 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12218 : AMDGPUISD::BUFFER_STORE_SHORT;
12221 M->getMemOperand());
12246 DAGCombinerInfo &DCI)
const {
12247 SelectionDAG &DAG = DCI.DAG;
12262 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
12269 "unexpected vector extload");
12282 "unexpected fp extload");
12300 DCI.AddToWorklist(Cvt.
getNode());
12305 DCI.AddToWorklist(Cvt.
getNode());
12316 if (Info.isEntryFunction())
12317 return Info.getUserSGPRInfo().hasFlatScratchInit();
12325 EVT MemVT =
Load->getMemoryVT();
12326 MachineMemOperand *MMO =
Load->getMemOperand();
12338 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12366 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
12367 "Custom lowering for non-i32 vectors hasn't been implemented.");
12370 unsigned AS =
Load->getAddressSpace();
12371 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12378 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12382 !Subtarget->hasMultiDwordFlatScratchAddressing())
12392 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
12395 Alignment >=
Align(4) && NumElements < 32) {
12397 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12409 if (NumElements > 4)
12412 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12422 switch (Subtarget->getMaxPrivateElementSize()) {
12428 if (NumElements > 2)
12433 if (NumElements > 4)
12436 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12445 auto Flags =
Load->getMemOperand()->getFlags();
12447 Load->getAlign(), Flags, &
Fast) &&
12456 MemVT, *
Load->getMemOperand())) {
12465 EVT VT =
Op.getValueType();
12502 EVT VT =
Op.getValueType();
12503 const SDNodeFlags
Flags =
Op->getFlags();
12505 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
12511 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12514 if (CLHS->isExactlyValue(1.0)) {
12527 return DAG.
getNode(AMDGPUISD::RCP, SL, VT,
RHS);
12531 if (CLHS->isExactlyValue(-1.0)) {
12534 return DAG.
getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12540 if (!AllowInaccurateRcp &&
12541 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12555 EVT VT =
Op.getValueType();
12556 const SDNodeFlags
Flags =
Op->getFlags();
12558 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12559 if (!AllowInaccurateDiv)
12580 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12590 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12594 return DAG.
getNode(Opcode, SL, VTList,
12603 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12613 Opcode = AMDGPUISD::FMA_W_CHAIN;
12617 return DAG.
getNode(Opcode, SL, VTList,
12623 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12624 return FastLowered;
12627 EVT VT =
Op.getValueType();
12634 if (VT == MVT::bf16) {
12657 unsigned FMADOpCode =
12661 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
12664 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12666 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12667 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12677 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
12683 SDNodeFlags
Flags =
Op->getFlags();
12693 const APFloat K0Val(0x1p+96f);
12696 const APFloat K1Val(0x1p-32f);
12723 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12724 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12725 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12730 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12731 return FastLowered;
12737 SDNodeFlags
Flags =
Op->getFlags();
12738 Flags.setNoFPExcept(
true);
12746 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12755 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12759 using namespace AMDGPU::Hwreg;
12760 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12764 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12765 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12768 const bool HasDynamicDenormals =
12774 if (!PreservesDenormals) {
12779 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12782 if (HasDynamicDenormals) {
12786 SavedDenormMode =
SDValue(GetReg, 0);
12792 SDNode *EnableDenorm;
12793 if (Subtarget->hasDenormModeInst()) {
12794 const SDValue EnableDenormValue =
12797 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12801 const SDValue EnableDenormValue =
12803 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12804 {EnableDenormValue,
BitField, Glue});
12814 ApproxRcp, One, NegDivScale0, Flags);
12817 ApproxRcp, Fma0, Flags);
12823 NumeratorScaled,
Mul, Flags);
12829 NumeratorScaled, Fma3, Flags);
12831 if (!PreservesDenormals) {
12832 SDNode *DisableDenorm;
12833 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12837 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12839 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12843 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
12844 const SDValue DisableDenormValue =
12845 HasDynamicDenormals
12850 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12861 {Fma4, Fma1, Fma3, Scale},
Flags);
12863 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas,
RHS,
LHS, Flags);
12867 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
12868 return FastLowered;
12876 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
12882 SDValue Rcp = DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12900 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12930 DAG.
getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
Mul, Scale);
12932 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas,
Y,
X);
12936 EVT VT =
Op.getValueType();
12938 if (VT == MVT::f32)
12939 return LowerFDIV32(
Op, DAG);
12941 if (VT == MVT::f64)
12942 return LowerFDIV64(
Op, DAG);
12944 if (VT == MVT::f16 || VT == MVT::bf16)
12945 return LowerFDIV16(
Op, DAG);
12954 EVT ResultExpVT =
Op->getValueType(1);
12955 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12965 if (Subtarget->hasFractBug()) {
12983 EVT VT =
Store->getMemoryVT();
12985 if (VT == MVT::i1) {
12989 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12993 Store->getValue().getValueType().getScalarType() == MVT::i32);
12995 unsigned AS =
Store->getAddressSpace();
12996 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13004 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
13008 !Subtarget->hasMultiDwordFlatScratchAddressing())
13015 if (NumElements > 4)
13018 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13022 VT, *
Store->getMemOperand()))
13028 switch (Subtarget->getMaxPrivateElementSize()) {
13032 if (NumElements > 2)
13036 if (NumElements > 4 ||
13037 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13045 auto Flags =
Store->getMemOperand()->getFlags();
13064 assert(!Subtarget->has16BitInsts());
13065 SDNodeFlags
Flags =
Op->getFlags();
13079 SDNodeFlags
Flags =
Op->getFlags();
13080 MVT VT =
Op.getValueType().getSimpleVT();
13188 SDNodeFlags
Flags =
Op->getFlags();
13251 EVT VT =
Op.getValueType();
13262 if (!
V.getValueType().isVector())
13270 if (Subtarget->hasTrigReducedRange()) {
13272 TrigVal = UnrollIfVec(DAG.
getNode(AMDGPUISD::FRACT,
DL, VT, MulVal, Flags));
13277 switch (
Op.getOpcode()) {
13279 TrigVal = DAG.
getNode(AMDGPUISD::COS_HW, SDLoc(
Op), VT, TrigVal, Flags);
13282 TrigVal = DAG.
getNode(AMDGPUISD::SIN_HW, SDLoc(
Op), VT, TrigVal, Flags);
13288 return UnrollIfVec(TrigVal);
13308 EVT VT =
Op.getValueType();
13316 Op->getVTList(),
Ops, VT,
13325SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
13326 DAGCombinerInfo &DCI)
const {
13327 EVT VT =
N->getValueType(0);
13329 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13332 SelectionDAG &DAG = DCI.DAG;
13336 EVT SrcVT = Src.getValueType();
13342 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13345 DCI.AddToWorklist(Cvt.
getNode());
13348 if (ScalarVT != MVT::f32) {
13360 DAGCombinerInfo &DCI)
const {
13371 SelectionDAG &DAG = DCI.DAG;
13390 for (
unsigned I = 0;
I != NumElts; ++
I) {
13414 if (NewElts.
size() == 1)
13436 for (
unsigned I = 0;
I != NumElts; ++
I) {
13471SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
13473 DAGCombinerInfo &DCI)
const {
13490 SelectionDAG &DAG = DCI.DAG;
13503 AM.BaseOffs =
Offset.getSExtValue();
13508 EVT VT =
N->getValueType(0);
13514 Flags.setNoUnsignedWrap(
13515 N->getFlags().hasNoUnsignedWrap() &&
13527 switch (
N->getOpcode()) {
13538 DAGCombinerInfo &DCI)
const {
13539 SelectionDAG &DAG = DCI.DAG;
13546 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
13547 N->getMemoryVT(), DCI);
13551 NewOps[PtrIdx] = NewPtr;
13560 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13561 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13570SDValue SITargetLowering::splitBinaryBitConstantOp(
13574 uint32_t ValLo =
Lo_32(Val);
13575 uint32_t ValHi =
Hi_32(Val);
13582 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13596 if (V.getValueType() != MVT::i1)
13598 switch (V.getOpcode()) {
13603 case AMDGPUISD::FP_CLASS:
13615 return V.getResNo() == 1;
13617 unsigned IntrinsicID = V.getConstantOperandVal(0);
13618 switch (IntrinsicID) {
13619 case Intrinsic::amdgcn_is_shared:
13620 case Intrinsic::amdgcn_is_private:
13637 if (!(
C & 0x000000ff))
13638 ZeroByteMask |= 0x000000ff;
13639 if (!(
C & 0x0000ff00))
13640 ZeroByteMask |= 0x0000ff00;
13641 if (!(
C & 0x00ff0000))
13642 ZeroByteMask |= 0x00ff0000;
13643 if (!(
C & 0xff000000))
13644 ZeroByteMask |= 0xff000000;
13645 uint32_t NonZeroByteMask = ~ZeroByteMask;
13646 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13659 assert(V.getValueSizeInBits() == 32);
13661 if (V.getNumOperands() != 2)
13670 switch (V.getOpcode()) {
13675 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13680 return (0x03020100 & ~ConstMask) | ConstMask;
13687 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13693 return uint32_t(0x0c0c0c0c03020100ull >>
C);
13700 DAGCombinerInfo &DCI)
const {
13701 if (DCI.isBeforeLegalize())
13704 SelectionDAG &DAG = DCI.DAG;
13705 EVT VT =
N->getValueType(0);
13710 if (VT == MVT::i64 && CRHS) {
13712 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13716 if (CRHS && VT == MVT::i32) {
13726 unsigned Shift = CShift->getZExtValue();
13728 unsigned Offset = NB + Shift;
13729 if ((
Offset & (Bits - 1)) == 0) {
13732 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
13753 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13755 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13768 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
13773 if (
X !=
LHS.getOperand(1))
13777 const ConstantFPSDNode *C1 =
13794 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
13800 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13803 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13811 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13812 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13814 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13815 :
Mask->getZExtValue() & OrdMask;
13818 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
13836 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13839 if (LHSMask != ~0u && RHSMask != ~0u) {
13842 if (LHSMask > RHSMask) {
13849 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13850 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13853 if (!(LHSUsedLanes & RHSUsedLanes) &&
13856 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13862 uint32_t
Mask = LHSMask & RHSMask;
13863 for (
unsigned I = 0;
I < 32;
I += 8) {
13864 uint32_t ByteSel = 0xff <<
I;
13865 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13866 Mask &= (0x0c <<
I) & 0xffffffff;
13871 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
13874 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13924static const std::optional<ByteProvider<SDValue>>
13926 unsigned Depth = 0) {
13929 return std::nullopt;
13931 if (
Op.getValueSizeInBits() < 8)
13932 return std::nullopt;
13934 if (
Op.getValueType().isVector())
13937 switch (
Op->getOpcode()) {
13950 NarrowVT = VTSign->getVT();
13953 return std::nullopt;
13956 if (SrcIndex >= NarrowByteWidth)
13957 return std::nullopt;
13965 return std::nullopt;
13967 uint64_t BitShift = ShiftOp->getZExtValue();
13969 if (BitShift % 8 != 0)
13970 return std::nullopt;
13972 SrcIndex += BitShift / 8;
13990static const std::optional<ByteProvider<SDValue>>
13992 unsigned StartingIndex = 0) {
13996 return std::nullopt;
13998 unsigned BitWidth =
Op.getScalarValueSizeInBits();
14000 return std::nullopt;
14002 return std::nullopt;
14004 bool IsVec =
Op.getValueType().isVector();
14005 switch (
Op.getOpcode()) {
14008 return std::nullopt;
14013 return std::nullopt;
14017 return std::nullopt;
14020 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
14021 return std::nullopt;
14022 if (!
LHS ||
LHS->isConstantZero())
14024 if (!
RHS ||
RHS->isConstantZero())
14026 return std::nullopt;
14031 return std::nullopt;
14035 return std::nullopt;
14037 uint32_t BitMask = BitMaskOp->getZExtValue();
14039 uint32_t IndexMask = 0xFF << (Index * 8);
14041 if ((IndexMask & BitMask) != IndexMask) {
14044 if (IndexMask & BitMask)
14045 return std::nullopt;
14054 return std::nullopt;
14058 if (!ShiftOp ||
Op.getValueType().isVector())
14059 return std::nullopt;
14061 uint64_t BitsProvided =
Op.getValueSizeInBits();
14062 if (BitsProvided % 8 != 0)
14063 return std::nullopt;
14065 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14067 return std::nullopt;
14069 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14070 uint64_t ByteShift = BitShift / 8;
14072 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14073 uint64_t BytesProvided = BitsProvided / 8;
14074 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14075 NewIndex %= BytesProvided;
14082 return std::nullopt;
14086 return std::nullopt;
14088 uint64_t BitShift = ShiftOp->getZExtValue();
14090 return std::nullopt;
14092 auto BitsProvided =
Op.getScalarValueSizeInBits();
14093 if (BitsProvided % 8 != 0)
14094 return std::nullopt;
14096 uint64_t BytesProvided = BitsProvided / 8;
14097 uint64_t ByteShift = BitShift / 8;
14102 return BytesProvided - ByteShift > Index
14110 return std::nullopt;
14114 return std::nullopt;
14116 uint64_t BitShift = ShiftOp->getZExtValue();
14117 if (BitShift % 8 != 0)
14118 return std::nullopt;
14119 uint64_t ByteShift = BitShift / 8;
14125 return Index < ByteShift
14128 Depth + 1, StartingIndex);
14137 return std::nullopt;
14145 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14147 if (NarrowBitWidth % 8 != 0)
14148 return std::nullopt;
14149 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14151 if (Index >= NarrowByteWidth)
14153 ? std::optional<ByteProvider<SDValue>>(
14161 return std::nullopt;
14165 if (NarrowByteWidth >= Index) {
14170 return std::nullopt;
14177 return std::nullopt;
14183 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14184 if (NarrowBitWidth % 8 != 0)
14185 return std::nullopt;
14186 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14191 if (Index >= NarrowByteWidth) {
14193 ? std::optional<ByteProvider<SDValue>>(
14198 if (NarrowByteWidth > Index) {
14202 return std::nullopt;
14207 return std::nullopt;
14210 Depth + 1, StartingIndex);
14216 return std::nullopt;
14217 auto VecIdx = IdxOp->getZExtValue();
14218 auto ScalarSize =
Op.getScalarValueSizeInBits();
14219 if (ScalarSize < 32)
14220 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14222 StartingIndex, Index);
14225 case AMDGPUISD::PERM: {
14227 return std::nullopt;
14231 return std::nullopt;
14234 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14235 if (IdxMask > 0x07 && IdxMask != 0x0c)
14236 return std::nullopt;
14238 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14239 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14241 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
14247 return std::nullopt;
14262 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
14269 auto MemVT = L->getMemoryVT();
14272 return L->getMemoryVT().getSizeInBits() == 16;
14282 int Low8 = Mask & 0xff;
14283 int Hi8 = (Mask & 0xff00) >> 8;
14285 assert(Low8 < 8 && Hi8 < 8);
14287 bool IsConsecutive = (Hi8 - Low8 == 1);
14292 bool Is16Aligned = !(Low8 % 2);
14294 return IsConsecutive && Is16Aligned;
14302 int Low16 = PermMask & 0xffff;
14303 int Hi16 = (PermMask & 0xffff0000) >> 16;
14313 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14315 if (!OtherOpIs16Bit)
14323 unsigned DWordOffset) {
14328 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14333 if (Src.getValueType().isVector()) {
14334 auto ScalarTySize = Src.getScalarValueSizeInBits();
14335 auto ScalarTy = Src.getValueType().getScalarType();
14336 if (ScalarTySize == 32) {
14340 if (ScalarTySize > 32) {
14343 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14344 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14351 assert(ScalarTySize < 32);
14352 auto NumElements =
TypeSize / ScalarTySize;
14353 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14354 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14355 auto NumElementsIn32 = 32 / ScalarTySize;
14356 auto NumAvailElements = DWordOffset < Trunc32Elements
14358 : NumElements - NormalizedTrunc;
14371 auto ShiftVal = 32 * DWordOffset;
14379 [[maybe_unused]]
EVT VT =
N->getValueType(0);
14384 for (
int i = 0; i < 4; i++) {
14386 std::optional<ByteProvider<SDValue>>
P =
14389 if (!
P ||
P->isConstantZero())
14394 if (PermNodes.
size() != 4)
14397 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14398 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14400 for (
size_t i = 0; i < PermNodes.
size(); i++) {
14401 auto PermOp = PermNodes[i];
14404 int SrcByteAdjust = 4;
14408 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14409 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14411 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14412 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14416 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14417 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14420 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14422 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14425 SDValue Op = *PermNodes[FirstSrc.first].Src;
14427 assert(
Op.getValueSizeInBits() == 32);
14431 int Low16 = PermMask & 0xffff;
14432 int Hi16 = (PermMask & 0xffff0000) >> 16;
14434 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14435 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14438 if (WellFormedLow && WellFormedHi)
14442 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
14451 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
14452 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
14457 assert(
Op.getValueType().isByteSized() &&
14468 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
14475 DAGCombinerInfo &DCI)
const {
14476 SelectionDAG &DAG = DCI.DAG;
14480 EVT VT =
N->getValueType(0);
14481 if (VT == MVT::i1) {
14483 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14484 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14486 if (Src !=
RHS.getOperand(0))
14491 if (!CLHS || !CRHS)
14495 static const uint32_t MaxMask = 0x3ff;
14500 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
14509 LHS.getOpcode() == AMDGPUISD::PERM &&
14515 Sel |=
LHS.getConstantOperandVal(2);
14517 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14524 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14528 auto usesCombinedOperand = [](SDNode *OrUse) {
14531 !OrUse->getValueType(0).isVector())
14535 for (
auto *VUser : OrUse->users()) {
14536 if (!VUser->getValueType(0).isVector())
14543 if (VUser->getOpcode() == VectorwiseOp)
14549 if (!
any_of(
N->users(), usesCombinedOperand))
14555 if (LHSMask != ~0u && RHSMask != ~0u) {
14558 if (LHSMask > RHSMask) {
14565 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14566 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14569 if (!(LHSUsedLanes & RHSUsedLanes) &&
14572 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14574 LHSMask &= ~RHSUsedLanes;
14575 RHSMask &= ~LHSUsedLanes;
14577 LHSMask |= LHSUsedLanes & 0x04040404;
14579 uint32_t Sel = LHSMask | RHSMask;
14582 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14587 if (LHSMask == ~0u || RHSMask == ~0u) {
14628 return IdentitySrc;
14634 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14649 if (SrcVT == MVT::i32) {
14654 DCI.AddToWorklist(LowOr.
getNode());
14655 DCI.AddToWorklist(HiBits.getNode());
14666 N->getOperand(0), CRHS))
14674 DAGCombinerInfo &DCI)
const {
14675 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14682 SelectionDAG &DAG = DCI.DAG;
14684 EVT VT =
N->getValueType(0);
14685 if (CRHS && VT == MVT::i64) {
14687 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14694 unsigned Opc =
LHS.getOpcode();
14724 LHS->getOperand(0), FNegLHS, FNegRHS);
14733SITargetLowering::performZeroOrAnyExtendCombine(
SDNode *
N,
14734 DAGCombinerInfo &DCI)
const {
14735 if (!Subtarget->has16BitInsts() ||
14739 EVT VT =
N->getValueType(0);
14740 if (VT != MVT::i32)
14744 if (Src.getValueType() != MVT::i16)
14747 if (!Src->hasOneUse())
14754 std::optional<ByteProvider<SDValue>> BP0 =
14756 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14760 std::optional<ByteProvider<SDValue>> BP1 =
14762 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14770 SelectionDAG &DAG = DCI.DAG;
14772 uint32_t PermMask = 0x0c0c0c0c;
14775 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14780 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14783 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32, V0, V1,
14788SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14789 DAGCombinerInfo &DCI)
const {
14795 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14796 VTSign->getVT() == MVT::i8) ||
14797 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14798 VTSign->getVT() == MVT::i16))) {
14799 assert(Subtarget->hasScalarSubwordLoads() &&
14800 "s_buffer_load_{u8, i8} are supported "
14801 "in GFX12 (or newer) architectures.");
14802 EVT VT = Src.getValueType();
14803 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14804 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14805 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14807 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14814 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14815 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14819 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14820 VTSign->getVT() == MVT::i8) ||
14821 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14822 VTSign->getVT() == MVT::i16)) &&
14831 Src.getOperand(6), Src.getOperand(7)};
14834 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14835 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14836 ? AMDGPUISD::BUFFER_LOAD_BYTE
14837 : AMDGPUISD::BUFFER_LOAD_SHORT;
14838 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14839 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14840 return DCI.DAG.getMergeValues(
14841 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14847 DAGCombinerInfo &DCI)
const {
14848 SelectionDAG &DAG = DCI.DAG;
14855 if (
N->getOperand(0).isUndef())
14862 DAGCombinerInfo &DCI)
const {
14863 EVT VT =
N->getValueType(0);
14873 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(
N), VT, N0,
14880 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
14889 unsigned MaxDepth)
const {
14890 unsigned Opcode =
Op.getOpcode();
14895 const auto &
F = CFP->getValueAPF();
14896 if (
F.isNaN() &&
F.isSignaling())
14898 if (!
F.isDenormal())
14930 case AMDGPUISD::FMUL_LEGACY:
14931 case AMDGPUISD::FMAD_FTZ:
14932 case AMDGPUISD::RCP:
14933 case AMDGPUISD::RSQ:
14934 case AMDGPUISD::RSQ_CLAMP:
14935 case AMDGPUISD::RCP_LEGACY:
14936 case AMDGPUISD::RCP_IFLAG:
14937 case AMDGPUISD::LOG:
14938 case AMDGPUISD::EXP:
14939 case AMDGPUISD::DIV_SCALE:
14940 case AMDGPUISD::DIV_FMAS:
14941 case AMDGPUISD::DIV_FIXUP:
14942 case AMDGPUISD::FRACT:
14943 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14944 case AMDGPUISD::CVT_F32_UBYTE0:
14945 case AMDGPUISD::CVT_F32_UBYTE1:
14946 case AMDGPUISD::CVT_F32_UBYTE2:
14947 case AMDGPUISD::CVT_F32_UBYTE3:
14948 case AMDGPUISD::FP_TO_FP16:
14949 case AMDGPUISD::SIN_HW:
14950 case AMDGPUISD::COS_HW:
14961 if (
Op.getValueType() == MVT::i32) {
14967 if (RHS->getZExtValue() == 0xffff0000) {
14977 return Op.getValueType().getScalarType() != MVT::f16;
14987 case AMDGPUISD::CLAMP:
14988 case AMDGPUISD::FMED3:
14989 case AMDGPUISD::FMAX3:
14990 case AMDGPUISD::FMIN3:
14991 case AMDGPUISD::FMAXIMUM3:
14992 case AMDGPUISD::FMINIMUM3: {
14998 if (Subtarget->supportsMinMaxDenormModes() ||
15008 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
15020 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
15047 if (
Op.getValueType() == MVT::i16) {
15058 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
15060 switch (IntrinsicID) {
15061 case Intrinsic::amdgcn_cvt_pkrtz:
15062 case Intrinsic::amdgcn_cubeid:
15063 case Intrinsic::amdgcn_frexp_mant:
15064 case Intrinsic::amdgcn_fdot2:
15065 case Intrinsic::amdgcn_rcp:
15066 case Intrinsic::amdgcn_rsq:
15067 case Intrinsic::amdgcn_rsq_clamp:
15068 case Intrinsic::amdgcn_rcp_legacy:
15069 case Intrinsic::amdgcn_rsq_legacy:
15070 case Intrinsic::amdgcn_trig_preop:
15071 case Intrinsic::amdgcn_tanh:
15072 case Intrinsic::amdgcn_log:
15073 case Intrinsic::amdgcn_exp2:
15074 case Intrinsic::amdgcn_sqrt:
15092 unsigned MaxDepth)
const {
15095 unsigned Opcode =
MI->getOpcode();
15097 if (Opcode == AMDGPU::G_FCANONICALIZE)
15100 std::optional<FPValueAndVReg> FCR;
15103 if (FCR->Value.isSignaling())
15105 if (!FCR->Value.isDenormal())
15116 case AMDGPU::G_FADD:
15117 case AMDGPU::G_FSUB:
15118 case AMDGPU::G_FMUL:
15119 case AMDGPU::G_FCEIL:
15120 case AMDGPU::G_FFLOOR:
15121 case AMDGPU::G_FRINT:
15122 case AMDGPU::G_FNEARBYINT:
15123 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15124 case AMDGPU::G_INTRINSIC_TRUNC:
15125 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15126 case AMDGPU::G_FMA:
15127 case AMDGPU::G_FMAD:
15128 case AMDGPU::G_FSQRT:
15129 case AMDGPU::G_FDIV:
15130 case AMDGPU::G_FREM:
15131 case AMDGPU::G_FPOW:
15132 case AMDGPU::G_FPEXT:
15133 case AMDGPU::G_FLOG:
15134 case AMDGPU::G_FLOG2:
15135 case AMDGPU::G_FLOG10:
15136 case AMDGPU::G_FPTRUNC:
15137 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15138 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15139 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15140 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15141 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15143 case AMDGPU::G_FNEG:
15144 case AMDGPU::G_FABS:
15145 case AMDGPU::G_FCOPYSIGN:
15147 case AMDGPU::G_FMINNUM:
15148 case AMDGPU::G_FMAXNUM:
15149 case AMDGPU::G_FMINNUM_IEEE:
15150 case AMDGPU::G_FMAXNUM_IEEE:
15151 case AMDGPU::G_FMINIMUM:
15152 case AMDGPU::G_FMAXIMUM:
15153 case AMDGPU::G_FMINIMUMNUM:
15154 case AMDGPU::G_FMAXIMUMNUM: {
15155 if (Subtarget->supportsMinMaxDenormModes() ||
15162 case AMDGPU::G_BUILD_VECTOR:
15167 case AMDGPU::G_INTRINSIC:
15168 case AMDGPU::G_INTRINSIC_CONVERGENT:
15170 case Intrinsic::amdgcn_fmul_legacy:
15171 case Intrinsic::amdgcn_fmad_ftz:
15172 case Intrinsic::amdgcn_sqrt:
15173 case Intrinsic::amdgcn_fmed3:
15174 case Intrinsic::amdgcn_sin:
15175 case Intrinsic::amdgcn_cos:
15176 case Intrinsic::amdgcn_log:
15177 case Intrinsic::amdgcn_exp2:
15178 case Intrinsic::amdgcn_log_clamp:
15179 case Intrinsic::amdgcn_rcp:
15180 case Intrinsic::amdgcn_rcp_legacy:
15181 case Intrinsic::amdgcn_rsq:
15182 case Intrinsic::amdgcn_rsq_clamp:
15183 case Intrinsic::amdgcn_rsq_legacy:
15184 case Intrinsic::amdgcn_div_scale:
15185 case Intrinsic::amdgcn_div_fmas:
15186 case Intrinsic::amdgcn_div_fixup:
15187 case Intrinsic::amdgcn_fract:
15188 case Intrinsic::amdgcn_cvt_pkrtz:
15189 case Intrinsic::amdgcn_cubeid:
15190 case Intrinsic::amdgcn_cubema:
15191 case Intrinsic::amdgcn_cubesc:
15192 case Intrinsic::amdgcn_cubetc:
15193 case Intrinsic::amdgcn_frexp_mant:
15194 case Intrinsic::amdgcn_fdot2:
15195 case Intrinsic::amdgcn_trig_preop:
15196 case Intrinsic::amdgcn_tanh:
15215 if (
C.isDenormal()) {
15229 if (
C.isSignaling()) {
15252SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
15253 DAGCombinerInfo &DCI)
const {
15254 SelectionDAG &DAG = DCI.DAG;
15256 EVT VT =
N->getValueType(0);
15265 EVT VT =
N->getValueType(0);
15266 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
15282 EVT EltVT =
Lo.getValueType();
15285 for (
unsigned I = 0;
I != 2; ++
I) {
15289 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15290 }
else if (
Op.isUndef()) {
15326 return AMDGPUISD::FMAX3;
15328 return AMDGPUISD::FMAXIMUM3;
15330 return AMDGPUISD::SMAX3;
15332 return AMDGPUISD::UMAX3;
15336 return AMDGPUISD::FMIN3;
15338 return AMDGPUISD::FMINIMUM3;
15340 return AMDGPUISD::SMIN3;
15342 return AMDGPUISD::UMIN3;
15363 if (!MinK || !MaxK)
15375 unsigned Med3Opc =
Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15376 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15377 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15436 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15442 if (
Info->getMode().DX10Clamp) {
15451 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15483 case AMDGPUISD::FMIN_LEGACY:
15484 case AMDGPUISD::FMAX_LEGACY:
15485 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
15486 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15489 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15490 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15491 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15496 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
15505 DAGCombinerInfo &DCI)
const {
15506 SelectionDAG &DAG = DCI.DAG;
15538 if (
SDValue Med3 = performIntMed3ImmCombine(
15543 if (
SDValue Med3 = performIntMed3ImmCombine(
15549 if (
SDValue Med3 = performIntMed3ImmCombine(
15554 if (
SDValue Med3 = performIntMed3ImmCombine(
15567 (
Opc == AMDGPUISD::FMIN_LEGACY &&
15568 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15569 (VT == MVT::f32 || VT == MVT::f64 ||
15570 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15571 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15572 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15573 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15575 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
15582 const SDNodeFlags
Flags =
N->getFlags();
15584 !Subtarget->hasIEEEMinimumMaximumInsts() &&
15588 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15598 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15599 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15608 DAGCombinerInfo &DCI)
const {
15609 EVT VT =
N->getValueType(0);
15613 SelectionDAG &DAG = DCI.DAG;
15624 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15628 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15632 if (
Info->getMode().DX10Clamp) {
15645 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15652 DAGCombinerInfo &DCI)
const {
15656 return DCI.DAG.getUNDEF(
N->getValueType(0));
15664 bool IsDivergentIdx,
15669 unsigned VecSize = EltSize * NumElem;
15672 if (VecSize <= 64 && EltSize < 32)
15681 if (IsDivergentIdx)
15685 unsigned NumInsts = NumElem +
15686 ((EltSize + 31) / 32) * NumElem ;
15690 if (Subtarget->useVGPRIndexMode())
15691 return NumInsts <= 16;
15695 if (Subtarget->hasMovrel())
15696 return NumInsts <= 15;
15702 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15717SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15718 DAGCombinerInfo &DCI)
const {
15724 EVT ResVT =
N->getValueType(0);
15748 if (!
C ||
C->getZExtValue() != 0x1f)
15764 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15792 DCI.AddToWorklist(Elt0.
getNode());
15793 DCI.AddToWorklist(Elt1.
getNode());
15824 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
15825 uint64_t KImmValue = KImm->getZExtValue();
15827 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
15830 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
15831 uint64_t KFPImmValue =
15832 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
15833 return DAG.
getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
15839 if (!DCI.isBeforeLegalize())
15846 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15849 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15850 unsigned EltIdx = BitIndex / 32;
15851 unsigned LeftoverBitIdx = BitIndex % 32;
15855 DCI.AddToWorklist(Cast.
getNode());
15859 DCI.AddToWorklist(Elt.
getNode());
15862 DCI.AddToWorklist(Srl.
getNode());
15866 DCI.AddToWorklist(Trunc.
getNode());
15868 if (VecEltVT == ResVT) {
15880SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15881 DAGCombinerInfo &DCI)
const {
15892 SelectionDAG &DAG = DCI.DAG;
15912 Src.getOperand(0).getValueType() == MVT::f16) {
15913 return Src.getOperand(0);
15917 APFloat Val = CFP->getValueAPF();
15918 bool LosesInfo =
true;
15928 DAGCombinerInfo &DCI)
const {
15929 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15930 "combine only useful on gfx8");
15932 SDValue TruncSrc =
N->getOperand(0);
15933 EVT VT =
N->getValueType(0);
15934 if (VT != MVT::f16)
15937 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
15941 SelectionDAG &DAG = DCI.DAG;
15972unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15974 const SDNode *N1)
const {
15979 if (((VT == MVT::f32 &&
15981 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16001 EVT VT =
N->getValueType(0);
16002 if (VT != MVT::i32 && VT != MVT::i64)
16008 unsigned Opc =
N->getOpcode();
16063 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
16082 DAGCombinerInfo &DCI)
const {
16085 SelectionDAG &DAG = DCI.DAG;
16086 EVT VT =
N->getValueType(0);
16096 if (!
N->isDivergent() && Subtarget->hasSMulHi())
16100 if (NumBits <= 32 || NumBits > 64)
16111 if (!Subtarget->hasFullRate64Ops()) {
16112 unsigned NumUsers = 0;
16113 for (SDNode *User :
LHS->
users()) {
16116 if (!
User->isAnyAdd())
16140 bool MulSignedLo =
false;
16141 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16150 if (VT != MVT::i64) {
16173 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16175 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16176 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16178 if (!MulLHSUnsigned32) {
16185 if (!MulRHSUnsigned32) {
16196 if (VT != MVT::i64)
16202SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
16203 DAGCombinerInfo &DCI)
const {
16213 SelectionDAG &DAG = DCI.DAG;
16228 unsigned Opcode =
N->getOpcode();
16232 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
16243static std::optional<ByteProvider<SDValue>>
16246 if (!Byte0 || Byte0->isConstantZero()) {
16247 return std::nullopt;
16250 if (Byte1 && !Byte1->isConstantZero()) {
16251 return std::nullopt;
16257 unsigned FirstCs =
First & 0x0c0c0c0c;
16258 unsigned SecondCs = Second & 0x0c0c0c0c;
16259 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
16260 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16262 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16263 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16264 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16265 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16267 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
16291 for (
int BPI = 0; BPI < 2; BPI++) {
16294 BPP = {Src1, Src0};
16296 unsigned ZeroMask = 0x0c0c0c0c;
16297 unsigned FMask = 0xFF << (8 * (3 - Step));
16299 unsigned FirstMask =
16300 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16301 unsigned SecondMask =
16302 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16306 int FirstGroup = -1;
16307 for (
int I = 0;
I < 2;
I++) {
16309 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
16310 return IterElt.SrcOp == *BPP.first.Src &&
16311 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16315 if (Match != Srcs.
end()) {
16316 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
16321 if (FirstGroup != -1) {
16323 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
16324 return IterElt.SrcOp == *BPP.second.Src &&
16325 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16328 if (Match != Srcs.
end()) {
16329 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
16331 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16339 unsigned ZeroMask = 0x0c0c0c0c;
16340 unsigned FMask = 0xFF << (8 * (3 - Step));
16344 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16348 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16357 if (Srcs.
size() == 1) {
16358 auto *Elt = Srcs.
begin();
16362 if (Elt->PermMask == 0x3020100)
16365 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16369 auto *FirstElt = Srcs.
begin();
16370 auto *SecondElt = std::next(FirstElt);
16377 auto FirstMask = FirstElt->PermMask;
16378 auto SecondMask = SecondElt->PermMask;
16380 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16381 unsigned FirstPlusFour = FirstMask | 0x04040404;
16384 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16396 FirstElt = std::next(SecondElt);
16397 if (FirstElt == Srcs.
end())
16400 SecondElt = std::next(FirstElt);
16403 if (SecondElt == Srcs.
end()) {
16408 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16409 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
16415 return Perms.
size() == 2
16421 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16422 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16423 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16424 EntryMask += ZeroMask;
16429 auto Opcode =
Op.getOpcode();
16431 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16432 Opcode == AMDGPUISD::MUL_I24);
16435static std::optional<bool>
16446 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16449 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16451 assert(!(S0IsUnsigned && S0IsSigned));
16452 assert(!(S1IsUnsigned && S1IsSigned));
16460 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16466 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16467 return std::nullopt;
16479 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16480 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16485 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16491 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16492 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16493 return std::nullopt;
16499 DAGCombinerInfo &DCI)
const {
16500 SelectionDAG &DAG = DCI.DAG;
16501 EVT VT =
N->getValueType(0);
16507 if (Subtarget->hasMad64_32()) {
16508 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16513 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
16517 if (VT == MVT::i64) {
16518 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16523 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16525 std::optional<bool> IsSigned;
16531 int ChainLength = 0;
16532 for (
int I = 0;
I < 4;
I++) {
16536 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16539 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16544 TempNode->getOperand(MulIdx), *Src0, *Src1,
16545 TempNode->getOperand(MulIdx)->getOperand(0),
16546 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16550 IsSigned = *IterIsSigned;
16551 if (*IterIsSigned != *IsSigned)
16554 auto AddIdx = 1 - MulIdx;
16557 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
16558 Src2s.
push_back(TempNode->getOperand(AddIdx));
16568 TempNode->getOperand(AddIdx), *Src0, *Src1,
16569 TempNode->getOperand(AddIdx)->getOperand(0),
16570 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16574 if (*IterIsSigned != *IsSigned)
16578 ChainLength =
I + 2;
16582 TempNode = TempNode->getOperand(AddIdx);
16584 ChainLength =
I + 1;
16585 if (TempNode->getNumOperands() < 2)
16587 LHS = TempNode->getOperand(0);
16588 RHS = TempNode->getOperand(1);
16591 if (ChainLength < 2)
16597 if (ChainLength < 4) {
16607 bool UseOriginalSrc =
false;
16608 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
16609 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
16610 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
16611 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
16612 SmallVector<unsigned, 4> SrcBytes;
16613 auto Src0Mask = Src0s.
begin()->PermMask;
16614 SrcBytes.
push_back(Src0Mask & 0xFF000000);
16615 bool UniqueEntries =
true;
16616 for (
auto I = 1;
I < 4;
I++) {
16617 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16620 UniqueEntries =
false;
16626 if (UniqueEntries) {
16627 UseOriginalSrc =
true;
16629 auto *FirstElt = Src0s.
begin();
16633 auto *SecondElt = Src1s.
begin();
16635 SecondElt->DWordOffset);
16644 if (!UseOriginalSrc) {
16651 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16654 : Intrinsic::amdgcn_udot4,
16664 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16669 unsigned Opc =
LHS.getOpcode();
16681 auto Cond =
RHS.getOperand(0);
16686 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16703 DAGCombinerInfo &DCI)
const {
16704 SelectionDAG &DAG = DCI.DAG;
16706 EVT VT =
N->getValueType(0);
16719 SDNodeFlags ShlFlags = N1->
getFlags();
16723 SDNodeFlags NewShlFlags =
16728 DCI.AddToWorklist(Inner.
getNode());
16735 if (Subtarget->hasMad64_32()) {
16736 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16745 if (VT == MVT::i64) {
16746 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16759 if (!YIsConstant && !ZIsConstant && !
X->isDivergent() &&
16760 Y->isDivergent() !=
Z->isDivergent()) {
16769 if (
Y->isDivergent())
16772 SDNodeFlags ReassocFlags =
16775 DCI.AddToWorklist(UniformInner.
getNode());
16783 DAGCombinerInfo &DCI)
const {
16784 SelectionDAG &DAG = DCI.DAG;
16785 EVT VT =
N->getValueType(0);
16787 if (VT == MVT::i64) {
16788 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16792 if (VT != MVT::i32)
16801 unsigned Opc =
RHS.getOpcode();
16808 auto Cond =
RHS.getOperand(0);
16813 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16831SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
16832 DAGCombinerInfo &DCI)
const {
16834 if (
N->getValueType(0) != MVT::i32)
16840 SelectionDAG &DAG = DCI.DAG;
16845 unsigned LHSOpc =
LHS.getOpcode();
16846 unsigned Opc =
N->getOpcode();
16850 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
16856 DAGCombinerInfo &DCI)
const {
16860 SelectionDAG &DAG = DCI.DAG;
16861 EVT VT =
N->getValueType(0);
16873 if (
A ==
LHS.getOperand(1)) {
16874 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16875 if (FusedOp != 0) {
16877 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
16885 if (
A ==
RHS.getOperand(1)) {
16886 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16887 if (FusedOp != 0) {
16889 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
16898 DAGCombinerInfo &DCI)
const {
16902 SelectionDAG &DAG = DCI.DAG;
16904 EVT VT =
N->getValueType(0);
16917 if (
A ==
LHS.getOperand(1)) {
16918 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16919 if (FusedOp != 0) {
16923 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
16932 if (
A ==
RHS.getOperand(1)) {
16933 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16934 if (FusedOp != 0) {
16936 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16945 DAGCombinerInfo &DCI)
const {
16946 SelectionDAG &DAG = DCI.DAG;
16948 EVT VT =
N->getValueType(0);
16957 SDNodeFlags
Flags =
N->getFlags();
16958 SDNodeFlags RHSFlags =
RHS->getFlags();
16964 bool IsNegative =
false;
16965 if (CLHS->isExactlyValue(1.0) ||
16966 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16972 DAG.
getNode(AMDGPUISD::RSQ, SL, VT,
RHS.getOperand(0), Flags);
16982 DAGCombinerInfo &DCI)
const {
16983 SelectionDAG &DAG = DCI.DAG;
16984 EVT VT =
N->getValueType(0);
16988 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16989 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17004 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17009 const ConstantFPSDNode *FalseNode =
17019 if (ScalarVT == MVT::f32 &&
17025 if (TrueNodeExpVal == INT_MIN)
17028 if (FalseNodeExpVal == INT_MIN)
17048 DAGCombinerInfo &DCI)
const {
17049 SelectionDAG &DAG = DCI.DAG;
17050 EVT VT =
N->getValueType(0);
17053 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17071 (
N->getFlags().hasAllowContract() &&
17072 FMA->getFlags().hasAllowContract())) {
17106 if (Vec1 == Vec2 || Vec3 == Vec4)
17112 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17113 return DAG.
getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17121 DAGCombinerInfo &DCI)
const {
17122 SelectionDAG &DAG = DCI.DAG;
17127 EVT VT =
LHS.getValueType();
17156 return LHS.getOperand(0);
17170 const APInt &CT =
LHS.getConstantOperandAPInt(1);
17171 const APInt &CF =
LHS.getConstantOperandAPInt(2);
17176 return DAG.
getNOT(SL,
LHS.getOperand(0), MVT::i1);
17179 return LHS.getOperand(0);
17200 if (VT == MVT::i64) {
17212 const std::optional<bool> KnownEq =
17220 const std::optional<bool> KnownEq =
17231 const std::optional<bool> KnownUge =
17251 const std::optional<bool> KnownUle =
17302 DAG.
getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
17307 {Op0Hi, Op1Hi, CarryInHi});
17317 DCI.CombineTo(
LHS.getNode(), Result);
17321 if (VT != MVT::f32 && VT != MVT::f64 &&
17322 (!Subtarget->has16BitInsts() || VT != MVT::f16))
17337 const unsigned IsInfMask =
17339 const unsigned IsFiniteMask =
17344 return DAG.
getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
LHS.getOperand(0),
17353SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
17354 DAGCombinerInfo &DCI)
const {
17355 SelectionDAG &DAG = DCI.DAG;
17357 unsigned Offset =
N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17376 unsigned ShiftOffset = 8 *
Offset;
17378 ShiftOffset -=
C->getZExtValue();
17380 ShiftOffset +=
C->getZExtValue();
17382 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
17383 return DAG.
getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
17384 MVT::f32, Shifted);
17395 DCI.AddToWorklist(
N);
17402 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
17408 DAGCombinerInfo &DCI)
const {
17413 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17417 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17418 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
17421 APFloat One(
F.getSemantics(),
"1.0");
17423 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
17429 DAGCombinerInfo &DCI)
const {
17450 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
17451 bool isInteger =
LHS.getValueType().isInteger();
17454 if (!isFloatingPoint && !isInteger)
17459 if (!isEquality && !isNonEquality)
17476 if (isFloatingPoint) {
17478 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17489 if (!(isEquality && TrueVal == ConstVal) &&
17490 !(isNonEquality && FalseVal == ConstVal))
17497 SelectLHS, SelectRHS);
17502 switch (
N->getOpcode()) {
17518 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
17528 switch (
N->getOpcode()) {
17530 return performAddCombine(
N, DCI);
17532 return performPtrAddCombine(
N, DCI);
17534 return performSubCombine(
N, DCI);
17537 return performAddCarrySubCarryCombine(
N, DCI);
17539 return performFAddCombine(
N, DCI);
17541 return performFSubCombine(
N, DCI);
17543 return performFDivCombine(
N, DCI);
17545 return performFMulCombine(
N, DCI);
17547 return performSetCCCombine(
N, DCI);
17549 if (
auto Res = performSelectCombine(
N, DCI))
17564 case AMDGPUISD::FMIN_LEGACY:
17565 case AMDGPUISD::FMAX_LEGACY:
17566 return performMinMaxCombine(
N, DCI);
17568 return performFMACombine(
N, DCI);
17570 return performAndCombine(
N, DCI);
17572 return performOrCombine(
N, DCI);
17575 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
17576 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17582 return performXorCombine(
N, DCI);
17585 return performZeroOrAnyExtendCombine(
N, DCI);
17587 return performSignExtendInRegCombine(
N, DCI);
17588 case AMDGPUISD::FP_CLASS:
17589 return performClassCombine(
N, DCI);
17591 return performFCanonicalizeCombine(
N, DCI);
17592 case AMDGPUISD::RCP:
17593 return performRcpCombine(
N, DCI);
17595 case AMDGPUISD::FRACT:
17596 case AMDGPUISD::RSQ:
17597 case AMDGPUISD::RCP_LEGACY:
17598 case AMDGPUISD::RCP_IFLAG:
17599 case AMDGPUISD::RSQ_CLAMP: {
17608 return performUCharToFloatCombine(
N, DCI);
17610 return performFCopySignCombine(
N, DCI);
17611 case AMDGPUISD::CVT_F32_UBYTE0:
17612 case AMDGPUISD::CVT_F32_UBYTE1:
17613 case AMDGPUISD::CVT_F32_UBYTE2:
17614 case AMDGPUISD::CVT_F32_UBYTE3:
17615 return performCvtF32UByteNCombine(
N, DCI);
17616 case AMDGPUISD::FMED3:
17617 return performFMed3Combine(
N, DCI);
17618 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17619 return performCvtPkRTZCombine(
N, DCI);
17620 case AMDGPUISD::CLAMP:
17621 return performClampCombine(
N, DCI);
17624 EVT VT =
N->getValueType(0);
17627 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17630 EVT EltVT = Src.getValueType();
17631 if (EltVT != MVT::i16)
17641 return performExtractVectorEltCombine(
N, DCI);
17643 return performInsertVectorEltCombine(
N, DCI);
17645 return performFPRoundCombine(
N, DCI);
17654 return performMemSDNodeCombine(MemNode, DCI);
17685 unsigned Opcode =
Node->getMachineOpcode();
17688 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17689 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
17692 SDNode *
Users[5] = {
nullptr};
17694 unsigned DmaskIdx =
17695 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17696 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
17697 unsigned NewDmask = 0;
17698 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17699 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17700 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
17701 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
17702 unsigned TFCLane = 0;
17703 bool HasChain =
Node->getNumValues() > 1;
17705 if (OldDmask == 0) {
17713 TFCLane = OldBitsSet;
17717 for (SDUse &Use :
Node->uses()) {
17720 if (
Use.getResNo() != 0)
17723 SDNode *
User =
Use.getUser();
17726 if (!
User->isMachineOpcode() ||
17727 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17739 if (UsesTFC && Lane == TFCLane) {
17744 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17746 Dmask &= ~(1 << Comp);
17754 NewDmask |= 1 << Comp;
17759 bool NoChannels = !NewDmask;
17766 if (OldBitsSet == 1)
17772 if (NewDmask == OldDmask)
17781 unsigned NewChannels = BitsSet + UsesTFC;
17785 assert(NewOpcode != -1 &&
17786 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
17787 "failed to find equivalent MIMG op");
17795 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
17797 MVT ResultVT = NewChannels == 1
17800 : NewChannels == 5 ? 8
17802 SDVTList NewVTList =
17805 MachineSDNode *NewNode =
17814 if (NewChannels == 1) {
17824 for (
unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17829 if (i || !NoChannels)
17834 if (NewUser != User) {
17844 Idx = AMDGPU::sub1;
17847 Idx = AMDGPU::sub2;
17850 Idx = AMDGPU::sub3;
17853 Idx = AMDGPU::sub4;
17864 Op =
Op.getOperand(0);
17885 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17889 Node->getOperand(0), SL, VReg, SrcVal,
17895 return ToResultReg.
getNode();
17900 for (
unsigned i = 0; i <
Node->getNumOperands(); ++i) {
17902 Ops.push_back(
Node->getOperand(i));
17908 Node->getOperand(i).getValueType(),
17909 Node->getOperand(i)),
17921 unsigned Opcode =
Node->getMachineOpcode();
17923 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
17924 !
TII->isGather4(Opcode) &&
17926 return adjustWritemask(
Node, DAG);
17929 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17935 case AMDGPU::V_DIV_SCALE_F32_e64:
17936 case AMDGPU::V_DIV_SCALE_F64_e64: {
17946 (Src0 == Src1 || Src0 == Src2))
18002 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
18003 unsigned InitIdx = 0;
18005 if (
TII->isImage(
MI)) {
18013 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
18014 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
18015 unsigned D16Val = D16 ? D16->getImm() : 0;
18017 if (!TFEVal && !LWEVal)
18028 assert(MO_Dmask &&
"Expected dmask operand in instruction");
18030 unsigned dmask = MO_Dmask->
getImm();
18035 bool Packed = !Subtarget->hasUnpackedD16VMem();
18037 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18044 uint32_t DstSize =
TRI.getRegSizeInBits(*DstRC) / 32;
18045 if (DstSize < InitIdx)
18049 InitIdx =
TRI.getRegSizeInBits(*DstRC) / 32;
18057 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
18058 unsigned NewDst = 0;
18063 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18064 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18067 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18068 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
18088 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
18100 if (
TII->isVOP3(
MI.getOpcode())) {
18102 TII->legalizeOperandsVOP3(
MRI,
MI);
18104 if (
TII->isMAI(
MI)) {
18109 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
18110 AMDGPU::OpName::scale_src0);
18111 if (Src0Idx != -1) {
18112 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
18113 AMDGPU::OpName::scale_src1);
18114 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
18115 TII->usesConstantBus(
MRI,
MI, Src1Idx))
18116 TII->legalizeOpWithMove(
MI, Src1Idx);
18123 if (
TII->isImage(
MI))
18124 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
18198std::pair<unsigned, const TargetRegisterClass *>
18205 if (Constraint.
size() == 1) {
18209 if (VT == MVT::Other)
18212 switch (Constraint[0]) {
18219 RC = &AMDGPU::SReg_32RegClass;
18222 RC = &AMDGPU::SGPR_64RegClass;
18227 return std::pair(0U,
nullptr);
18234 return std::pair(0U,
nullptr);
18236 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18237 : &AMDGPU::VGPR_32_Lo256RegClass;
18240 RC = Subtarget->has1024AddressableVGPRs()
18241 ?
TRI->getAlignedLo256VGPRClassForBitWidth(
BitWidth)
18244 return std::pair(0U,
nullptr);
18249 if (!Subtarget->hasMAIInsts())
18253 return std::pair(0U,
nullptr);
18255 RC = &AMDGPU::AGPR_32RegClass;
18260 return std::pair(0U,
nullptr);
18265 }
else if (Constraint ==
"VA" && Subtarget->hasGFX90AInsts()) {
18269 RC = &AMDGPU::AV_32RegClass;
18272 RC =
TRI->getVectorSuperClassForBitWidth(
BitWidth);
18274 return std::pair(0U,
nullptr);
18283 return std::pair(0U, RC);
18286 if (Kind !=
'\0') {
18288 RC = &AMDGPU::VGPR_32_Lo256RegClass;
18289 }
else if (Kind ==
's') {
18290 RC = &AMDGPU::SGPR_32RegClass;
18291 }
else if (Kind ==
'a') {
18292 RC = &AMDGPU::AGPR_32RegClass;
18298 return std::pair(0U,
nullptr);
18304 return std::pair(0U,
nullptr);
18308 RC =
TRI->getVGPRClassForBitWidth(Width);
18310 RC =
TRI->getSGPRClassForBitWidth(Width);
18312 RC =
TRI->getAGPRClassForBitWidth(Width);
18314 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
18319 return std::pair(0U,
nullptr);
18321 return std::pair(Reg, RC);
18327 return std::pair(0U,
nullptr);
18328 if (Idx < RC->getNumRegs())
18330 return std::pair(0U,
nullptr);
18336 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
18342 if (Constraint.
size() == 1) {
18343 switch (Constraint[0]) {
18353 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
18361 if (Constraint.
size() == 1) {
18362 switch (Constraint[0]) {
18370 }
else if (Constraint.
size() == 2) {
18371 if (Constraint ==
"VA")
18389 std::vector<SDValue> &
Ops,
18404 unsigned Size =
Op.getScalarValueSizeInBits();
18408 if (
Size == 16 && !Subtarget->has16BitInsts())
18412 Val =
C->getSExtValue();
18416 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
18420 if (
Size != 16 ||
Op.getNumOperands() != 2)
18422 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
18425 Val =
C->getSExtValue();
18429 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
18439 if (Constraint.
size() == 1) {
18440 switch (Constraint[0]) {
18455 }
else if (Constraint.
size() == 2) {
18456 if (Constraint ==
"DA") {
18457 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
18458 int64_t LoBits =
static_cast<int32_t
>(Val);
18462 if (Constraint ==
"DB") {
18470 unsigned MaxSize)
const {
18471 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
18472 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18474 MVT VT =
Op.getSimpleValueType();
18499 switch (UnalignedClassID) {
18500 case AMDGPU::VReg_64RegClassID:
18501 return AMDGPU::VReg_64_Align2RegClassID;
18502 case AMDGPU::VReg_96RegClassID:
18503 return AMDGPU::VReg_96_Align2RegClassID;
18504 case AMDGPU::VReg_128RegClassID:
18505 return AMDGPU::VReg_128_Align2RegClassID;
18506 case AMDGPU::VReg_160RegClassID:
18507 return AMDGPU::VReg_160_Align2RegClassID;
18508 case AMDGPU::VReg_192RegClassID:
18509 return AMDGPU::VReg_192_Align2RegClassID;
18510 case AMDGPU::VReg_224RegClassID:
18511 return AMDGPU::VReg_224_Align2RegClassID;
18512 case AMDGPU::VReg_256RegClassID:
18513 return AMDGPU::VReg_256_Align2RegClassID;
18514 case AMDGPU::VReg_288RegClassID:
18515 return AMDGPU::VReg_288_Align2RegClassID;
18516 case AMDGPU::VReg_320RegClassID:
18517 return AMDGPU::VReg_320_Align2RegClassID;
18518 case AMDGPU::VReg_352RegClassID:
18519 return AMDGPU::VReg_352_Align2RegClassID;
18520 case AMDGPU::VReg_384RegClassID:
18521 return AMDGPU::VReg_384_Align2RegClassID;
18522 case AMDGPU::VReg_512RegClassID:
18523 return AMDGPU::VReg_512_Align2RegClassID;
18524 case AMDGPU::VReg_1024RegClassID:
18525 return AMDGPU::VReg_1024_Align2RegClassID;
18526 case AMDGPU::AReg_64RegClassID:
18527 return AMDGPU::AReg_64_Align2RegClassID;
18528 case AMDGPU::AReg_96RegClassID:
18529 return AMDGPU::AReg_96_Align2RegClassID;
18530 case AMDGPU::AReg_128RegClassID:
18531 return AMDGPU::AReg_128_Align2RegClassID;
18532 case AMDGPU::AReg_160RegClassID:
18533 return AMDGPU::AReg_160_Align2RegClassID;
18534 case AMDGPU::AReg_192RegClassID:
18535 return AMDGPU::AReg_192_Align2RegClassID;
18536 case AMDGPU::AReg_256RegClassID:
18537 return AMDGPU::AReg_256_Align2RegClassID;
18538 case AMDGPU::AReg_512RegClassID:
18539 return AMDGPU::AReg_512_Align2RegClassID;
18540 case AMDGPU::AReg_1024RegClassID:
18541 return AMDGPU::AReg_1024_Align2RegClassID;
18557 if (Info->isEntryFunction()) {
18564 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18566 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18567 :
TRI->getAlignedHighSGPRForRC(MF, 2,
18568 &AMDGPU::SGPR_64RegClass);
18569 Info->setSGPRForEXECCopy(SReg);
18571 assert(!
TRI->isSubRegister(Info->getScratchRSrcReg(),
18572 Info->getStackPtrOffsetReg()));
18573 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18574 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18578 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18579 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18581 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18582 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18584 Info->limitOccupancy(MF);
18586 if (ST.isWave32() && !MF.
empty()) {
18587 for (
auto &
MBB : MF) {
18588 for (
auto &
MI :
MBB) {
18589 TII->fixImplicitOperands(
MI);
18599 if (ST.needsAlignedVGPRs()) {
18600 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
18606 if (NewClassID != -1)
18607 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
18616 const APInt &DemandedElts,
18618 unsigned Depth)
const {
18620 unsigned Opc =
Op.getOpcode();
18623 unsigned IID =
Op.getConstantOperandVal(0);
18625 case Intrinsic::amdgcn_mbcnt_lo:
18626 case Intrinsic::amdgcn_mbcnt_hi: {
18632 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18642 Op, Known, DemandedElts, DAG,
Depth);
18658 unsigned MaxValue =
18665 unsigned BFEWidth,
bool SExt,
unsigned Depth) {
18669 unsigned Src1Cst = 0;
18670 if (Src1.
isImm()) {
18671 Src1Cst = Src1.
getImm();
18672 }
else if (Src1.
isReg()) {
18676 Src1Cst = Cst->Value.getZExtValue();
18687 if (Width >= BFEWidth)
18696 Known = Known.
sext(BFEWidth);
18698 Known = Known.
zext(BFEWidth);
18704 unsigned Depth)
const {
18707 switch (
MI->getOpcode()) {
18708 case AMDGPU::S_BFE_I32:
18711 case AMDGPU::S_BFE_U32:
18714 case AMDGPU::S_BFE_I64:
18717 case AMDGPU::S_BFE_U64:
18720 case AMDGPU::G_INTRINSIC:
18721 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18724 case Intrinsic::amdgcn_workitem_id_x:
18727 case Intrinsic::amdgcn_workitem_id_y:
18730 case Intrinsic::amdgcn_workitem_id_z:
18733 case Intrinsic::amdgcn_mbcnt_lo:
18734 case Intrinsic::amdgcn_mbcnt_hi: {
18746 case Intrinsic::amdgcn_groupstaticsize: {
18757 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18760 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18763 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
18768 case AMDGPU::G_AMDGPU_SMED3:
18769 case AMDGPU::G_AMDGPU_UMED3: {
18770 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
18797 unsigned Depth)
const {
18804 AttributeList Attrs =
18806 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
18824 if (Header->getAlignment() != PrefAlign)
18825 return Header->getAlignment();
18826 if (needsFetchWindowAlignment(*Header))
18847 if (Header->getAlignment() != PrefAlign)
18848 return Header->getAlignment();
18850 unsigned LoopSize = 0;
18855 LoopSize +=
MBB->getAlignment().value() / 2;
18858 LoopSize +=
TII->getInstSizeInBytes(
MI);
18859 if (LoopSize > 192)
18864 if (LoopSize <= 64)
18867 if (LoopSize <= 128)
18868 return CacheLineAlign;
18874 auto I = Exit->getFirstNonDebugInstr();
18875 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18876 return CacheLineAlign;
18885 if (PreTerm == Pre->
begin() ||
18886 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18890 auto ExitHead = Exit->getFirstNonDebugInstr();
18891 if (ExitHead == Exit->end() ||
18892 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18897 return CacheLineAlign;
18905 if (needsFetchWindowAlignment(*
MBB))
18910bool SITargetLowering::needsFetchWindowAlignment(
18912 if (!
getSubtarget()->hasLoopHeadInstSplitSensitivity())
18916 if (
MI.isMetaInstruction())
18919 return TII->getInstSizeInBytes(
MI) > 4;
18929 N =
N->getOperand(0).getNode();
18939 switch (
N->getOpcode()) {
18947 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
18948 return !
TRI->isSGPRReg(
MRI, Reg);
18954 return !
TRI->isSGPRReg(
MRI, Reg);
18958 unsigned AS = L->getAddressSpace();
18968 case AMDGPUISD::ATOMIC_CMP_SWAP:
18969 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18970 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18971 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18972 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18973 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18974 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18975 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18976 case AMDGPUISD::BUFFER_ATOMIC_AND:
18977 case AMDGPUISD::BUFFER_ATOMIC_OR:
18978 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18979 case AMDGPUISD::BUFFER_ATOMIC_INC:
18980 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18981 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18982 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18983 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18984 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18990 return A->readMem() &&
A->writeMem();
19011 switch (Ty.getScalarSizeInBits()) {
19023 const APInt &DemandedElts,
19026 unsigned Depth)
const {
19027 if (
Op.getOpcode() == AMDGPUISD::CLAMP) {
19031 if (Info->getMode().DX10Clamp)
19043 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
19063 <<
"Hardware instruction generated for atomic "
19065 <<
" operation at memory scope " << MemScope;
19070 Type *EltTy = VT->getElementType();
19071 return VT->getNumElements() == 2 &&
19091 unsigned BW =
IT->getBitWidth();
19092 return BW == 32 || BW == 64;
19106 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
19107 return BW == 32 || BW == 64;
19110 if (Ty->isFloatTy() || Ty->isDoubleTy())
19114 return VT->getNumElements() == 2 &&
19115 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19125 bool HasSystemScope) {
19132 if (HasSystemScope) {
19133 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19136 if (Subtarget.hasEmulatedSystemScopeAtomics())
19138 }
else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19141 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
19154 const MDNode *MD =
I->getMetadata(LLVMContext::MD_noalias_addrspace);
19162 return STI.hasGloballyAddressableScratch()
19180 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
19193 bool HasSystemScope =
19225 if (!
IT ||
IT->getBitWidth() != 32)
19231 if (Subtarget->hasEmulatedSystemScopeAtomics())
19247 if (!HasSystemScope &&
19248 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19260 if (RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory"))
19268 ConstVal && ConstVal->isNullValue())
19306 if (Ty->isFloatTy()) {
19311 if (Ty->isDoubleTy()) {
19332 if (Ty->isFloatTy() &&
19333 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19346 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
19350 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() &&
isV2BF16(Ty))
19354 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
19359 if (Subtarget->hasAtomicBufferPkAddBF16Inst() &&
isV2BF16(Ty))
19364 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19368 if (Ty->isFloatTy()) {
19371 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19374 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19379 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19387 if (Subtarget->hasFlatAtomicFaddF32Inst())
19396 if (Subtarget->hasLDSFPAtomicAddF32()) {
19397 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19399 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19427 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19429 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19433 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19435 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19489 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19490 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19491 : &AMDGPU::SReg_32RegClass;
19492 if (!
TRI->isSGPRClass(RC) && !isDivergent)
19493 return TRI->getEquivalentSGPRClass(RC);
19494 if (
TRI->isSGPRClass(RC) && isDivergent) {
19495 if (Subtarget->hasGFX90AInsts())
19496 return TRI->getEquivalentAVClass(RC);
19497 return TRI->getEquivalentVGPRClass(RC);
19510 unsigned WaveSize) {
19515 if (!
IT ||
IT->getBitWidth() != WaveSize)
19520 if (!Visited.
insert(V).second)
19522 bool Result =
false;
19523 for (
const auto *U : V->users()) {
19525 if (V == U->getOperand(1)) {
19530 case Intrinsic::amdgcn_if_break:
19531 case Intrinsic::amdgcn_if:
19532 case Intrinsic::amdgcn_else:
19537 if (V == U->getOperand(0)) {
19542 case Intrinsic::amdgcn_end_cf:
19543 case Intrinsic::amdgcn_loop:
19549 Result =
hasCFUser(U, Visited, WaveSize);
19558 const Value *V)
const {
19560 if (CI->isInlineAsm()) {
19569 for (
auto &TC : TargetConstraints) {
19583 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19611 return MRI.hasOneNonDBGUse(N0);
19618 if (
I.getMetadata(
"amdgpu.noclobber"))
19620 if (
I.getMetadata(
"amdgpu.last.use"))
19684 Alignment = RMW->getAlign();
19697 bool FullFlatEmulation =
19699 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19700 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19701 RMW->getType()->isDoubleTy()));
19704 bool ReturnValueIsUsed = !AI->
use_empty();
19713 if (FullFlatEmulation) {
19724 std::prev(BB->
end())->eraseFromParent();
19725 Builder.SetInsertPoint(BB);
19727 Value *LoadedShared =
nullptr;
19728 if (FullFlatEmulation) {
19729 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19730 {Addr},
nullptr,
"is.shared");
19731 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19732 Builder.SetInsertPoint(SharedBB);
19733 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19739 LoadedShared = Clone;
19741 Builder.CreateBr(PhiBB);
19742 Builder.SetInsertPoint(CheckPrivateBB);
19745 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19746 {Addr},
nullptr,
"is.private");
19747 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19749 Builder.SetInsertPoint(PrivateBB);
19751 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19754 Value *LoadedPrivate;
19756 LoadedPrivate = Builder.CreateAlignedLoad(
19757 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
19760 LoadedPrivate, RMW->getValOperand());
19762 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19764 auto [ResultLoad, Equal] =
19770 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19773 Builder.CreateBr(PhiBB);
19775 Builder.SetInsertPoint(GlobalBB);
19779 if (FullFlatEmulation) {
19780 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19789 if (!FullFlatEmulation) {
19794 MDNode *RangeNotPrivate =
19797 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
19801 Builder.CreateBr(PhiBB);
19803 Builder.SetInsertPoint(PhiBB);
19805 if (ReturnValueIsUsed) {
19808 if (FullFlatEmulation)
19809 Loaded->addIncoming(LoadedShared, SharedBB);
19810 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19811 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19812 Loaded->takeName(AI);
19815 Builder.CreateBr(ExitBB);
19819 unsigned PtrOpIdx) {
19820 Value *PtrOp =
I->getOperand(PtrOpIdx);
19827 I->setOperand(PtrOpIdx, ASCast);
19839 ConstVal && ConstVal->isNullValue()) {
19869 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19877 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19892 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics of interest.
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
bool isBottomOfStack() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
const SIInstrInfo * getInstrInfo() const override
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined with a multiply to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.